From fd9dc1466b8c829da614965b1c460d1de25354bb Mon Sep 17 00:00:00 2001
From: Justin Lebar <justin.lebar@gmail.com>
Date: Mon, 19 Jul 2010 10:34:41 -0700
Subject: [PATCH] Bug 573948 - Part 1: Use libjpeg-turbo instead of libjpeg.
 r=jmuizelaar

---
 config/autoconf.mk.in       |    4 +
 configure.in                |   69 +
 jpeg/MOZCHANGES             |   79 +-
 jpeg/Makefile.in            |  148 ++-
 jpeg/README                 |  264 ++--
 jpeg/README-turbo.txt       |  304 +++++
 jpeg/cderror.h              |    2 +
 jpeg/cdjpeg.c               |  181 ---
 jpeg/cdjpeg.h               |    5 +-
 jpeg/change.log             |  217 ----
 jpeg/cjpeg.c                |  606 ---------
 jpeg/ckconfig.c             |  402 ------
 jpeg/coderules.doc          |  118 --
 jpeg/djpeg.c                |  616 ---------
 jpeg/example.c              |  433 -------
 jpeg/filelist.doc           |  210 ---
 jpeg/install.doc            | 1063 ---------------
 jpeg/jaricom.c              |  152 +++
 jpeg/jcapimin.c             |   14 +-
 jpeg/jcarith.c              |  925 +++++++++++++
 jpeg/jccolor.c              |  152 ++-
 jpeg/jcdctmgr.c             |  485 +++++--
 jpeg/jchuff.c               |    1 -
 jpeg/jcinit.c               |    4 +
 jpeg/jcmarker.c             |   58 +-
 jpeg/jcmaster.c             |   56 +-
 jpeg/jconfig-mac-cw.h       |   43 -
 jpeg/jconfig.doc            |  155 ---
 jpeg/jconfig.h              |  166 +--
 jpeg/jconfig.h.in           |   60 +
 jpeg/jconfig.wat            |   38 -
 jpeg/jcparam.c              |   89 +-
 jpeg/jcphuff.c              |    2 -
 jpeg/jcsample.c             |   12 +-
 jpeg/jdapimin.c             |  137 +-
 jpeg/jdapistd.c             |    4 +-
 jpeg/jdarith.c              |  761 +++++++++++
 jpeg/jdatadst.c             |  163 ++-
 jpeg/jdatasrc.c             |   86 +-
 jpeg/jdcoefct.c             |   29 +-
 jpeg/jdcolor.c              |  269 +---
 jpeg/jdct.h                 |   16 +-
 jpeg/jddctmgr.c             |   77 +-
 jpeg/jdinput.c              |   94 +-
 jpeg/jdmainct.c             |   44 +-
 jpeg/jdmarker.c             |    4 +-
 jpeg/jdmaster.c             |  170 +--
 jpeg/jdmerge.c              |  697 +---------
 jpeg/jdsample.c             |   42 +-
 jpeg/jdtrans.c              |  147 +++
 jpeg/jerror.c               |   24 +-
 jpeg/jerror.h               |    6 +-
 jpeg/jidctfst.c             | 2018 ++++++-----------------------
 jpeg/jidctint.c             |  574 ---------
 jpeg/jmemansi.c             |  167 ---
 jpeg/jmemdos.c              |  638 ---------
 jpeg/jmemdosa.asm           |  379 ------
 jpeg/jmemmgr.c              |  174 +--
 jpeg/jmemname.c             |  276 ----
 jpeg/jmemnobs.c             |    6 +-
 jpeg/jmemsys.h              |    8 +-
 jpeg/jmorecfg.h             |  156 +--
 jpeg/jos2fig.h              |   45 -
 jpeg/jpegcomp.h             |   26 +
 jpeg/jpegint.h              |   11 +-
 jpeg/jpeglib.h              |  161 ++-
 jpeg/jquant1.c              |    6 +-
 jpeg/jquant2.c              |   47 +-
 jpeg/jsimd.h                |   90 ++
 jpeg/jsimd_none.c           |  300 +++++
 jpeg/jsimddct.h             |  102 ++
 jpeg/jutils.c               |    4 +-
 jpeg/jversion.h             |   25 +-
 jpeg/jwinfig.h              |   48 -
 jpeg/libjpeg.doc            | 3006 -------------------------------------------
 jpeg/makefile.gen           |  274 ----
 jpeg/netscape_mods.doc      |   52 -
 jpeg/{ => simd}/Makefile.in |   86 +-
 jpeg/simd/jcclrmmx.asm      |  479 +++++++
 jpeg/simd/jcclrss2-64.asm   |  487 +++++++
 jpeg/simd/jcclrss2.asm      |  505 ++++++++
 jpeg/simd/jccolmmx.asm      |  120 ++
 jpeg/simd/jccolss2-64.asm   |  117 ++
 jpeg/simd/jccolss2.asm      |  117 ++
 jpeg/simd/jcolsamp.inc      |  105 ++
 jpeg/simd/jcqnt3dn.asm      |  233 ++++
 jpeg/simd/jcqntmmx.asm      |  274 ++++
 jpeg/simd/jcqnts2f-64.asm   |  158 +++
 jpeg/simd/jcqnts2f.asm      |  171 +++
 jpeg/simd/jcqnts2i-64.asm   |  187 +++
 jpeg/simd/jcqnts2i.asm      |  200 +++
 jpeg/simd/jcqntsse.asm      |  211 +++
 jpeg/simd/jcsammmx.asm      |  324 +++++
 jpeg/simd/jcsamss2-64.asm   |  330 +++++
 jpeg/simd/jcsamss2.asm      |  351 +++++
 jpeg/simd/jdclrmmx.asm      |  407 ++++++
 jpeg/simd/jdclrss2-64.asm   |  487 +++++++
 jpeg/simd/jdclrss2.asm      |  505 ++++++++
 jpeg/simd/jdcolmmx.asm      |  117 ++
 jpeg/simd/jdcolss2-64.asm   |  117 ++
 jpeg/simd/jdcolss2.asm      |  117 ++
 jpeg/simd/jdct.inc          |   28 +
 jpeg/simd/jdmermmx.asm      |  123 ++
 jpeg/simd/jdmerss2-64.asm   |  123 ++
 jpeg/simd/jdmerss2.asm      |  123 ++
 jpeg/simd/jdmrgmmx.asm      |  466 +++++++
 jpeg/simd/jdmrgss2-64.asm   |  584 +++++++++
 jpeg/simd/jdmrgss2.asm      |  564 ++++++++
 jpeg/simd/jdsammmx.asm      |  737 +++++++++++
 jpeg/simd/jdsamss2-64.asm   |  671 ++++++++++
 jpeg/simd/jdsamss2.asm      |  729 +++++++++++
 jpeg/simd/jf3dnflt.asm      |  320 +++++
 jpeg/simd/jfmmxfst.asm      |  397 ++++++
 jpeg/simd/jfmmxint.asm      |  622 +++++++++
 jpeg/simd/jfss2fst-64.asm   |  392 ++++++
 jpeg/simd/jfss2fst.asm      |  404 ++++++
 jpeg/simd/jfss2int-64.asm   |  622 +++++++++
 jpeg/simd/jfss2int.asm      |  634 +++++++++
 jpeg/simd/jfsseflt-64.asm   |  358 ++++++
 jpeg/simd/jfsseflt.asm      |  370 ++++++
 jpeg/simd/ji3dnflt.asm      |  452 +++++++
 jpeg/simd/jimmxfst.asm      |  500 +++++++
 jpeg/simd/jimmxint.asm      |  852 ++++++++++++
 jpeg/simd/jimmxred.asm      |  706 ++++++++++
 jpeg/simd/jiss2flt-64.asm   |  483 +++++++
 jpeg/simd/jiss2flt.asm      |  498 +++++++
 jpeg/simd/jiss2fst-64.asm   |  492 +++++++
 jpeg/simd/jiss2fst.asm      |  502 ++++++++
 jpeg/simd/jiss2int-64.asm   |  848 ++++++++++++
 jpeg/simd/jiss2int.asm      |  859 +++++++++++++
 jpeg/simd/jiss2red-64.asm   |  576 +++++++++
 jpeg/simd/jiss2red.asm      |  594 +++++++++
 jpeg/simd/jisseflt.asm      |  572 ++++++++
 jpeg/simd/jsimd.h           |  504 ++++++++
 jpeg/simd/jsimd_i386.c      |  957 ++++++++++++++
 jpeg/simd/jsimd_x86_64.c    |  681 ++++++++++
 jpeg/simd/jsimdcfg.inc      |   69 +
 jpeg/simd/jsimdcfg.inc.h    |  168 +++
 jpeg/simd/jsimdcpu.asm      |  105 ++
 jpeg/simd/jsimdext.inc      |  372 ++++++
 jpeg/structure.doc          |  948 --------------
 jpeg/transupp.h             |  210 +++
 jpeg/usage.doc              |  562 --------
 jpeg/wizard.doc             |  211 ---
 144 files changed, 30570 insertions(+), 15119 deletions(-)
 rewrite jpeg/MOZCHANGES (98%)
 create mode 100644 jpeg/README-turbo.txt
 delete mode 100644 jpeg/cdjpeg.c
 delete mode 100644 jpeg/change.log
 delete mode 100644 jpeg/cjpeg.c
 delete mode 100644 jpeg/ckconfig.c
 delete mode 100644 jpeg/coderules.doc
 delete mode 100644 jpeg/djpeg.c
 delete mode 100644 jpeg/example.c
 delete mode 100644 jpeg/filelist.doc
 delete mode 100644 jpeg/install.doc
 create mode 100644 jpeg/jaricom.c
 create mode 100644 jpeg/jcarith.c
 delete mode 100644 jpeg/jconfig-mac-cw.h
 delete mode 100644 jpeg/jconfig.doc
 rewrite jpeg/jconfig.h (99%)
 create mode 100644 jpeg/jconfig.h.in
 delete mode 100644 jpeg/jconfig.wat
 create mode 100644 jpeg/jdarith.c
 create mode 100644 jpeg/jdtrans.c
 rewrite jpeg/jidctfst.c (74%)
 delete mode 100644 jpeg/jmemansi.c
 delete mode 100644 jpeg/jmemdos.c
 delete mode 100644 jpeg/jmemdosa.asm
 delete mode 100644 jpeg/jmemname.c
 delete mode 100644 jpeg/jos2fig.h
 create mode 100644 jpeg/jpegcomp.h
 create mode 100644 jpeg/jsimd.h
 create mode 100644 jpeg/jsimd_none.c
 create mode 100644 jpeg/jsimddct.h
 delete mode 100644 jpeg/jwinfig.h
 delete mode 100644 jpeg/libjpeg.doc
 delete mode 100644 jpeg/makefile.gen
 delete mode 100644 jpeg/netscape_mods.doc
 copy jpeg/{ => simd}/Makefile.in (56%)
 create mode 100644 jpeg/simd/jcclrmmx.asm
 create mode 100644 jpeg/simd/jcclrss2-64.asm
 create mode 100644 jpeg/simd/jcclrss2.asm
 create mode 100644 jpeg/simd/jccolmmx.asm
 create mode 100644 jpeg/simd/jccolss2-64.asm
 create mode 100644 jpeg/simd/jccolss2.asm
 create mode 100644 jpeg/simd/jcolsamp.inc
 create mode 100644 jpeg/simd/jcqnt3dn.asm
 create mode 100644 jpeg/simd/jcqntmmx.asm
 create mode 100644 jpeg/simd/jcqnts2f-64.asm
 create mode 100644 jpeg/simd/jcqnts2f.asm
 create mode 100644 jpeg/simd/jcqnts2i-64.asm
 create mode 100644 jpeg/simd/jcqnts2i.asm
 create mode 100644 jpeg/simd/jcqntsse.asm
 create mode 100644 jpeg/simd/jcsammmx.asm
 create mode 100644 jpeg/simd/jcsamss2-64.asm
 create mode 100644 jpeg/simd/jcsamss2.asm
 create mode 100644 jpeg/simd/jdclrmmx.asm
 create mode 100644 jpeg/simd/jdclrss2-64.asm
 create mode 100644 jpeg/simd/jdclrss2.asm
 create mode 100644 jpeg/simd/jdcolmmx.asm
 create mode 100644 jpeg/simd/jdcolss2-64.asm
 create mode 100644 jpeg/simd/jdcolss2.asm
 create mode 100644 jpeg/simd/jdct.inc
 create mode 100644 jpeg/simd/jdmermmx.asm
 create mode 100644 jpeg/simd/jdmerss2-64.asm
 create mode 100644 jpeg/simd/jdmerss2.asm
 create mode 100644 jpeg/simd/jdmrgmmx.asm
 create mode 100644 jpeg/simd/jdmrgss2-64.asm
 create mode 100644 jpeg/simd/jdmrgss2.asm
 create mode 100644 jpeg/simd/jdsammmx.asm
 create mode 100644 jpeg/simd/jdsamss2-64.asm
 create mode 100644 jpeg/simd/jdsamss2.asm
 create mode 100644 jpeg/simd/jf3dnflt.asm
 create mode 100644 jpeg/simd/jfmmxfst.asm
 create mode 100644 jpeg/simd/jfmmxint.asm
 create mode 100644 jpeg/simd/jfss2fst-64.asm
 create mode 100644 jpeg/simd/jfss2fst.asm
 create mode 100644 jpeg/simd/jfss2int-64.asm
 create mode 100644 jpeg/simd/jfss2int.asm
 create mode 100644 jpeg/simd/jfsseflt-64.asm
 create mode 100644 jpeg/simd/jfsseflt.asm
 create mode 100644 jpeg/simd/ji3dnflt.asm
 create mode 100644 jpeg/simd/jimmxfst.asm
 create mode 100644 jpeg/simd/jimmxint.asm
 create mode 100644 jpeg/simd/jimmxred.asm
 create mode 100644 jpeg/simd/jiss2flt-64.asm
 create mode 100644 jpeg/simd/jiss2flt.asm
 create mode 100644 jpeg/simd/jiss2fst-64.asm
 create mode 100644 jpeg/simd/jiss2fst.asm
 create mode 100644 jpeg/simd/jiss2int-64.asm
 create mode 100644 jpeg/simd/jiss2int.asm
 create mode 100644 jpeg/simd/jiss2red-64.asm
 create mode 100644 jpeg/simd/jiss2red.asm
 create mode 100644 jpeg/simd/jisseflt.asm
 create mode 100644 jpeg/simd/jsimd.h
 create mode 100644 jpeg/simd/jsimd_i386.c
 create mode 100644 jpeg/simd/jsimd_x86_64.c
 create mode 100644 jpeg/simd/jsimdcfg.inc
 create mode 100644 jpeg/simd/jsimdcfg.inc.h
 create mode 100644 jpeg/simd/jsimdcpu.asm
 create mode 100644 jpeg/simd/jsimdext.inc
 delete mode 100644 jpeg/structure.doc
 create mode 100644 jpeg/transupp.h
 delete mode 100644 jpeg/usage.doc
 delete mode 100644 jpeg/wizard.doc

diff --git a/config/autoconf.mk.in b/config/autoconf.mk.in
index 436976463370..0749ac2f3136 100644
--- a/config/autoconf.mk.in
+++ b/config/autoconf.mk.in
@@ -168,6 +168,10 @@ VPX_AS_CONVERSION = @VPX_AS_CONVERSION@
 VPX_ASM_SUFFIX = @VPX_ASM_SUFFIX@
 VPX_X86_ASM = @VPX_X86_ASM@
 VPX_ARM_ASM = @VPX_ARM_ASM@
+LIBJPEG_TURBO_AS = @LIBJPEG_TURBO_AS@
+LIBJPEG_TURBO_ASFLAGS = @LIBJPEG_TURBO_ASFLAGS@
+LIBJPEG_TURBO_X86_ASM = @LIBJPEG_TURBO_X86_ASM@
+LIBJPEG_TURBO_X64_ASM = @LIBJPEG_TURBO_X64_ASM@
 NS_PRINTING = @NS_PRINTING@
 MOZ_PDF_PRINTING = @MOZ_PDF_PRINTING@
 MOZ_CRASHREPORTER = @MOZ_CRASHREPORTER@
diff --git a/configure.in b/configure.in
index 19e70e6e7d5b..05bd8e27836b 100644
--- a/configure.in
+++ b/configure.in
@@ -4971,6 +4971,10 @@ VPX_AS_CONVERSION=
 VPX_ASM_SUFFIX=
 VPX_X86_ASM=
 VPX_ARM_ASM=
+LIBJPEG_TURBO_AS=
+LIBJPEG_TURBO_ASFLAGS=
+LIBJPEG_TURBO_X86_ASM=
+LIBJPEG_TURBO_X64_ASM=
 MOZ_PANGO=1
 MOZ_PERMISSIONS=1
 MOZ_PLACES=1
@@ -6442,6 +6446,67 @@ fi
 AC_DEFINE_UNQUOTED(MOZ_CRASHREPORTER_ENABLE_PERCENT, $MOZ_CRASHREPORTER_ENABLE_PERCENT)
 
 dnl ========================================================
+dnl = libjpeg-turbo configuration
+dnl ========================================================
+
+dnl Detect if we can use yasm to compile libjpeg-turbo's optimized assembly
+dnl files.
+AC_MSG_CHECKING([for YASM assembler])
+AC_CHECK_PROGS(LIBJPEG_TURBO_AS, yasm, "")
+
+dnl XXX jlebar -- need a yasm version check here.
+
+if test -n "$LIBJPEG_TURBO_AS"; then
+  dnl LIBJPEG_TURBO_AS is empty unless AC_CHECK_PROGS found yasm on the PATH.
+  LIBJPEG_TURBO_AS="yasm"
+
+  dnl We have YASM; see if we support it on this platform.
+  case "$OS_ARCH:$OS_TEST" in
+  Linux:x86|Linux:i?86)
+    LIBJPEG_TURBO_ASFLAGS="-f elf32 -rnasm -pnasm -DPIC -DELF"
+    LIBJPEG_TURBO_X86_ASM=1
+  ;;
+  Linux:x86_64)
+    LIBJPEG_TURBO_ASFLAGS="-f elf64 -rnasm -pnasm -D__x86_64__ -DPIC -DELF"
+    LIBJPEG_TURBO_X64_ASM=1
+  ;;
+  SunOS:i?86)
+    LIBJPEG_TURBO_ASFLAGS="-f elf32 -rnasm -pnasm -DPIC -DELF"
+    LIBJPEG_TURBO_X86_ASM=1
+  ;;
+  SunOS:x86_64)
+    LIBJPEG_TURBO_ASFLAGS="-f elf64 -rnasm -pnasm -D__x86_64__ -DPIC -DELF"
+    LIBJPEG_TURBO_X64_ASM=1
+  ;;
+  Darwin:i?86)
+    LIBJPEG_TURBO_ASFLAGS="-f macho32 -rnasm -pnasm -DPIC -DMACHO"
+    LIBJPEG_TURBO_X86_ASM=1
+  ;;
+  Darwin:x86_64)
+    LIBJPEG_TURBO_ASFLAGS="-f macho64 -rnasm -pnasm -D__x86_64__ -DPIC -DMACHO"
+    LIBJPEG_TURBO_X64_ASM=1
+  ;;
+  WINNT:x86|WINNT:i?86)
+    LIBJPEG_TURBO_ASFLAGS="-f win32 -rnasm -pnasm -DPIC -DWIN32"
+    LIBJPEG_TURBO_X86_ASM=1
+  ;;
+  WINNT:x86_64)
+    LIBJPEG_TURBO_ASFLAGS="-f win64 -rnasm -pnasm -D__x86_64__ -DPIC -DWIN64"
+    LIBJPEG_TURBO_X64_ASM=1
+  ;;
+  esac
+
+fi # end have YASM
+
+if test -n "$LIBJPEG_TURBO_X86_ASM"; then
+  AC_DEFINE(LIBJPEG_TURBO_X86_ASM)
+elif test -n "$LIBJPEG_TURBO_X64_ASM"; then
+  AC_DEFINE(LIBJPEG_TURBO_X64_ASM)
+else
+  AC_MSG_WARN([No assembler or assembly support for libjpeg-turbo.  Using unoptimized C routines.])
+fi
+
+dnl ========================================================
 dnl = Enable compilation of specific extension modules
 dnl ========================================================
 
@@ -9209,6 +9274,10 @@ AC_SUBST(VPX_AS_CONVERSION)
 AC_SUBST(VPX_ASM_SUFFIX)
 AC_SUBST(VPX_X86_ASM)
 AC_SUBST(VPX_ARM_ASM)
+AC_SUBST(LIBJPEG_TURBO_AS)
+AC_SUBST(LIBJPEG_TURBO_ASFLAGS)
+AC_SUBST(LIBJPEG_TURBO_X86_ASM)
+AC_SUBST(LIBJPEG_TURBO_X64_ASM)
 
 if test "$USING_HCC"; then
    CC='${topsrcdir}/build/hcc'
diff --git a/jpeg/MOZCHANGES b/jpeg/MOZCHANGES
dissimilarity index 98%
index 0030370192bb..78c2752a1b2a 100644
--- a/jpeg/MOZCHANGES
+++ b/jpeg/MOZCHANGES
@@ -1,10 +1,69 @@
-
-Changes made to pristine jpeg source by mozilla.org developers.
-
-2003/08/18 -- change default mapping for METHODDEF, LOCAL, GLOBAL, EXTERN to better match NSPR
-
-2003/03/14  -- mingw bustage fix. w32api uses different header guard define
-               for <basestd.h> than msvc.
-
-????/??/??  -- Lots of undocumented changes. :(
-
+To upgrade to a new revision of libjpeg-turbo, do the following:
+
+* Check out libjpeg-turbo from SVN:
+
+    $ svn co https://libjpeg-turbo.svn.sourceforge.net/svnroot/libjpeg-turbo/trunk libjpeg-turbo
+
+* In a clean clone of mozilla-central, run the following commands
+
+    $ rm -rf jpeg
+    $ svn export --ignore-externals /path/to/libjpeg-turbo jpeg
+    $ cd jpeg
+
+* Now look through the new files and rm any which are not part of the build.
+  When I upgraded to libjpeg-turbo 1.1.0, the only files I kept which didn't match
+
+    *.c  *.h *.asm *.inc
+
+  were README and README-turbo.
+
+  You can easily look for all non *.c, *.h, *.asm, and *.inc files by running
+
+    $ hg status -nu | grep -v '\.\(c\|h\|asm\|inc\)$'
+
+  Once you're comfortable that you're only deleting files you want to delete
+  (and you've hg add'ed the files you want to keep), you can nuke the remaining
+  files with
+
+    $ hg status -nu | grep -v '\.\(c\|h\|asm\|inc\)$' | xargs rm
+
+  A helpful command for finding the *.c files which aren't *currently* part of
+  the build is
+
+    diff <(ls *.c | sort) <(grep -o '\w*\.c' Makefile.in | sort)
+
+  Of course, libjpeg-turbo might have added some new source files, so you'll
+  have to look through and figure out which of these files to keep.
+
+* Restore files modified in the Mozilla repository.
+
+    $ hg revert --no-backup Makefile.in jconfig.h jmorecfg.h simd/Makefile.in \
+      simd/jsimdcfg.inc jchuff.c jdhuff.c jdhuff.h MOZCHANGES
+
+* Update Makefile.in to build any new files.
+
+* Finally, tell hg that we've added or removed some files:
+
+    $ hg addremove
+
+
+== March 28, 2011 (initial commit, libjpeg-turbo v1.1.0 r469 2011-02-27) ==
+
+* Modified jmorecfg.h to define UINT8, UINT16, INT16, and INT32 in terms of
+  prtypes to fix a build error on Windows.
+
+* Defined INLINE as NS_ALWAYS_INLINE in jconfig.h.
+
+* Removed the following files which are licensed under the wxWindows license:
+
+    bmp.c, bmp.h, jpegut.c, jpgtest.cxx, rrtimer.h, rrutil.h, turbojpeg.h,
+    turbojpegl.c
+
+* Reverted the following files to what was previously in Mozilla's tree
+  (nominally libjpeg 6.2):
+
+    jchuff.c, jdhuff.c, jdhuff.h
+
+  since the versions of these files in libjpeg-turbo are also under the
+  wxWindows license.  (It would have been nicer to revert them to the new
+  libjpeg-8b code, but that doesn't easily integrate with libjpeg-turbo.)
diff --git a/jpeg/Makefile.in b/jpeg/Makefile.in
index 74ee17785299..d167eb52baab 100644
--- a/jpeg/Makefile.in
+++ b/jpeg/Makefile.in
@@ -15,11 +15,12 @@
 # The Original Code is mozilla.org code.
 #
 # The Initial Developer of the Original Code is
-# Netscape Communications Corporation.
-# Portions created by the Initial Developer are Copyright (C) 1998
+# Mozilla Corporation
+# Portions created by the Initial Developer are Copyright (C) 2010
 # the Initial Developer. All Rights Reserved.
 #
 # Contributor(s):
+#  Justin Lebar <justin.lebar@gmail.com>
 #
 # Alternatively, the contents of this file may be used under the terms of
 # either the GNU General Public License Version 2 or later (the "GPL"), or
@@ -42,6 +43,7 @@ VPATH		= @srcdir@
 
 include $(DEPTH)/config/autoconf.mk
 
+DIRS		= simd
 MODULE		= jpeg
 LIBRARY_NAME	= mozjpeg
 
@@ -58,64 +60,136 @@ endif
 GRE_MODULE	= 1
 
 CSRCS		= \
+		jcomapi.c \
 		jdapimin.c \
 		jdapistd.c \
-		jdatasrc.c \
 		jdatadst.c \
-		jdmaster.c \
+		jdatasrc.c \
+		jdcoefct.c \
+		jdcolor.c \
+		jddctmgr.c \
+		jdhuff.c \
 		jdinput.c \
+		jdmainct.c \
 		jdmarker.c \
-		jdhuff.c \
+		jdmaster.c \
+		jdmerge.c \
 		jdphuff.c \
-		jdmainct.c \
-		jdcoefct.c \
 		jdpostct.c \
-		jddctmgr.c \
-		jidctfst.c \
-		jidctflt.c \
-		jidctint.c \
 		jdsample.c \
-		jdcolor.c \
-		jquant1.c \
-		jquant2.c \
-		jdmerge.c \
-		jcomapi.c \
-		jutils.c \
+		jdtrans.c \
 		jerror.c \
-		jmemmgr.c \
-		jmemnobs.c \
 		jfdctflt.c \
 		jfdctfst.c \
 		jfdctint.c \
-		$(NULL)
-
-EXPORTS		= \
-		jconfig.h \
-		jerror.h \
-		jinclude.h \
-		jmorecfg.h \
-		jpeglib.h \
-		jpegint.h \
-		jwinfig.h \
-		jos2fig.h \
+		jidctflt.c \
+		jidctfst.c \
+		jidctint.c \
+		jidctred.c \
+		jmemmgr.c \
+		jmemnobs.c \
+		jquant1.c \
+		jquant2.c \
+		jutils.c \
 		$(NULL)
 
 # These files enable support for writing JPEGs
 CSRCS		+= \
 		jcapimin.c \
-		jcparam.c \
 		jcapistd.c \
-		jcmarker.c \
+		jccoefct.c \
+		jccolor.c \
+		jcdctmgr.c \
+		jchuff.c \
 		jcinit.c \
 		jcmainct.c \
-		jchuff.c \
-		jcsample.c \
+		jcmarker.c \
 		jcmaster.c \
-		jccoefct.c \
-		jccolor.c \
+		jcparam.c \
 		jcphuff.c \
-		jcdctmgr.c \
 		jcprepct.c \
+		jcsample.c \
+		$(NULL)
+
+AS=$(LIBJPEG_TURBO_AS)
+ASM_SUFFIX=asm
+# -I must name this tree's simd/ directory, where the shared .inc files live.
+ASFLAGS=$(LIBJPEG_TURBO_ASFLAGS) -I$(srcdir)/simd/
+ifeq ($(AS),yasm)
+  # yasm doesn't like -c
+  AS_DASH_C_FLAG=
+endif
+
+# No SIMD support?
+ifeq (,$(LIBJPEG_TURBO_X86_ASM)$(LIBJPEG_TURBO_X64_ASM))
+  CSRCS += jsimd_none.c
+endif
+
+ifeq (1,$(LIBJPEG_TURBO_X64_ASM))
+  CSRCS   += simd/jsimd_x86_64.c
+  ASFILES += \
+	simd/jccolss2-64.asm \
+	simd/jcqnts2f-64.asm \
+	simd/jcqnts2i-64.asm \
+	simd/jcsamss2-64.asm \
+	simd/jdcolss2-64.asm \
+	simd/jdmerss2-64.asm \
+	simd/jdsamss2-64.asm \
+	simd/jfss2fst-64.asm \
+	simd/jfss2int-64.asm \
+	simd/jfsseflt-64.asm \
+	simd/jiss2flt-64.asm \
+	simd/jiss2fst-64.asm \
+	simd/jiss2int-64.asm \
+	simd/jiss2red-64.asm \
+	$(NULL)
+endif
+
+ifeq (1,$(LIBJPEG_TURBO_X86_ASM))
+  CSRCS   +=simd/jsimd_i386.c
+  ASFILES += \
+	simd/jccolmmx.asm \
+	simd/jccolss2.asm \
+	simd/jcqnt3dn.asm \
+	simd/jcqntmmx.asm \
+	simd/jcqnts2f.asm \
+	simd/jcqnts2i.asm \
+	simd/jcqntsse.asm \
+	simd/jcsammmx.asm \
+	simd/jcsamss2.asm \
+	simd/jdcolmmx.asm \
+	simd/jdcolss2.asm \
+	simd/jdmermmx.asm \
+	simd/jdmerss2.asm \
+	simd/jdsammmx.asm \
+	simd/jdsamss2.asm \
+	simd/jf3dnflt.asm \
+	simd/jfmmxfst.asm \
+	simd/jfmmxint.asm \
+	simd/jfss2fst.asm \
+	simd/jfss2int.asm \
+	simd/jfsseflt.asm \
+	simd/ji3dnflt.asm \
+	simd/jimmxfst.asm \
+	simd/jimmxint.asm \
+	simd/jimmxred.asm \
+	simd/jiss2flt.asm \
+	simd/jiss2fst.asm \
+	simd/jiss2int.asm \
+	simd/jiss2red.asm \
+	simd/jisseflt.asm \
+	simd/jsimdcpu.asm \
+	$(NULL)
+endif
+
+# XXX: jwinfig.h and jos2fig.h are deleted by this patch, so they are no longer exported.
+EXPORTS		= \
+		jconfig.h \
+		jerror.h \
+		jinclude.h \
+		jmorecfg.h \
+		jpegint.h \
+		jpeglib.h \
 		$(NULL)
 
 # need static lib for some of the libimg componentry to link properly
diff --git a/jpeg/README b/jpeg/README
index 86cc20669d61..2ead09e64a46 100644
--- a/jpeg/README
+++ b/jpeg/README
@@ -1,22 +1,20 @@
-The Independent JPEG Group's JPEG software
-==========================================
+libjpeg-turbo note:  This file is mostly taken from the libjpeg v8b README
+file, and it is included only for reference.  Some parts of it may not apply to
+libjpeg-turbo.  Please see README-turbo.txt for information specific to the
+turbo version.
 
-README for release 6b of 27-Mar-1998
-====================================
 
-This distribution contains the sixth public release of the Independent JPEG
-Group's free JPEG software.  You are welcome to redistribute this software and
-to use it for any purpose, subject to the conditions under LEGAL ISSUES, below.
+The Independent JPEG Group's JPEG software
+==========================================
 
-Serious users of this software (particularly those incorporating it into
-larger programs) should contact IJG at jpeg-info@uunet.uu.net to be added to
-our electronic mailing list.  Mailing list members are notified of updates
-and have a chance to participate in technical discussions, etc.
+This distribution contains a release of the Independent JPEG Group's free JPEG
+software.  You are welcome to redistribute this software and to use it for any
+purpose, subject to the conditions under LEGAL ISSUES, below.
 
-This software is the work of Tom Lane, Philip Gladstone, Jim Boucher,
-Lee Crocker, Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi,
-Guido Vollbeding, Ge' Weijers, and other members of the Independent JPEG
-Group.
+This software is the work of Tom Lane, Guido Vollbeding, Philip Gladstone,
+Bill Allombert, Jim Boucher, Lee Crocker, Bob Friesenhahn, Ben Jackson,
+Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi, Ge' Weijers,
+and other members of the Independent JPEG Group.
 
 IJG is not affiliated with the official ISO JPEG standards committee.
 
@@ -30,27 +28,26 @@ OVERVIEW            General description of JPEG and the IJG software.
 LEGAL ISSUES        Copyright, lack of warranty, terms of distribution.
 REFERENCES          Where to learn more about JPEG.
 ARCHIVE LOCATIONS   Where to find newer versions of this software.
-RELATED SOFTWARE    Other stuff you should get.
 FILE FORMAT WARS    Software *not* to get.
 TO DO               Plans for future IJG releases.
 
 Other documentation files in the distribution are:
 
 User documentation:
-  install.doc       How to configure and install the IJG software.
-  usage.doc         Usage instructions for cjpeg, djpeg, jpegtran,
+  install.txt       How to configure and install the IJG software.
+  usage.txt         Usage instructions for cjpeg, djpeg, jpegtran,
                     rdjpgcom, and wrjpgcom.
-  *.1               Unix-style man pages for programs (same info as usage.doc).
-  wizard.doc        Advanced usage instructions for JPEG wizards only.
+  *.1               Unix-style man pages for programs (same info as usage.txt).
+  wizard.txt        Advanced usage instructions for JPEG wizards only.
   change.log        Version-to-version change highlights.
 Programmer and internal documentation:
-  libjpeg.doc       How to use the JPEG library in your own programs.
+  libjpeg.txt       How to use the JPEG library in your own programs.
   example.c         Sample code for calling the JPEG library.
-  structure.doc     Overview of the JPEG library's internal structure.
-  filelist.doc      Road map of IJG files.
-  coderules.doc     Coding style rules --- please read if you contribute code.
+  structure.txt     Overview of the JPEG library's internal structure.
+  filelist.txt      Road map of IJG files.
+  coderules.txt     Coding style rules --- please read if you contribute code.
 
-Please read at least the files install.doc and usage.doc.  Useful information
+Please read at least the files install.txt and usage.txt.  Some information
 can also be found in the JPEG FAQ (Frequently Asked Questions) article.  See
 ARCHIVE LOCATIONS below to find out where to obtain the FAQ article.
 
@@ -62,24 +59,27 @@ the order listed) before diving into the code.
 OVERVIEW
 ========
 
-This package contains C software to implement JPEG image compression and
-decompression.  JPEG (pronounced "jay-peg") is a standardized compression
-method for full-color and gray-scale images.  JPEG is intended for compressing
-"real-world" scenes; line drawings, cartoons and other non-realistic images
-are not its strong suit.  JPEG is lossy, meaning that the output image is not
-exactly identical to the input image.  Hence you must not use JPEG if you
-have to have identical output bits.  However, on typical photographic images,
-very good compression levels can be obtained with no visible change, and
-remarkably high compression levels are possible if you can tolerate a
-low-quality image.  For more details, see the references, or just experiment
-with various compression settings.
+This package contains C software to implement JPEG image encoding, decoding,
+and transcoding.  JPEG (pronounced "jay-peg") is a standardized compression
+method for full-color and gray-scale images.  JPEG's strong suit is compressing
+photographic images or other types of images which have smooth color and
+brightness transitions between neighboring pixels.  Images with sharp lines or
+other abrupt features may not compress well with JPEG, and a higher JPEG
+quality may have to be used to avoid visible compression artifacts with such
+images.
+
+JPEG is lossy, meaning that the output pixels are not necessarily identical to
+the input pixels.  However, on photographic content and other "smooth" images,
+very good compression ratios can be obtained with no visible compression
+artifacts, and extremely high compression ratios are possible if you are
+willing to sacrifice image quality (by reducing the "quality" setting in the
+compressor.)
 
 This software implements JPEG baseline, extended-sequential, and progressive
 compression processes.  Provision is made for supporting all variants of these
 processes, although some uncommon parameter settings aren't implemented yet.
-For legal reasons, we are not distributing code for the arithmetic-coding
-variants of JPEG; see LEGAL ISSUES.  We have made no provision for supporting
-the hierarchical or lossless processes defined in the standard.
+We have made no provision for supporting the hierarchical or lossless
+processes defined in the standard.
 
 We provide a set of library routines for reading and writing JPEG image files,
 plus two sample applications "cjpeg" and "djpeg", which use the library to
@@ -91,10 +91,11 @@ considerable functionality beyond the bare JPEG coding/decoding capability;
 for example, the color quantization modules are not strictly part of JPEG
 decoding, but they are essential for output to colormapped file formats or
 colormapped displays.  These extra functions can be compiled out of the
-library if not required for a particular application.  We have also included
-"jpegtran", a utility for lossless transcoding between different JPEG
-processes, and "rdjpgcom" and "wrjpgcom", two simple applications for
-inserting and extracting textual comments in JFIF files.
+library if not required for a particular application.
+
+We have also included "jpegtran", a utility for lossless transcoding between
+different JPEG processes, and "rdjpgcom" and "wrjpgcom", two simple
+applications for inserting and extracting textual comments in JFIF files.
 
 The emphasis in designing this software has been on achieving portability and
 flexibility, while also making it fast enough to be useful.  In particular,
@@ -127,7 +128,7 @@ with respect to this software, its quality, accuracy, merchantability, or
 fitness for a particular purpose.  This software is provided "AS IS", and you,
 its user, assume the entire risk as to its quality and accuracy.
 
-This software is copyright (C) 1991-1998, Thomas G. Lane.
+This software is copyright (C) 1991-2010, Thomas G. Lane, Guido Vollbeding.
 All Rights Reserved except as specified below.
 
 Permission is hereby granted to use, copy, modify, and distribute this
@@ -170,17 +171,8 @@ the foregoing paragraphs do.
 The Unix configuration script "configure" was produced with GNU Autoconf.
 It is copyright by the Free Software Foundation but is freely distributable.
 The same holds for its supporting scripts (config.guess, config.sub,
-ltconfig, ltmain.sh).  Another support script, install-sh, is copyright
-by M.I.T. but is also freely distributable.
-
-It appears that the arithmetic coding option of the JPEG spec is covered by
-patents owned by IBM, AT&T, and Mitsubishi.  Hence arithmetic coding cannot
-legally be used without obtaining one or more licenses.  For this reason,
-support for arithmetic coding has been removed from the free JPEG software.
-(Since arithmetic coding provides only a marginal gain over the unpatented
-Huffman mode, it is unlikely that very many implementations will support it.)
-So far as we are aware, there are no patent restrictions on the remaining
-code.
+ltmain.sh).  Another support script, install-sh, is copyright by X Consortium
+but is also freely distributable.
 
 The IJG distribution formerly included code to read and write GIF files.
 To avoid entanglement with the Unisys LZW patent, GIF reading support has
@@ -198,7 +190,7 @@ We are required to state that
 REFERENCES
 ==========
 
-We highly recommend reading one or more of these references before trying to
+We recommend reading one or more of these references before trying to
 understand the innards of the JPEG software.
 
 The best short technical introduction to the JPEG compression algorithm is
@@ -207,7 +199,7 @@ The best short technical introduction to the JPEG compression algorithm is
 (Adjacent articles in that issue discuss MPEG motion picture compression,
 applications of JPEG, and related topics.)  If you don't have the CACM issue
 handy, a PostScript file containing a revised version of Wallace's article is
-available at ftp://ftp.uu.net/graphics/jpeg/wallace.ps.gz.  The file (actually
+available at http://www.ijg.org/files/wallace.ps.gz.  The file (actually
 a preprint for an article that appeared in IEEE Trans. Consumer Electronics)
 omits the sample images that appeared in CACM, but it includes corrections
 and some added material.  Note: the Wallace article is copyright ACM and IEEE,
@@ -222,82 +214,53 @@ code but don't know much about data compression in general.  The book's JPEG
 sample code is far from industrial-strength, but when you are ready to look
 at a full implementation, you've got one here...
 
-The best full description of JPEG is the textbook "JPEG Still Image Data
-Compression Standard" by William B. Pennebaker and Joan L. Mitchell, published
-by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.  Price US$59.95, 638 pp.
-The book includes the complete text of the ISO JPEG standards (DIS 10918-1
-and draft DIS 10918-2).  This is by far the most complete exposition of JPEG
-in existence, and we highly recommend it.
-
-The JPEG standard itself is not available electronically; you must order a
-paper copy through ISO or ITU.  (Unless you feel a need to own a certified
-official copy, we recommend buying the Pennebaker and Mitchell book instead;
-it's much cheaper and includes a great deal of useful explanatory material.)
-In the USA, copies of the standard may be ordered from ANSI Sales at (212)
-642-4900, or from Global Engineering Documents at (800) 854-7179.  (ANSI
-doesn't take credit card orders, but Global does.)  It's not cheap: as of
-1992, ANSI was charging $95 for Part 1 and $47 for Part 2, plus 7%
-shipping/handling.  The standard is divided into two parts, Part 1 being the
-actual specification, while Part 2 covers compliance testing methods.  Part 1
-is titled "Digital Compression and Coding of Continuous-tone Still Images,
+The best currently available description of JPEG is the textbook "JPEG Still
+Image Data Compression Standard" by William B. Pennebaker and Joan L.
+Mitchell, published by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.
+Price US$59.95, 638 pp.  The book includes the complete text of the ISO JPEG
+standards (DIS 10918-1 and draft DIS 10918-2).
+
+The original JPEG standard is divided into two parts, Part 1 being the actual
+specification, while Part 2 covers compliance testing methods.  Part 1 is
+titled "Digital Compression and Coding of Continuous-tone Still Images,
 Part 1: Requirements and guidelines" and has document numbers ISO/IEC IS
 10918-1, ITU-T T.81.  Part 2 is titled "Digital Compression and Coding of
 Continuous-tone Still Images, Part 2: Compliance testing" and has document
 numbers ISO/IEC IS 10918-2, ITU-T T.83.
 
-Some extensions to the original JPEG standard are defined in JPEG Part 3,
-a newer ISO standard numbered ISO/IEC IS 10918-3 and ITU-T T.84.  IJG
-currently does not support any Part 3 extensions.
-
 The JPEG standard does not specify all details of an interchangeable file
 format.  For the omitted details we follow the "JFIF" conventions, revision
-1.02.  A copy of the JFIF spec is available from:
-	Literature Department
-	C-Cube Microsystems, Inc.
-	1778 McCarthy Blvd.
-	Milpitas, CA 95035
-	phone (408) 944-6300,  fax (408) 944-6314
-A PostScript version of this document is available by FTP at
-ftp://ftp.uu.net/graphics/jpeg/jfif.ps.gz.  There is also a plain text
-version at ftp://ftp.uu.net/graphics/jpeg/jfif.txt.gz, but it is missing
-the figures.
+1.02.  JFIF 1.02 has been adopted as an Ecma International Technical Report
+and thus received a formal publication status.  It is available as a free
+download in PDF format from
+http://www.ecma-international.org/publications/techreports/E-TR-098.htm.
+A PostScript version of the JFIF document is available at
+http://www.ijg.org/files/jfif.ps.gz.  There is also a plain text version at
+http://www.ijg.org/files/jfif.txt.gz, but it is missing the figures.
 
 The TIFF 6.0 file format specification can be obtained by FTP from
 ftp://ftp.sgi.com/graphics/tiff/TIFF6.ps.gz.  The JPEG incorporation scheme
 found in the TIFF 6.0 spec of 3-June-92 has a number of serious problems.
 IJG does not recommend use of the TIFF 6.0 design (TIFF Compression tag 6).
 Instead, we recommend the JPEG design proposed by TIFF Technical Note #2
-(Compression tag 7).  Copies of this Note can be obtained from ftp.sgi.com or
-from ftp://ftp.uu.net/graphics/jpeg/.  It is expected that the next revision
+(Compression tag 7).  Copies of this Note can be obtained from
+http://www.ijg.org/files/.  It is expected that the next revision
 of the TIFF spec will replace the 6.0 JPEG design with the Note's design.
 Although IJG's own code does not support TIFF/JPEG, the free libtiff library
-uses our library to implement TIFF/JPEG per the Note.  libtiff is available
-from ftp://ftp.sgi.com/graphics/tiff/.
+uses our library to implement TIFF/JPEG per the Note.
 
 
 ARCHIVE LOCATIONS
 =================
 
-The "official" archive site for this software is ftp.uu.net (Internet
-address 192.48.96.9).  The most recent released version can always be found
-there in directory graphics/jpeg.  This particular version will be archived
-as ftp://ftp.uu.net/graphics/jpeg/jpegsrc.v6b.tar.gz.  If you don't have
-direct Internet access, UUNET's archives are also available via UUCP; contact
-help@uunet.uu.net for information on retrieving files that way.
-
-Numerous Internet sites maintain copies of the UUNET files.  However, only
-ftp.uu.net is guaranteed to have the latest official version.
-
-You can also obtain this software in DOS-compatible "zip" archive format from
-the SimTel archives (ftp://ftp.simtel.net/pub/simtelnet/msdos/graphics/), or
-on CompuServe in the Graphics Support forum (GO CIS:GRAPHSUP), library 12
-"JPEG Tools".  Again, these versions may sometimes lag behind the ftp.uu.net
-release.
-
-The JPEG FAQ (Frequently Asked Questions) article is a useful source of
-general information about JPEG.  It is updated constantly and therefore is
-not included in this distribution.  The FAQ is posted every two weeks to
-Usenet newsgroups comp.graphics.misc, news.answers, and other groups.
+The "official" archive site for this software is www.ijg.org.
+The most recent released version can always be found there in
+directory "files".  This particular version will be archived as
+http://www.ijg.org/files/jpegsrc.v8b.tar.gz, and in Windows-compatible
+"zip" archive format as http://www.ijg.org/files/jpegsr8b.zip.
+
+The JPEG FAQ (Frequently Asked Questions) article is a source of some
+general information about JPEG.
 It is available on the World Wide Web at http://www.faqs.org/faqs/jpeg-faq/
 and other news.answers archive sites, including the official news.answers
 archive at rtfm.mit.edu: ftp://rtfm.mit.edu/pub/usenet/news.answers/jpeg-faq/.
@@ -307,79 +270,20 @@ with body
 	send usenet/news.answers/jpeg-faq/part2
 
 
-RELATED SOFTWARE
-================
-
-Numerous viewing and image manipulation programs now support JPEG.  (Quite a
-few of them use this library to do so.)  The JPEG FAQ described above lists
-some of the more popular free and shareware viewers, and tells where to
-obtain them on Internet.
-
-If you are on a Unix machine, we highly recommend Jef Poskanzer's free
-PBMPLUS software, which provides many useful operations on PPM-format image
-files.  In particular, it can convert PPM images to and from a wide range of
-other formats, thus making cjpeg/djpeg considerably more useful.  The latest
-version is distributed by the NetPBM group, and is available from numerous
-sites, notably ftp://wuarchive.wustl.edu/graphics/graphics/packages/NetPBM/.
-Unfortunately PBMPLUS/NETPBM is not nearly as portable as the IJG software is;
-you are likely to have difficulty making it work on any non-Unix machine.
-
-A different free JPEG implementation, written by the PVRG group at Stanford,
-is available from ftp://havefun.stanford.edu/pub/jpeg/.  This program
-is designed for research and experimentation rather than production use;
-it is slower, harder to use, and less portable than the IJG code, but it
-is easier to read and modify.  Also, the PVRG code supports lossless JPEG,
-which we do not.  (On the other hand, it doesn't do progressive JPEG.)
-
-
 FILE FORMAT WARS
 ================
 
-Some JPEG programs produce files that are not compatible with our library.
-The root of the problem is that the ISO JPEG committee failed to specify a
-concrete file format.  Some vendors "filled in the blanks" on their own,
-creating proprietary formats that no one else could read.  (For example, none
-of the early commercial JPEG implementations for the Macintosh were able to
-exchange compressed files.)
-
-The file format we have adopted is called JFIF (see REFERENCES).  This format
-has been agreed to by a number of major commercial JPEG vendors, and it has
-become the de facto standard.  JFIF is a minimal or "low end" representation.
-We recommend the use of TIFF/JPEG (TIFF revision 6.0 as modified by TIFF
-Technical Note #2) for "high end" applications that need to record a lot of
-additional data about an image.  TIFF/JPEG is fairly new and not yet widely
-supported, unfortunately.
-
-The upcoming JPEG Part 3 standard defines a file format called SPIFF.
-SPIFF is interoperable with JFIF, in the sense that most JFIF decoders should
-be able to read the most common variant of SPIFF.  SPIFF has some technical
-advantages over JFIF, but its major claim to fame is simply that it is an
-official standard rather than an informal one.  At this point it is unclear
-whether SPIFF will supersede JFIF or whether JFIF will remain the de-facto
-standard.  IJG intends to support SPIFF once the standard is frozen, but we
-have not decided whether it should become our default output format or not.
-(In any case, our decoder will remain capable of reading JFIF indefinitely.)
-
-Various proprietary file formats incorporating JPEG compression also exist.
-We have little or no sympathy for the existence of these formats.  Indeed,
+The ISO JPEG standards committee actually promotes different formats like
+"JPEG 2000" or "JPEG XR" which are incompatible with original DCT-based
+JPEG.  IJG therefore does not support these formats (see REFERENCES).  Indeed,
 one of the original reasons for developing this free software was to help
-force convergence on common, open format standards for JPEG files.  Don't
-use a proprietary file format!
+force convergence on common, interoperable format standards for JPEG files.
+Don't use an incompatible file format!
+(In any case, our decoder will remain capable of reading existing JPEG
+image files indefinitely.)
 
 
 TO DO
 =====
 
-The major thrust for v7 will probably be improvement of visual quality.
-The current method for scaling the quantization tables is known not to be
-very good at low Q values.  We also intend to investigate block boundary
-smoothing, "poor man's variable quantization", and other means of improving
-quality-vs-file-size performance without sacrificing compatibility.
-
-In future versions, we are considering supporting some of the upcoming JPEG
-Part 3 extensions --- principally, variable quantization and the SPIFF file
-format.
-
-As always, speeding things up is of great interest.
-
-Please send bug reports, offers of help, etc. to jpeg-info@uunet.uu.net.
+Please send bug reports, offers of help, etc. to jpeg-info@uc.ag.
diff --git a/jpeg/README-turbo.txt b/jpeg/README-turbo.txt
new file mode 100644
index 000000000000..fec34e0cdabb
--- /dev/null
+++ b/jpeg/README-turbo.txt
@@ -0,0 +1,304 @@
+*******************************************************************************
+**     Background
+*******************************************************************************
+
+libjpeg-turbo is a derivative of libjpeg which uses SIMD instructions (MMX,
+SSE2, etc.) to accelerate baseline JPEG compression and decompression on x86
+and x86-64 systems.  On such systems, libjpeg-turbo is generally 2-4x as fast
+as the unmodified version of libjpeg, all else being equal.
+
+libjpeg-turbo was originally based on libjpeg/SIMD by Miyasaka Masaru, but
+the TigerVNC and VirtualGL projects made numerous enhancements to the codec in
+2009, including improved support for Mac OS X, 64-bit support, support for
+32-bit and big endian pixel formats (RGBX, XBGR, etc.), accelerated Huffman
+encoding/decoding, and various bug fixes.  The goal was to produce a fully open
+source codec that could replace the partially closed source TurboJPEG/IPP codec
+used by VirtualGL and TurboVNC.  libjpeg-turbo generally performs in the range
+of 80-120% of TurboJPEG/IPP.  It is faster in some areas but slower in others.
+
+In early 2010, libjpeg-turbo spun off into its own independent project, with
+the goal of making high-speed JPEG compression/decompression technology
+available to a broader range of users and developers.  The libjpeg-turbo shared
+libraries can be used as drop-in replacements for libjpeg on most systems.
+
+
+*******************************************************************************
+**     License
+*******************************************************************************
+
+The TurboJPEG/OSS wrapper, as well as some of the optimizations to the Huffman
+encoder (jchuff.c) and decoder (jdhuff.c), were borrowed from VirtualGL, and
+thus any distribution of libjpeg-turbo which includes those files must, as a
+whole, be subject to the terms of the wxWindows Library Licence, Version 3.1.
+A copy of this license can be found in this directory under LICENSE.txt.  The
+wxWindows Library Licence is based on the LGPL but includes provisions which
+allow the Library to be statically linked into proprietary libraries and
+applications without requiring the resulting binaries to be distributed under
+the terms of the LGPL.
+
+The rest of the source code, apart from TurboJPEG/OSS and the Huffman codec
+optimizations, falls under a less restrictive, BSD-style license (see README.)
+You can choose to distribute libjpeg-turbo, as a whole, under this BSD-style
+license by simply removing TurboJPEG/OSS and replacing the optimized jchuff.c
+and jdhuff.c with their unoptimized counterparts from the libjpeg v6b source.
+
+
+*******************************************************************************
+**     Using libjpeg-turbo
+*******************************************************************************
+
+=============================
+Replacing libjpeg at Run Time
+=============================
+
+If a Unix application is dynamically linked with libjpeg, then you can replace
+libjpeg with libjpeg-turbo at run time by manipulating LD_LIBRARY_PATH.
+For instance:
+
+  [Using libjpeg]
+  > time cjpeg <vgl_5674_0098.ppm >vgl_5674_0098.jpg
+  real  0m0.392s
+  user  0m0.074s
+  sys   0m0.020s
+
+  [Using libjpeg-turbo]
+  > export LD_LIBRARY_PATH=/opt/libjpeg-turbo/{lib}:$LD_LIBRARY_PATH
+  > time cjpeg <vgl_5674_0098.ppm >vgl_5674_0098.jpg
+  real  0m0.109s
+  user  0m0.029s
+  sys   0m0.010s
+
+NOTE: {lib} can be lib, lib32, lib64, or lib/64, depending on the O/S and
+architecture.
+
+System administrators can also replace the libjpeg sym links in /usr/{lib} with
+links to the libjpeg dynamic library located in /opt/libjpeg-turbo/{lib}.  This
+will effectively accelerate every dynamically linked libjpeg application on the
+system.
+
+The libjpeg-turbo SDK for Visual C++ installs the libjpeg-turbo DLL
+(jpeg62.dll, jpeg7.dll, or jpeg8.dll, depending on whether libjpeg v6b, v7, or
+v8 emulation is enabled) into c:\libjpeg-turbo[64]\bin, and the PATH
+environment variable can be modified such that this directory is searched
+before any others that might contain a libjpeg DLL.  However, if a libjpeg
+DLL exists in an application's install directory, then Windows will load this
+DLL first whenever the application is launched.  Thus, if an application ships
+with jpeg62.dll, jpeg7.dll, or jpeg8.dll, then back up the application's
+version of this DLL and copy c:\libjpeg-turbo[64]\bin\jpeg*.dll into the
+application's install directory to accelerate it.
+
+The version of the libjpeg-turbo DLL distributed in the libjpeg-turbo SDK for
+Visual C++ requires the Visual C++ 2008 C run time DLL (msvcr90.dll).
+msvcr90.dll ships with more recent versions of Windows, but users of older
+Windows releases can obtain it from the Visual C++ 2008 Redistributable
+Package, which is available as a free download from Microsoft's web site.
+
+NOTE:  Features of libjpeg which require passing a C run time structure, such
+as a file handle, from an application to libjpeg will probably not work with
+the version of the libjpeg-turbo DLL distributed in the libjpeg-turbo SDK for
+Visual C++, unless the application is also built to use the Visual C++ 2008 C
+run time DLL.  In particular, this affects jpeg_stdio_dest() and
+jpeg_stdio_src().
+
+Mac applications typically embed their own copies of the libjpeg dylib inside
+the (hidden) application bundle, so it is not possible to globally replace
+libjpeg on OS X systems.  If an application uses a shared library version of
+libjpeg, then it may be possible to replace the application's version of it.
+This would generally involve copying libjpeg.*.dylib from libjpeg-turbo into
+the appropriate place in the application bundle and using install_name_tool to
+repoint the dylib to the new directory.  This requires an advanced knowledge of
+OS X and would not survive an upgrade or a re-install of the application.
+Thus, it is not recommended for most users.
+
+=======================
+Replacing TurboJPEG/IPP
+=======================
+
+libjpeg-turbo is a drop-in replacement for the TurboJPEG/IPP SDK used by
+VirtualGL 2.1.x and TurboVNC 0.6 (and prior.)  libjpeg-turbo contains a wrapper
+library (TurboJPEG/OSS) that emulates the TurboJPEG API using libjpeg-turbo
+instead of the closed source Intel Performance Primitives.  You can replace the
+TurboJPEG/IPP package on Linux systems with the libjpeg-turbo package in order
+to make existing releases of VirtualGL 2.1.x and TurboVNC 0.x use the new codec
+at run time.  Note that the 64-bit libjpeg-turbo packages contain only 64-bit
+binaries, whereas the TurboJPEG/IPP 64-bit packages contained both 64-bit and
+32-bit binaries.  Thus, to replace a TurboJPEG/IPP 64-bit package, install
+both the 64-bit and 32-bit versions of libjpeg-turbo.
+
+You can also build the VirtualGL 2.1.x and TurboVNC 0.6 source code with
+the libjpeg-turbo SDK instead of TurboJPEG/IPP.  It should work identically.
+libjpeg-turbo also includes static library versions of TurboJPEG/OSS, which
+are used to build TurboVNC 1.0 and later.
+
+========================================
+Using libjpeg-turbo in Your Own Programs
+========================================
+
+For the most part, libjpeg-turbo should work identically to libjpeg, so in
+most cases, an application can be built against libjpeg and then run against
+libjpeg-turbo.  On Unix systems (including Cygwin), you can build against
+libjpeg-turbo instead of libjpeg by setting
+
+  CPATH=/opt/libjpeg-turbo/include
+  and
+  LIBRARY_PATH=/opt/libjpeg-turbo/{lib}
+
+({lib} = lib32 or lib64, depending on whether you are building a 32-bit or a
+64-bit application.)
+
+If using MinGW, then set
+
+  CPATH=/c/libjpeg-turbo-gcc[64]/include
+  and
+  LIBRARY_PATH=/c/libjpeg-turbo-gcc[64]/lib
+
+Building against libjpeg-turbo is useful, for instance, if you want to build an
+application that leverages the libjpeg-turbo colorspace extensions (see below.)
+On Linux and Solaris systems, you would still need to manipulate
+LD_LIBRARY_PATH or create appropriate sym links to use libjpeg-turbo at run
+time.  On such systems, you can pass -R /opt/libjpeg-turbo/{lib} to the linker
+to force the use of libjpeg-turbo at run time rather than libjpeg (also useful
+if you want to leverage the colorspace extensions), or you can link against the
+libjpeg-turbo static library.
+
+To force a Linux, Solaris, or MinGW application to link against the static
+version of libjpeg-turbo, you can use the following linker options:
+
+  -Wl,-Bstatic -ljpeg -Wl,-Bdynamic
+
+On OS X, simply add /opt/libjpeg-turbo/lib/libjpeg.a to the linker command
+line (this also works on Linux and Solaris.)
+
+To build Visual C++ applications using libjpeg-turbo, add
+c:\libjpeg-turbo[64]\include to the system or user INCLUDE environment
+variable and c:\libjpeg-turbo[64]\lib to the system or user LIB environment
+variable, and then link against either jpeg.lib (to use the DLL version of
+libjpeg-turbo) or jpeg-static.lib (to use the static version of libjpeg-turbo.)
+
+=====================
+Colorspace Extensions
+=====================
+
+libjpeg-turbo includes extensions which allow JPEG images to be compressed
+directly from (and decompressed directly to) buffers which use BGR, BGRX,
+RGBX, XBGR, and XRGB pixel ordering.  This is implemented with six new
+colorspace constants:
+
+  JCS_EXT_RGB   /* red/green/blue */
+  JCS_EXT_RGBX  /* red/green/blue/x */
+  JCS_EXT_BGR   /* blue/green/red */
+  JCS_EXT_BGRX  /* blue/green/red/x */
+  JCS_EXT_XBGR  /* x/blue/green/red */
+  JCS_EXT_XRGB  /* x/red/green/blue */
+
+Setting cinfo.in_color_space (compression) or cinfo.out_color_space
+(decompression) to one of these values will cause libjpeg-turbo to read the
+red, green, and blue values from (or write them to) the appropriate position in
+the pixel when YUV conversion is performed.
+
+Your application can check for the existence of these extensions at compile
+time with:
+
+  #ifdef JCS_EXTENSIONS
+
+At run time, attempting to use these extensions with a version of libjpeg
+that doesn't support them will result in a "Bogus input colorspace" error.
+
+=================================
+libjpeg v7 and v8 API/ABI support
+=================================
+
+libjpeg v7 and v8 added new features to the API/ABI, and, unfortunately, the
+compression and decompression structures were extended in a backward-
+incompatible manner to accommodate these features.  Thus, programs which were
+built to use libjpeg v7 or v8 did not work with libjpeg-turbo, since it is
+based on the libjpeg v6b code base.  Although libjpeg v7 and v8 are still not
+as widely used as v6b, enough programs (including a few Linux distros) have
+made the switch that it was desirable to provide support for the libjpeg v7/v8
+API/ABI in libjpeg-turbo.
+
+Some of the libjpeg v7 and v8 features -- DCT scaling, to name one -- involve
+deep modifications to the code which cannot be accommodated by libjpeg-turbo
+without either breaking compatibility with libjpeg v6b or producing an
+unsupportable mess.  In order to fully support libjpeg v8 with all of its
+features, we would have to essentially port the SIMD extensions to the libjpeg
+v8 code base and maintain two separate code trees.  We are hesitant to do this
+until/unless the newer libjpeg code bases garner more community support and
+involvement and until/unless we have some notion of whether future libjpeg
+releases will also be backward-incompatible.
+
+By passing an argument of --with-jpeg7 or --with-jpeg8 to configure, or an
+argument of -DWITH_JPEG7=1 or -DWITH_JPEG8=1 to cmake, you can build a version
+of libjpeg-turbo which emulates the libjpeg v7 or v8 API/ABI, so that programs
+which are built against libjpeg v7 or v8 can be run with libjpeg-turbo.  The
+following section describes which libjpeg v7+ features are supported and which
+aren't.
+
+libjpeg v7 and v8 Features:
+---------------------------
+
+Fully supported:
+
+-- cjpeg: Separate quality settings for luminance and chrominance
+   Note that the libjpeg v7+ API was extended to accommodate this feature only
+   for convenience purposes.  It has always been possible to implement this
+   feature with libjpeg v6b (see rdswitch.c for an example.)
+
+-- cjpeg: 32-bit BMP support
+
+-- jpegtran: lossless cropping
+
+-- jpegtran: -perfect option
+
+-- rdjpgcom: -raw option
+
+-- rdjpgcom: locale awareness
+
+
+Fully supported when using libjpeg v7/v8 emulation:
+
+-- libjpeg: In-memory source and destination managers
+
+
+Not supported:
+
+-- libjpeg: DCT scaling in compressor
+   cinfo.scale_num and cinfo.scale_denom are silently ignored.
+
+-- libjpeg: IDCT scaling extensions in decompressor
+   libjpeg-turbo still supports IDCT scaling with scaling factors of 1/2, 1/4,
+   and 1/8 (same as libjpeg v6b.)
+
+-- libjpeg: Fancy downsampling in compressor
+   cinfo.do_fancy_downsampling is silently ignored.
+
+-- jpegtran: Scaling
+   Seems to depend on the DCT scaling feature, which isn't supported.
+
+
+*******************************************************************************
+**     Performance pitfalls
+*******************************************************************************
+
+===============
+Restart Markers
+===============
+
+The optimized Huffman decoder in libjpeg-turbo does not handle restart markers
+in a way that makes libjpeg happy, so it is necessary to use the slow Huffman
+decoder when decompressing a JPEG image that has restart markers.  This can
+cause the decompression performance to drop by as much as 20%, but the
+performance will still be much greater than that of libjpeg v6b.  Many
+consumer packages, such as PhotoShop, use restart markers when generating JPEG
+images, so images generated by those programs will experience this issue.
+
+===============================================
+Fast Integer Forward DCT at High Quality Levels
+===============================================
+
+The algorithm used by the SIMD-accelerated quantization function cannot produce
+correct results whenever the fast integer forward DCT is used along with a JPEG
+quality of 98-100.  Thus, libjpeg-turbo must use the non-SIMD quantization
+function in those cases.  This causes performance to drop by as much as 40%.
+It is therefore strongly advised that you use the slow integer forward DCT
+whenever encoding images with a JPEG quality of 98 or higher.
diff --git a/jpeg/cderror.h b/jpeg/cderror.h
index 70435e161c01..e19c475c5c5c 100644
--- a/jpeg/cderror.h
+++ b/jpeg/cderror.h
@@ -2,6 +2,7 @@
  * cderror.h
  *
  * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -45,6 +46,7 @@ JMESSAGE(JERR_BMP_BADHEADER, "Invalid BMP file: bad header length")
 JMESSAGE(JERR_BMP_BADPLANES, "Invalid BMP file: biPlanes not equal to 1")
 JMESSAGE(JERR_BMP_COLORSPACE, "BMP output must be grayscale or RGB")
 JMESSAGE(JERR_BMP_COMPRESSED, "Sorry, compressed BMPs not yet supported")
+JMESSAGE(JERR_BMP_EMPTY, "Empty BMP image")
 JMESSAGE(JERR_BMP_NOT, "Not a BMP file - does not start with BM")
 JMESSAGE(JTRC_BMP, "%ux%u 24-bit BMP image")
 JMESSAGE(JTRC_BMP_MAPPED, "%ux%u 8-bit colormapped BMP image")
diff --git a/jpeg/cdjpeg.c b/jpeg/cdjpeg.c
deleted file mode 100644
index b6250ff97cba..000000000000
--- a/jpeg/cdjpeg.c
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * cdjpeg.c
- *
- * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains common support routines used by the IJG application
- * programs (cjpeg, djpeg, jpegtran).
- */
-
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
-#include <ctype.h>		/* to declare isupper(), tolower() */
-#ifdef NEED_SIGNAL_CATCHER
-#include <signal.h>		/* to declare signal() */
-#endif
-#ifdef USE_SETMODE
-#include <fcntl.h>		/* to declare setmode()'s parameter macros */
-/* If you have setmode() but not <io.h>, just delete this line: */
-#include <io.h>			/* to declare setmode() */
-#endif
-
-
-/*
- * Signal catcher to ensure that temporary files are removed before aborting.
- * NB: for Amiga Manx C this is actually a global routine named _abort();
- * we put "#define signal_catcher _abort" in jconfig.h.  Talk about bogus...
- */
-
-#ifdef NEED_SIGNAL_CATCHER
-
-static j_common_ptr sig_cinfo;
-
-void				/* must be global for Manx C */
-signal_catcher (int signum)
-{
-  if (sig_cinfo != NULL) {
-    if (sig_cinfo->err != NULL) /* turn off trace output */
-      sig_cinfo->err->trace_level = 0;
-    jpeg_destroy(sig_cinfo);	/* clean up memory allocation & temp files */
-  }
-  exit(EXIT_FAILURE);
-}
-
-
-GLOBAL(void)
-enable_signal_catcher (j_common_ptr cinfo)
-{
-  sig_cinfo = cinfo;
-#ifdef SIGINT			/* not all systems have SIGINT */
-  signal(SIGINT, signal_catcher);
-#endif
-#ifdef SIGTERM			/* not all systems have SIGTERM */
-  signal(SIGTERM, signal_catcher);
-#endif
-}
-
-#endif
-
-
-/*
- * Optional progress monitor: display a percent-done figure on stderr.
- */
-
-#ifdef PROGRESS_REPORT
-
-METHODDEF(void)
-progress_monitor (j_common_ptr cinfo)
-{
-  cd_progress_ptr prog = (cd_progress_ptr) cinfo->progress;
-  int total_passes = prog->pub.total_passes + prog->total_extra_passes;
-  int percent_done = (int) (prog->pub.pass_counter*100L/prog->pub.pass_limit);
-
-  if (percent_done != prog->percent_done) {
-    prog->percent_done = percent_done;
-    if (total_passes > 1) {
-      fprintf(stderr, "\rPass %d/%d: %3d%% ",
-	      prog->pub.completed_passes + prog->completed_extra_passes + 1,
-	      total_passes, percent_done);
-    } else {
-      fprintf(stderr, "\r %3d%% ", percent_done);
-    }
-    fflush(stderr);
-  }
-}
-
-
-GLOBAL(void)
-start_progress_monitor (j_common_ptr cinfo, cd_progress_ptr progress)
-{
-  /* Enable progress display, unless trace output is on */
-  if (cinfo->err->trace_level == 0) {
-    progress->pub.progress_monitor = progress_monitor;
-    progress->completed_extra_passes = 0;
-    progress->total_extra_passes = 0;
-    progress->percent_done = -1;
-    cinfo->progress = &progress->pub;
-  }
-}
-
-
-GLOBAL(void)
-end_progress_monitor (j_common_ptr cinfo)
-{
-  /* Clear away progress display */
-  if (cinfo->err->trace_level == 0) {
-    fprintf(stderr, "\r                \r");
-    fflush(stderr);
-  }
-}
-
-#endif
-
-
-/*
- * Case-insensitive matching of possibly-abbreviated keyword switches.
- * keyword is the constant keyword (must be lower case already),
- * minchars is length of minimum legal abbreviation.
- */
-
-GLOBAL(boolean)
-keymatch (char * arg, const char * keyword, int minchars)
-{
-  register int ca, ck;
-  register int nmatched = 0;
-
-  while ((ca = *arg++) != '\0') {
-    if ((ck = *keyword++) == '\0')
-      return FALSE;		/* arg longer than keyword, no good */
-    if (isupper(ca))		/* force arg to lcase (assume ck is already) */
-      ca = tolower(ca);
-    if (ca != ck)
-      return FALSE;		/* no good */
-    nmatched++;			/* count matched characters */
-  }
-  /* reached end of argument; fail if it's too short for unique abbrev */
-  if (nmatched < minchars)
-    return FALSE;
-  return TRUE;			/* A-OK */
-}
-
-
-/*
- * Routines to establish binary I/O mode for stdin and stdout.
- * Non-Unix systems often require some hacking to get out of text mode.
- */
-
-GLOBAL(FILE *)
-read_stdin (void)
-{
-  FILE * input_file = stdin;
-
-#ifdef USE_SETMODE		/* need to hack file mode? */
-  setmode(fileno(stdin), O_BINARY);
-#endif
-#ifdef USE_FDOPEN		/* need to re-open in binary mode? */
-  if ((input_file = fdopen(fileno(stdin), READ_BINARY)) == NULL) {
-    fprintf(stderr, "Cannot reopen stdin\n");
-    exit(EXIT_FAILURE);
-  }
-#endif
-  return input_file;
-}
-
-
-GLOBAL(FILE *)
-write_stdout (void)
-{
-  FILE * output_file = stdout;
-
-#ifdef USE_SETMODE		/* need to hack file mode? */
-  setmode(fileno(stdout), O_BINARY);
-#endif
-#ifdef USE_FDOPEN		/* need to re-open in binary mode? */
-  if ((output_file = fdopen(fileno(stdout), WRITE_BINARY)) == NULL) {
-    fprintf(stderr, "Cannot reopen stdout\n");
-    exit(EXIT_FAILURE);
-  }
-#endif
-  return output_file;
-}
diff --git a/jpeg/cdjpeg.h b/jpeg/cdjpeg.h
index 2b387b6e5fa7..ed024ac3ae85 100644
--- a/jpeg/cdjpeg.h
+++ b/jpeg/cdjpeg.h
@@ -104,6 +104,7 @@ typedef struct cdjpeg_progress_mgr * cd_progress_ptr;
 #define jinit_write_targa	jIWrTarga
 #define read_quant_tables	RdQTables
 #define read_scan_script	RdScnScript
+#define set_quality_ratings     SetQRates
 #define set_quant_slots		SetQSlots
 #define set_sample_factors	SetSFacts
 #define read_color_map		RdCMap
@@ -131,8 +132,10 @@ EXTERN(djpeg_dest_ptr) jinit_write_targa JPP((j_decompress_ptr cinfo));
 /* cjpeg support routines (in rdswitch.c) */
 
 EXTERN(boolean) read_quant_tables JPP((j_compress_ptr cinfo, char * filename,
-				    int scale_factor, boolean force_baseline));
+				       boolean force_baseline));
 EXTERN(boolean) read_scan_script JPP((j_compress_ptr cinfo, char * filename));
+EXTERN(boolean) set_quality_ratings JPP((j_compress_ptr cinfo, char *arg,
+					 boolean force_baseline));
 EXTERN(boolean) set_quant_slots JPP((j_compress_ptr cinfo, char *arg));
 EXTERN(boolean) set_sample_factors JPP((j_compress_ptr cinfo, char *arg));
 
diff --git a/jpeg/change.log b/jpeg/change.log
deleted file mode 100644
index 74102c0db5aa..000000000000
--- a/jpeg/change.log
+++ /dev/null
@@ -1,217 +0,0 @@
-CHANGE LOG for Independent JPEG Group's JPEG software
-
-
-Version 6b  27-Mar-1998
------------------------
-
-jpegtran has new features for lossless image transformations (rotation
-and flipping) as well as "lossless" reduction to grayscale.
-
-jpegtran now copies comments by default; it has a -copy switch to enable
-copying all APPn blocks as well, or to suppress comments.  (Formerly it
-always suppressed comments and APPn blocks.)  jpegtran now also preserves
-JFIF version and resolution information.
-
-New decompressor library feature: COM and APPn markers found in the input
-file can be saved in memory for later use by the application.  (Before,
-you had to code this up yourself with a custom marker processor.)
-
-There is an unused field "void * client_data" now in compress and decompress
-parameter structs; this may be useful in some applications.
-
-JFIF version number information is now saved by the decoder and accepted by
-the encoder.  jpegtran uses this to copy the source file's version number,
-to ensure "jpegtran -copy all" won't create bogus files that contain JFXX
-extensions but claim to be version 1.01.  Applications that generate their
-own JFXX extension markers also (finally) have a supported way to cause the
-encoder to emit JFIF version number 1.02.
-
-djpeg's trace mode reports JFIF 1.02 thumbnail images as such, rather
-than as unknown APP0 markers.
-
-In -verbose mode, djpeg and rdjpgcom will try to print the contents of
-APP12 markers as text.  Some digital cameras store useful text information
-in APP12 markers.
-
-Handling of truncated data streams is more robust: blocks beyond the one in
-which the error occurs will be output as uniform gray, or left unchanged
-if decoding a progressive JPEG.  The appearance no longer depends on the
-Huffman tables being used.
-
-Huffman tables are checked for validity much more carefully than before.
-
-To avoid the Unisys LZW patent, djpeg's GIF output capability has been
-changed to produce "uncompressed GIFs", and cjpeg's GIF input capability
-has been removed altogether.  We're not happy about it either, but there
-seems to be no good alternative.
-
-The configure script now supports building libjpeg as a shared library
-on many flavors of Unix (all the ones that GNU libtool knows how to
-build shared libraries for).  Use "./configure --enable-shared" to
-try this out.
-
-New jconfig file and makefiles for Microsoft Visual C++ and Developer Studio.
-Also, a jconfig file and a build script for Metrowerks CodeWarrior
-on Apple Macintosh.  makefile.dj has been updated for DJGPP v2, and there
-are miscellaneous other minor improvements in the makefiles.
-
-jmemmac.c now knows how to create temporary files following Mac System 7
-conventions.
-
-djpeg's -map switch is now able to read raw-format PPM files reliably.
-
-cjpeg -progressive -restart no longer generates any unnecessary DRI markers.
-
-Multiple calls to jpeg_simple_progression for a single JPEG object
-no longer leak memory.
-
-
-Version 6a  7-Feb-96
---------------------
-
-Library initialization sequence modified to detect version mismatches
-and struct field packing mismatches between library and calling application.
-This change requires applications to be recompiled, but does not require
-any application source code change.
-
-All routine declarations changed to the style "GLOBAL(type) name ...",
-that is, GLOBAL, LOCAL, METHODDEF, EXTERN are now macros taking the
-routine's return type as an argument.  This makes it possible to add
-Microsoft-style linkage keywords to all the routines by changing just
-these macros.  Note that any application code that was using these macros
-will have to be changed.
-
-DCT coefficient quantization tables are now stored in normal array order
-rather than zigzag order.  Application code that calls jpeg_add_quant_table,
-or otherwise manipulates quantization tables directly, will need to be
-changed.  If you need to make such code work with either older or newer
-versions of the library, a test like "#if JPEG_LIB_VERSION >= 61" is
-recommended.
-
-djpeg's trace capability now dumps DQT tables in natural order, not zigzag
-order.  This allows the trace output to be made into a "-qtables" file
-more easily.
-
-New system-dependent memory manager module for use on Apple Macintosh.
-
-Fix bug in cjpeg's -smooth option: last one or two scanlines would be
-duplicates of the prior line unless the image height mod 16 was 1 or 2.
-
-Repair minor problems in VMS, BCC, MC6 makefiles.
-
-New configure script based on latest GNU Autoconf.
-
-Correct the list of include files needed by MetroWerks C for ccommand().
-
-Numerous small documentation updates.
-
-
-Version 6  2-Aug-95
--------------------
-
-Progressive JPEG support: library can read and write full progressive JPEG
-files.  A "buffered image" mode supports incremental decoding for on-the-fly
-display of progressive images.  Simply recompiling an existing IJG-v5-based
-decoder with v6 should allow it to read progressive files, though of course
-without any special progressive display.
-
-New "jpegtran" application performs lossless transcoding between different
-JPEG formats; primarily, it can be used to convert baseline to progressive
-JPEG and vice versa.  In support of jpegtran, the library now allows lossless
-reading and writing of JPEG files as DCT coefficient arrays.  This ability
-may be of use in other applications.
-
-Notes for programmers:
-* We changed jpeg_start_decompress() to be able to suspend; this makes all
-decoding modes available to suspending-input applications.  However,
-existing applications that use suspending input will need to be changed
-to check the return value from jpeg_start_decompress().  You don't need to
-do anything if you don't use a suspending data source.
-* We changed the interface to the virtual array routines: access_virt_array
-routines now take a count of the number of rows to access this time.  The
-last parameter to request_virt_array routines is now interpreted as the
-maximum number of rows that may be accessed at once, but not necessarily
-the height of every access.
-
-
-Version 5b  15-Mar-95
----------------------
-
-Correct bugs with grayscale images having v_samp_factor > 1.
-
-jpeg_write_raw_data() now supports output suspension.
-
-Correct bugs in "configure" script for case of compiling in
-a directory other than the one containing the source files.
-
-Repair bug in jquant1.c: sometimes didn't use as many colors as it could.
-
-Borland C makefile and jconfig file work under either MS-DOS or OS/2.
-
-Miscellaneous improvements to documentation.
-
-
-Version 5a  7-Dec-94
---------------------
-
-Changed color conversion roundoff behavior so that grayscale values are
-represented exactly.  (This causes test image files to change.)
-
-Make ordered dither use 16x16 instead of 4x4 pattern for a small quality
-improvement.
-
-New configure script based on latest GNU Autoconf.
-Fix configure script to handle CFLAGS correctly.
-Rename *.auto files to *.cfg, so that configure script still works if
-file names have been truncated for DOS.
-
-Fix bug in rdbmp.c: didn't allow for extra data between header and image.
-
-Modify rdppm.c/wrppm.c to handle 2-byte raw PPM/PGM formats for 12-bit data.
-
-Fix several bugs in rdrle.c.
-
-NEED_SHORT_EXTERNAL_NAMES option was broken.
-
-Revise jerror.h/jerror.c for more flexibility in message table.
-
-Repair oversight in jmemname.c NO_MKTEMP case: file could be there
-but unreadable.
-
-
-Version 5  24-Sep-94
---------------------
-
-Version 5 represents a nearly complete redesign and rewrite of the IJG
-software.  Major user-visible changes include:
-  * Automatic configuration simplifies installation for most Unix systems.
-  * A range of speed vs. image quality tradeoffs are supported.
-    This includes resizing of an image during decompression: scaling down
-    by a factor of 1/2, 1/4, or 1/8 is handled very efficiently.
-  * New programs rdjpgcom and wrjpgcom allow insertion and extraction
-    of text comments in a JPEG file.
-
-The application programmer's interface to the library has changed completely.
-Notable improvements include:
-  * We have eliminated the use of callback routines for handling the
-    uncompressed image data.  The application now sees the library as a
-    set of routines that it calls to read or write image data on a
-    scanline-by-scanline basis.
-  * The application image data is represented in a conventional interleaved-
-    pixel format, rather than as a separate array for each color channel.
-    This can save a copying step in many programs.
-  * The handling of compressed data has been cleaned up: the application can
-    supply routines to source or sink the compressed data.  It is possible to
-    suspend processing on source/sink buffer overrun, although this is not
-    supported in all operating modes.
-  * All static state has been eliminated from the library, so that multiple
-    instances of compression or decompression can be active concurrently.
-  * JPEG abbreviated datastream formats are supported, ie, quantization and
-    Huffman tables can be stored separately from the image data.
-  * And not only that, but the documentation of the library has improved
-    considerably!
-
-
-The last widely used release before the version 5 rewrite was version 4A of
-18-Feb-93.  Change logs before that point have been discarded, since they
-are not of much interest after the rewrite.
diff --git a/jpeg/cjpeg.c b/jpeg/cjpeg.c
deleted file mode 100644
index f2a929f0c9fa..000000000000
--- a/jpeg/cjpeg.c
+++ /dev/null
@@ -1,606 +0,0 @@
-/*
- * cjpeg.c
- *
- * Copyright (C) 1991-1998, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains a command-line user interface for the JPEG compressor.
- * It should work on any system with Unix- or MS-DOS-style command lines.
- *
- * Two different command line styles are permitted, depending on the
- * compile-time switch TWO_FILE_COMMANDLINE:
- *	cjpeg [options]  inputfile outputfile
- *	cjpeg [options]  [inputfile]
- * In the second style, output is always to standard output, which you'd
- * normally redirect to a file or pipe to some other program.  Input is
- * either from a named file or from standard input (typically redirected).
- * The second style is convenient on Unix but is unhelpful on systems that
- * don't support pipes.  Also, you MUST use the first style if your system
- * doesn't do binary I/O to stdin/stdout.
- * To simplify script writing, the "-outfile" switch is provided.  The syntax
- *	cjpeg [options]  -outfile outputfile  inputfile
- * works regardless of which command line style is used.
- */
-
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
-#include "jversion.h"		/* for version message */
-
-#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
-#ifdef __MWERKS__
-#include <SIOUX.h>              /* Metrowerks needs this */
-#include <console.h>		/* ... and this */
-#endif
-#ifdef THINK_C
-#include <console.h>		/* Think declares it here */
-#endif
-#endif
-
-
-/* Create the add-on message string table. */
-
-#define JMESSAGE(code,string)	string ,
-
-static const char * const cdjpeg_message_table[] = {
-#include "cderror.h"
-  NULL
-};
-
-
-/*
- * This routine determines what format the input file is,
- * and selects the appropriate input-reading module.
- *
- * To determine which family of input formats the file belongs to,
- * we may look only at the first byte of the file, since C does not
- * guarantee that more than one character can be pushed back with ungetc.
- * Looking at additional bytes would require one of these approaches:
- *     1) assume we can fseek() the input file (fails for piped input);
- *     2) assume we can push back more than one character (works in
- *        some C implementations, but unportable);
- *     3) provide our own buffering (breaks input readers that want to use
- *        stdio directly, such as the RLE library);
- * or  4) don't put back the data, and modify the input_init methods to assume
- *        they start reading after the start of file (also breaks RLE library).
- * #1 is attractive for MS-DOS but is untenable on Unix.
- *
- * The most portable solution for file types that can't be identified by their
- * first byte is to make the user tell us what they are.  This is also the
- * only approach for "raw" file types that contain only arbitrary values.
- * We presently apply this method for Targa files.  Most of the time Targa
- * files start with 0x00, so we recognize that case.  Potentially, however,
- * a Targa file could start with any byte value (byte 0 is the length of the
- * seldom-used ID field), so we provide a switch to force Targa input mode.
- */
-
-static boolean is_targa;	/* records user -targa switch */
-
-
-LOCAL(cjpeg_source_ptr)
-select_file_type (j_compress_ptr cinfo, FILE * infile)
-{
-  int c;
-
-  if (is_targa) {
-#ifdef TARGA_SUPPORTED
-    return jinit_read_targa(cinfo);
-#else
-    ERREXIT(cinfo, JERR_TGA_NOTCOMP);
-#endif
-  }
-
-  if ((c = getc(infile)) == EOF)
-    ERREXIT(cinfo, JERR_INPUT_EMPTY);
-  if (ungetc(c, infile) == EOF)
-    ERREXIT(cinfo, JERR_UNGETC_FAILED);
-
-  switch (c) {
-#ifdef BMP_SUPPORTED
-  case 'B':
-    return jinit_read_bmp(cinfo);
-#endif
-#ifdef GIF_SUPPORTED
-  case 'G':
-    return jinit_read_gif(cinfo);
-#endif
-#ifdef PPM_SUPPORTED
-  case 'P':
-    return jinit_read_ppm(cinfo);
-#endif
-#ifdef RLE_SUPPORTED
-  case 'R':
-    return jinit_read_rle(cinfo);
-#endif
-#ifdef TARGA_SUPPORTED
-  case 0x00:
-    return jinit_read_targa(cinfo);
-#endif
-  default:
-    ERREXIT(cinfo, JERR_UNKNOWN_FORMAT);
-    break;
-  }
-
-  return NULL;			/* suppress compiler warnings */
-}
-
-
-/*
- * Argument-parsing code.
- * The switch parser is designed to be useful with DOS-style command line
- * syntax, ie, intermixed switches and file names, where only the switches
- * to the left of a given file name affect processing of that file.
- * The main program in this file doesn't actually use this capability...
- */
-
-
-static const char * progname;	/* program name for error messages */
-static char * outfilename;	/* for -outfile switch */
-
-
-LOCAL(void)
-usage (void)
-/* complain about bad command line */
-{
-  fprintf(stderr, "usage: %s [switches] ", progname);
-#ifdef TWO_FILE_COMMANDLINE
-  fprintf(stderr, "inputfile outputfile\n");
-#else
-  fprintf(stderr, "[inputfile]\n");
-#endif
-
-  fprintf(stderr, "Switches (names may be abbreviated):\n");
-  fprintf(stderr, "  -quality N     Compression quality (0..100; 5-95 is useful range)\n");
-  fprintf(stderr, "  -grayscale     Create monochrome JPEG file\n");
-#ifdef ENTROPY_OPT_SUPPORTED
-  fprintf(stderr, "  -optimize      Optimize Huffman table (smaller file, but slow compression)\n");
-#endif
-#ifdef C_PROGRESSIVE_SUPPORTED
-  fprintf(stderr, "  -progressive   Create progressive JPEG file\n");
-#endif
-#ifdef TARGA_SUPPORTED
-  fprintf(stderr, "  -targa         Input file is Targa format (usually not needed)\n");
-#endif
-  fprintf(stderr, "Switches for advanced users:\n");
-#ifdef DCT_ISLOW_SUPPORTED
-  fprintf(stderr, "  -dct int       Use integer DCT method%s\n",
-	  (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
-#endif
-#ifdef DCT_IFAST_SUPPORTED
-  fprintf(stderr, "  -dct fast      Use fast integer DCT (less accurate)%s\n",
-	  (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
-#endif
-#ifdef DCT_FLOAT_SUPPORTED
-  fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
-	  (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
-#endif
-  fprintf(stderr, "  -restart N     Set restart interval in rows, or in blocks with B\n");
-#ifdef INPUT_SMOOTHING_SUPPORTED
-  fprintf(stderr, "  -smooth N      Smooth dithered input (N=1..100 is strength)\n");
-#endif
-  fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
-  fprintf(stderr, "  -outfile name  Specify name for output file\n");
-  fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
-  fprintf(stderr, "Switches for wizards:\n");
-#ifdef C_ARITH_CODING_SUPPORTED
-  fprintf(stderr, "  -arithmetic    Use arithmetic coding\n");
-#endif
-  fprintf(stderr, "  -baseline      Force baseline quantization tables\n");
-  fprintf(stderr, "  -qtables file  Use quantization tables given in file\n");
-  fprintf(stderr, "  -qslots N[,...]    Set component quantization tables\n");
-  fprintf(stderr, "  -sample HxV[,...]  Set component sampling factors\n");
-#ifdef C_MULTISCAN_FILES_SUPPORTED
-  fprintf(stderr, "  -scans file    Create multi-scan JPEG per script file\n");
-#endif
-  exit(EXIT_FAILURE);
-}
-
-
-LOCAL(int)
-parse_switches (j_compress_ptr cinfo, int argc, char **argv,
-		int last_file_arg_seen, boolean for_real)
-/* Parse optional switches.
- * Returns argv[] index of first file-name argument (== argc if none).
- * Any file names with indexes <= last_file_arg_seen are ignored;
- * they have presumably been processed in a previous iteration.
- * (Pass 0 for last_file_arg_seen on the first or only iteration.)
- * for_real is FALSE on the first (dummy) pass; we may skip any expensive
- * processing.
- */
-{
-  int argn;
-  char * arg;
-  int quality;			/* -quality parameter */
-  int q_scale_factor;		/* scaling percentage for -qtables */
-  boolean force_baseline;
-  boolean simple_progressive;
-  char * qtablefile = NULL;	/* saves -qtables filename if any */
-  char * qslotsarg = NULL;	/* saves -qslots parm if any */
-  char * samplearg = NULL;	/* saves -sample parm if any */
-  char * scansarg = NULL;	/* saves -scans parm if any */
-
-  /* Set up default JPEG parameters. */
-  /* Note that default -quality level need not, and does not,
-   * match the default scaling for an explicit -qtables argument.
-   */
-  quality = 75;			/* default -quality value */
-  q_scale_factor = 100;		/* default to no scaling for -qtables */
-  force_baseline = FALSE;	/* by default, allow 16-bit quantizers */
-  simple_progressive = FALSE;
-  is_targa = FALSE;
-  outfilename = NULL;
-  cinfo->err->trace_level = 0;
-
-  /* Scan command line options, adjust parameters */
-
-  for (argn = 1; argn < argc; argn++) {
-    arg = argv[argn];
-    if (*arg != '-') {
-      /* Not a switch, must be a file name argument */
-      if (argn <= last_file_arg_seen) {
-	outfilename = NULL;	/* -outfile applies to just one input file */
-	continue;		/* ignore this name if previously processed */
-      }
-      break;			/* else done parsing switches */
-    }
-    arg++;			/* advance past switch marker character */
-
-    if (keymatch(arg, "arithmetic", 1)) {
-      /* Use arithmetic coding. */
-#ifdef C_ARITH_CODING_SUPPORTED
-      cinfo->arith_code = TRUE;
-#else
-      fprintf(stderr, "%s: sorry, arithmetic coding not supported\n",
-	      progname);
-      exit(EXIT_FAILURE);
-#endif
-
-    } else if (keymatch(arg, "baseline", 1)) {
-      /* Force baseline-compatible output (8-bit quantizer values). */
-      force_baseline = TRUE;
-
-    } else if (keymatch(arg, "dct", 2)) {
-      /* Select DCT algorithm. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (keymatch(argv[argn], "int", 1)) {
-	cinfo->dct_method = JDCT_ISLOW;
-      } else if (keymatch(argv[argn], "fast", 2)) {
-	cinfo->dct_method = JDCT_IFAST;
-      } else if (keymatch(argv[argn], "float", 2)) {
-	cinfo->dct_method = JDCT_FLOAT;
-      } else
-	usage();
-
-    } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
-      /* Enable debug printouts. */
-      /* On first -d, print version identification */
-      static boolean printed_version = FALSE;
-
-      if (! printed_version) {
-	fprintf(stderr, "Independent JPEG Group's CJPEG, version %s\n%s\n",
-		JVERSION, JCOPYRIGHT);
-	printed_version = TRUE;
-      }
-      cinfo->err->trace_level++;
-
-    } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
-      /* Force a monochrome JPEG file to be generated. */
-      jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
-
-    } else if (keymatch(arg, "maxmemory", 3)) {
-      /* Maximum memory in Kb (or Mb with 'm'). */
-      long lval;
-      char ch = 'x';
-
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
-	usage();
-      if (ch == 'm' || ch == 'M')
-	lval *= 1000L;
-      cinfo->mem->max_memory_to_use = lval * 1000L;
-
-    } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
-      /* Enable entropy parm optimization. */
-#ifdef ENTROPY_OPT_SUPPORTED
-      cinfo->optimize_coding = TRUE;
-#else
-      fprintf(stderr, "%s: sorry, entropy optimization was not compiled\n",
-	      progname);
-      exit(EXIT_FAILURE);
-#endif
-
-    } else if (keymatch(arg, "outfile", 4)) {
-      /* Set output file name. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      outfilename = argv[argn];	/* save it away for later use */
-
-    } else if (keymatch(arg, "progressive", 1)) {
-      /* Select simple progressive mode. */
-#ifdef C_PROGRESSIVE_SUPPORTED
-      simple_progressive = TRUE;
-      /* We must postpone execution until num_components is known. */
-#else
-      fprintf(stderr, "%s: sorry, progressive output was not compiled\n",
-	      progname);
-      exit(EXIT_FAILURE);
-#endif
-
-    } else if (keymatch(arg, "quality", 1)) {
-      /* Quality factor (quantization table scaling factor). */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%d", &quality) != 1)
-	usage();
-      /* Change scale factor in case -qtables is present. */
-      q_scale_factor = jpeg_quality_scaling(quality);
-
-    } else if (keymatch(arg, "qslots", 2)) {
-      /* Quantization table slot numbers. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      qslotsarg = argv[argn];
-      /* Must delay setting qslots until after we have processed any
-       * colorspace-determining switches, since jpeg_set_colorspace sets
-       * default quant table numbers.
-       */
-
-    } else if (keymatch(arg, "qtables", 2)) {
-      /* Quantization tables fetched from file. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      qtablefile = argv[argn];
-      /* We postpone actually reading the file in case -quality comes later. */
-
-    } else if (keymatch(arg, "restart", 1)) {
-      /* Restart interval in MCU rows (or in MCUs with 'b'). */
-      long lval;
-      char ch = 'x';
-
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
-	usage();
-      if (lval < 0 || lval > 65535L)
-	usage();
-      if (ch == 'b' || ch == 'B') {
-	cinfo->restart_interval = (unsigned int) lval;
-	cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */
-      } else {
-	cinfo->restart_in_rows = (int) lval;
-	/* restart_interval will be computed during startup */
-      }
-
-    } else if (keymatch(arg, "sample", 2)) {
-      /* Set sampling factors. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      samplearg = argv[argn];
-      /* Must delay setting sample factors until after we have processed any
-       * colorspace-determining switches, since jpeg_set_colorspace sets
-       * default sampling factors.
-       */
-
-    } else if (keymatch(arg, "scans", 2)) {
-      /* Set scan script. */
-#ifdef C_MULTISCAN_FILES_SUPPORTED
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      scansarg = argv[argn];
-      /* We must postpone reading the file in case -progressive appears. */
-#else
-      fprintf(stderr, "%s: sorry, multi-scan output was not compiled\n",
-	      progname);
-      exit(EXIT_FAILURE);
-#endif
-
-    } else if (keymatch(arg, "smooth", 2)) {
-      /* Set input smoothing factor. */
-      int val;
-
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%d", &val) != 1)
-	usage();
-      if (val < 0 || val > 100)
-	usage();
-      cinfo->smoothing_factor = val;
-
-    } else if (keymatch(arg, "targa", 1)) {
-      /* Input file is Targa format. */
-      is_targa = TRUE;
-
-    } else {
-      usage();			/* bogus switch */
-    }
-  }
-
-  /* Post-switch-scanning cleanup */
-
-  if (for_real) {
-
-    /* Set quantization tables for selected quality. */
-    /* Some or all may be overridden if -qtables is present. */
-    jpeg_set_quality(cinfo, quality, force_baseline);
-
-    if (qtablefile != NULL)	/* process -qtables if it was present */
-      if (! read_quant_tables(cinfo, qtablefile,
-			      q_scale_factor, force_baseline))
-	usage();
-
-    if (qslotsarg != NULL)	/* process -qslots if it was present */
-      if (! set_quant_slots(cinfo, qslotsarg))
-	usage();
-
-    if (samplearg != NULL)	/* process -sample if it was present */
-      if (! set_sample_factors(cinfo, samplearg))
-	usage();
-
-#ifdef C_PROGRESSIVE_SUPPORTED
-    if (simple_progressive)	/* process -progressive; -scans can override */
-      jpeg_simple_progression(cinfo);
-#endif
-
-#ifdef C_MULTISCAN_FILES_SUPPORTED
-    if (scansarg != NULL)	/* process -scans if it was present */
-      if (! read_scan_script(cinfo, scansarg))
-	usage();
-#endif
-  }
-
-  return argn;			/* return index of next arg (file name) */
-}
-
-
-/*
- * The main program.
- */
-
-int
-main (int argc, char **argv)
-{
-  struct jpeg_compress_struct cinfo;
-  struct jpeg_error_mgr jerr;
-#ifdef PROGRESS_REPORT
-  struct cdjpeg_progress_mgr progress;
-#endif
-  int file_index;
-  cjpeg_source_ptr src_mgr;
-  FILE * input_file;
-  FILE * output_file;
-  JDIMENSION num_scanlines;
-
-  /* On Mac, fetch a command line. */
-#ifdef USE_CCOMMAND
-  argc = ccommand(&argv);
-#endif
-
-  progname = argv[0];
-  if (progname == NULL || progname[0] == 0)
-    progname = "cjpeg";		/* in case C library doesn't provide it */
-
-  /* Initialize the JPEG compression object with default error handling. */
-  cinfo.err = jpeg_std_error(&jerr);
-  jpeg_create_compress(&cinfo);
-  /* Add some application-specific error messages (from cderror.h) */
-  jerr.addon_message_table = cdjpeg_message_table;
-  jerr.first_addon_message = JMSG_FIRSTADDONCODE;
-  jerr.last_addon_message = JMSG_LASTADDONCODE;
-
-  /* Now safe to enable signal catcher. */
-#ifdef NEED_SIGNAL_CATCHER
-  enable_signal_catcher((j_common_ptr) &cinfo);
-#endif
-
-  /* Initialize JPEG parameters.
-   * Much of this may be overridden later.
-   * In particular, we don't yet know the input file's color space,
-   * but we need to provide some value for jpeg_set_defaults() to work.
-   */
-
-  cinfo.in_color_space = JCS_RGB; /* arbitrary guess */
-  jpeg_set_defaults(&cinfo);
-
-  /* Scan command line to find file names.
-   * It is convenient to use just one switch-parsing routine, but the switch
-   * values read here are ignored; we will rescan the switches after opening
-   * the input file.
-   */
-
-  file_index = parse_switches(&cinfo, argc, argv, 0, FALSE);
-
-#ifdef TWO_FILE_COMMANDLINE
-  /* Must have either -outfile switch or explicit output file name */
-  if (outfilename == NULL) {
-    if (file_index != argc-2) {
-      fprintf(stderr, "%s: must name one input and one output file\n",
-	      progname);
-      usage();
-    }
-    outfilename = argv[file_index+1];
-  } else {
-    if (file_index != argc-1) {
-      fprintf(stderr, "%s: must name one input and one output file\n",
-	      progname);
-      usage();
-    }
-  }
-#else
-  /* Unix style: expect zero or one file name */
-  if (file_index < argc-1) {
-    fprintf(stderr, "%s: only one input file\n", progname);
-    usage();
-  }
-#endif /* TWO_FILE_COMMANDLINE */
-
-  /* Open the input file. */
-  if (file_index < argc) {
-    if ((input_file = fopen(argv[file_index], READ_BINARY)) == NULL) {
-      fprintf(stderr, "%s: can't open %s\n", progname, argv[file_index]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    /* default input file is stdin */
-    input_file = read_stdin();
-  }
-
-  /* Open the output file. */
-  if (outfilename != NULL) {
-    if ((output_file = fopen(outfilename, WRITE_BINARY)) == NULL) {
-      fprintf(stderr, "%s: can't open %s\n", progname, outfilename);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    /* default output file is stdout */
-    output_file = write_stdout();
-  }
-
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr) &cinfo, &progress);
-#endif
-
-  /* Figure out the input file format, and set up to read it. */
-  src_mgr = select_file_type(&cinfo, input_file);
-  src_mgr->input_file = input_file;
-
-  /* Read the input file header to obtain file size & colorspace. */
-  (*src_mgr->start_input) (&cinfo, src_mgr);
-
-  /* Now that we know input colorspace, fix colorspace-dependent defaults */
-  jpeg_default_colorspace(&cinfo);
-
-  /* Adjust default compression parameters by re-parsing the options */
-  file_index = parse_switches(&cinfo, argc, argv, 0, TRUE);
-
-  /* Specify data destination for compression */
-  jpeg_stdio_dest(&cinfo, output_file);
-
-  /* Start compressor */
-  jpeg_start_compress(&cinfo, TRUE);
-
-  /* Process data */
-  while (cinfo.next_scanline < cinfo.image_height) {
-    num_scanlines = (*src_mgr->get_pixel_rows) (&cinfo, src_mgr);
-    (void) jpeg_write_scanlines(&cinfo, src_mgr->buffer, num_scanlines);
-  }
-
-  /* Finish compression and release memory */
-  (*src_mgr->finish_input) (&cinfo, src_mgr);
-  jpeg_finish_compress(&cinfo);
-  jpeg_destroy_compress(&cinfo);
-
-  /* Close files, if we opened them */
-  if (input_file != stdin)
-    fclose(input_file);
-  if (output_file != stdout)
-    fclose(output_file);
-
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr) &cinfo);
-#endif
-
-  /* All done. */
-  exit(jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
-  return 0;			/* suppress no-return-value warnings */
-}
diff --git a/jpeg/ckconfig.c b/jpeg/ckconfig.c
deleted file mode 100644
index 34baf795b009..000000000000
--- a/jpeg/ckconfig.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * ckconfig.c
- *
- * Copyright (C) 1991-1994, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- */
-
-/*
- * This program is intended to help you determine how to configure the JPEG
- * software for installation on a particular system.  The idea is to try to
- * compile and execute this program.  If your compiler fails to compile the
- * program, make changes as indicated in the comments below.  Once you can
- * compile the program, run it, and it will produce a "jconfig.h" file for
- * your system.
- *
- * As a general rule, each time you try to compile this program,
- * pay attention only to the *first* error message you get from the compiler.
- * Many C compilers will issue lots of spurious error messages once they
- * have gotten confused.  Go to the line indicated in the first error message,
- * and read the comments preceding that line to see what to change.
- *
- * Almost all of the edits you may need to make to this program consist of
- * changing a line that reads "#define SOME_SYMBOL" to "#undef SOME_SYMBOL",
- * or vice versa.  This is called defining or undefining that symbol.
- */
-
-
-/* First we must see if your system has the include files we need.
- * We start out with the assumption that your system has all the ANSI-standard
- * include files.  If you get any error trying to include one of these files,
- * undefine the corresponding HAVE_xxx symbol.
- */
-
-#define HAVE_STDDEF_H		/* replace 'define' by 'undef' if error here */
-#ifdef HAVE_STDDEF_H		/* next line will be skipped if you undef... */
-#include <stddef.h>
-#endif
-
-#define HAVE_STDLIB_H		/* same thing for stdlib.h */
-#ifdef HAVE_STDLIB_H
-#include <stdlib.h>
-#endif
-
-#include <stdio.h>		/* If you ain't got this, you ain't got C. */
-
-/* We have to see if your string functions are defined by
- * strings.h (old BSD convention) or string.h (everybody else).
- * We try the non-BSD convention first; define NEED_BSD_STRINGS
- * if the compiler says it can't find string.h.
- */
-
-#undef NEED_BSD_STRINGS
-
-#ifdef NEED_BSD_STRINGS
-#include <strings.h>
-#else
-#include <string.h>
-#endif
-
-/* On some systems (especially older Unix machines), type size_t is
- * defined only in the include file <sys/types.h>.  If you get a failure
- * on the size_t test below, try defining NEED_SYS_TYPES_H.
- */
-
-#undef NEED_SYS_TYPES_H		/* start by assuming we don't need it */
-#ifdef NEED_SYS_TYPES_H
-#include <sys/types.h>
-#endif
-
-
-/* Usually type size_t is defined in one of the include files we've included
- * above.  If not, you'll get an error on the "typedef size_t my_size_t;" line.
- * In that case, first try defining NEED_SYS_TYPES_H just above.
- * If that doesn't work, you'll have to search through your system library
- * to figure out which include file defines "size_t".  Look for a line that
- * says "typedef something-or-other size_t;".  Then, change the line below
- * that says "#include <someincludefile.h>" to instead include the file
- * you found size_t in, and define NEED_SPECIAL_INCLUDE.  If you can't find
- * type size_t anywhere, try replacing "#include <someincludefile.h>" with
- * "typedef unsigned int size_t;".
- */
-
-#undef NEED_SPECIAL_INCLUDE	/* assume we DON'T need it, for starters */
-
-#ifdef NEED_SPECIAL_INCLUDE
-#include <someincludefile.h>
-#endif
-
-typedef size_t my_size_t;	/* The payoff: do we have size_t now? */
-
-
-/* The next question is whether your compiler supports ANSI-style function
- * prototypes.  You need to know this in order to choose between using
- * makefile.ansi and using makefile.unix.
- * The #define line below is set to assume you have ANSI function prototypes.
- * If you get an error in this group of lines, undefine HAVE_PROTOTYPES.
- */
-
-#define HAVE_PROTOTYPES
-
-#ifdef HAVE_PROTOTYPES
-int testfunction (int arg1, int * arg2); /* check prototypes */
-
-struct methods_struct {		/* check method-pointer declarations */
-  int (*error_exit) (char *msgtext);
-  int (*trace_message) (char *msgtext);
-  int (*another_method) (void);
-};
-
-int testfunction (int arg1, int * arg2) /* check definitions */
-{
-  return arg2[arg1];
-}
-
-int test2function (void)	/* check void arg list */
-{
-  return 0;
-}
-#endif
-
-
-/* Now we want to find out if your compiler knows what "unsigned char" means.
- * If you get an error on the "unsigned char un_char;" line,
- * then undefine HAVE_UNSIGNED_CHAR.
- */
-
-#define HAVE_UNSIGNED_CHAR
-
-#ifdef HAVE_UNSIGNED_CHAR
-unsigned char un_char;
-#endif
-
-
-/* Now we want to find out if your compiler knows what "unsigned short" means.
- * If you get an error on the "unsigned short un_short;" line,
- * then undefine HAVE_UNSIGNED_SHORT.
- */
-
-#define HAVE_UNSIGNED_SHORT
-
-#ifdef HAVE_UNSIGNED_SHORT
-unsigned short un_short;
-#endif
-
-
-/* Now we want to find out if your compiler understands type "void".
- * If you get an error anywhere in here, undefine HAVE_VOID.
- */
-
-#define HAVE_VOID
-
-#ifdef HAVE_VOID
-/* Caution: a C++ compiler will insist on complete prototypes */
-typedef void * void_ptr;	/* check void * */
-#ifdef HAVE_PROTOTYPES		/* check ptr to function returning void */
-typedef void (*void_func) (int a, int b);
-#else
-typedef void (*void_func) ();
-#endif
-
-#ifdef HAVE_PROTOTYPES		/* check void function result */
-void test3function (void_ptr arg1, void_func arg2)
-#else
-void test3function (arg1, arg2)
-     void_ptr arg1;
-     void_func arg2;
-#endif
-{
-  char * locptr = (char *) arg1; /* check casting to and from void * */
-  arg1 = (void *) locptr;
-  (*arg2) (1, 2);		/* check call of fcn returning void */
-}
-#endif
-
-
-/* Now we want to find out if your compiler knows what "const" means.
- * If you get an error here, undefine HAVE_CONST.
- */
-
-#define HAVE_CONST
-
-#ifdef HAVE_CONST
-static const int carray[3] = {1, 2, 3};
-
-#ifdef HAVE_PROTOTYPES
-int test4function (const int arg1)
-#else
-int test4function (arg1)
-     const int arg1;
-#endif
-{
-  return carray[arg1];
-}
-#endif
-
-
-/* If you get an error or warning about this structure definition,
- * define INCOMPLETE_TYPES_BROKEN.
- */
-
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifndef INCOMPLETE_TYPES_BROKEN
-typedef struct undefined_structure * undef_struct_ptr;
-#endif
-
-
-/* If you get an error about duplicate names,
- * define NEED_SHORT_EXTERNAL_NAMES.
- */
-
-#undef NEED_SHORT_EXTERNAL_NAMES
-
-#ifndef NEED_SHORT_EXTERNAL_NAMES
-
-int possibly_duplicate_function ()
-{
-  return 0;
-}
-
-int possibly_dupli_function ()
-{
-  return 1;
-}
-
-#endif
-
-
-
-/************************************************************************
- *  OK, that's it.  You should not have to change anything beyond this
- *  point in order to compile and execute this program.  (You might get
- *  some warnings, but you can ignore them.)
- *  When you run the program, it will make a couple more tests that it
- *  can do automatically, and then it will create jconfig.h and print out
- *  any additional suggestions it has.
- ************************************************************************
- */
-
-
-#ifdef HAVE_PROTOTYPES
-int is_char_signed (int arg)
-#else
-int is_char_signed (arg)
-     int arg;
-#endif
-{
-  if (arg == 189) {		/* expected result for unsigned char */
-    return 0;			/* type char is unsigned */
-  }
-  else if (arg != -67) {	/* expected result for signed char */
-    printf("Hmm, it seems 'char' is not eight bits wide on your machine.\n");
-    printf("I fear the JPEG software will not work at all.\n\n");
-  }
-  return 1;			/* assume char is signed otherwise */
-}
-
-
-#ifdef HAVE_PROTOTYPES
-int is_shifting_signed (long arg)
-#else
-int is_shifting_signed (arg)
-     long arg;
-#endif
-/* See whether right-shift on a long is signed or not. */
-{
-  long res = arg >> 4;
-
-  if (res == -0x7F7E80CL) {	/* expected result for signed shift */
-    return 1;			/* right shift is signed */
-  }
-  /* see if unsigned-shift hack will fix it. */
-  /* we can't just test exact value since it depends on width of long... */
-  res |= (~0L) << (32-4);
-  if (res == -0x7F7E80CL) {	/* expected result now? */
-    return 0;			/* right shift is unsigned */
-  }
-  printf("Right shift isn't acting as I expect it to.\n");
-  printf("I fear the JPEG software will not work at all.\n\n");
-  return 0;			/* try it with unsigned anyway */
-}
-
-
-#ifdef HAVE_PROTOTYPES
-int main (int argc, char ** argv)
-#else
-int main (argc, argv)
-     int argc;
-     char ** argv;
-#endif
-{
-  char signed_char_check = (char) (-67);
-  FILE *outfile;
-
-  /* Attempt to write jconfig.h */
-  if ((outfile = fopen("jconfig.h", "w")) == NULL) {
-    printf("Failed to write jconfig.h\n");
-    return 1;
-  }
-
-  /* Write out all the info */
-  fprintf(outfile, "/* jconfig.h --- generated by ckconfig.c */\n");
-  fprintf(outfile, "/* see jconfig.doc for explanations */\n\n");
-#ifdef HAVE_PROTOTYPES
-  fprintf(outfile, "#define HAVE_PROTOTYPES\n");
-#else
-  fprintf(outfile, "#undef HAVE_PROTOTYPES\n");
-#endif
-#ifdef HAVE_UNSIGNED_CHAR
-  fprintf(outfile, "#define HAVE_UNSIGNED_CHAR\n");
-#else
-  fprintf(outfile, "#undef HAVE_UNSIGNED_CHAR\n");
-#endif
-#ifdef HAVE_UNSIGNED_SHORT
-  fprintf(outfile, "#define HAVE_UNSIGNED_SHORT\n");
-#else
-  fprintf(outfile, "#undef HAVE_UNSIGNED_SHORT\n");
-#endif
-#ifdef HAVE_VOID
-  fprintf(outfile, "/* #define void char */\n");
-#else
-  fprintf(outfile, "#define void char\n");
-#endif
-#ifdef HAVE_CONST
-  fprintf(outfile, "/* #define const */\n");
-#else
-  fprintf(outfile, "#define const\n");
-#endif
-  if (is_char_signed((int) signed_char_check))
-    fprintf(outfile, "#undef CHAR_IS_UNSIGNED\n");
-  else
-    fprintf(outfile, "#define CHAR_IS_UNSIGNED\n");
-#ifdef HAVE_STDDEF_H
-  fprintf(outfile, "#define HAVE_STDDEF_H\n");
-#else
-  fprintf(outfile, "#undef HAVE_STDDEF_H\n");
-#endif
-#ifdef HAVE_STDLIB_H
-  fprintf(outfile, "#define HAVE_STDLIB_H\n");
-#else
-  fprintf(outfile, "#undef HAVE_STDLIB_H\n");
-#endif
-#ifdef NEED_BSD_STRINGS
-  fprintf(outfile, "#define NEED_BSD_STRINGS\n");
-#else
-  fprintf(outfile, "#undef NEED_BSD_STRINGS\n");
-#endif
-#ifdef NEED_SYS_TYPES_H
-  fprintf(outfile, "#define NEED_SYS_TYPES_H\n");
-#else
-  fprintf(outfile, "#undef NEED_SYS_TYPES_H\n");
-#endif
-  fprintf(outfile, "#undef NEED_FAR_POINTERS\n");
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-  fprintf(outfile, "#define NEED_SHORT_EXTERNAL_NAMES\n");
-#else
-  fprintf(outfile, "#undef NEED_SHORT_EXTERNAL_NAMES\n");
-#endif
-#ifdef INCOMPLETE_TYPES_BROKEN
-  fprintf(outfile, "#define INCOMPLETE_TYPES_BROKEN\n");
-#else
-  fprintf(outfile, "#undef INCOMPLETE_TYPES_BROKEN\n");
-#endif
-  fprintf(outfile, "\n#ifdef JPEG_INTERNALS\n\n");
-  if (is_shifting_signed(-0x7F7E80B1L))
-    fprintf(outfile, "#undef RIGHT_SHIFT_IS_UNSIGNED\n");
-  else
-    fprintf(outfile, "#define RIGHT_SHIFT_IS_UNSIGNED\n");
-  fprintf(outfile, "\n#endif /* JPEG_INTERNALS */\n");
-  fprintf(outfile, "\n#ifdef JPEG_CJPEG_DJPEG\n\n");
-  fprintf(outfile, "#define BMP_SUPPORTED		/* BMP image file format */\n");
-  fprintf(outfile, "#define GIF_SUPPORTED		/* GIF image file format */\n");
-  fprintf(outfile, "#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */\n");
-  fprintf(outfile, "#undef RLE_SUPPORTED		/* Utah RLE image file format */\n");
-  fprintf(outfile, "#define TARGA_SUPPORTED		/* Targa image file format */\n\n");
-  fprintf(outfile, "#undef TWO_FILE_COMMANDLINE	/* You may need this on non-Unix systems */\n");
-  fprintf(outfile, "#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */\n");
-  fprintf(outfile, "#undef DONT_USE_B_MODE\n");
-  fprintf(outfile, "/* #define PROGRESS_REPORT */	/* optional */\n");
-  fprintf(outfile, "\n#endif /* JPEG_CJPEG_DJPEG */\n");
-
-  /* Close the jconfig.h file */
-  fclose(outfile);
-
-  /* User report */
-  printf("Configuration check for Independent JPEG Group's software done.\n");
-  printf("\nI have written the jconfig.h file for you.\n\n");
-#ifdef HAVE_PROTOTYPES
-  printf("You should use makefile.ansi as the starting point for your Makefile.\n");
-#else
-  printf("You should use makefile.unix as the starting point for your Makefile.\n");
-#endif
-
-#ifdef NEED_SPECIAL_INCLUDE
-  printf("\nYou'll need to change jconfig.h to include the system include file\n");
-  printf("that you found type size_t in, or add a direct definition of type\n");
-  printf("size_t if that's what you used.  Just add it to the end.\n");
-#endif
-
-  return 0;
-}
diff --git a/jpeg/coderules.doc b/jpeg/coderules.doc
deleted file mode 100644
index 0ab5d9bd302e..000000000000
--- a/jpeg/coderules.doc
+++ /dev/null
@@ -1,118 +0,0 @@
-IJG JPEG LIBRARY:  CODING RULES
-
-Copyright (C) 1991-1996, Thomas G. Lane.
-This file is part of the Independent JPEG Group's software.
-For conditions of distribution and use, see the accompanying README file.
-
-
-Since numerous people will be contributing code and bug fixes, it's important
-to establish a common coding style.  The goal of using similar coding styles
-is much more important than the details of just what that style is.
-
-In general we follow the recommendations of "Recommended C Style and Coding
-Standards" revision 6.1 (Cannon et al. as modified by Spencer, Keppel and
-Brader).  This document is available in the IJG FTP archive (see
-jpeg/doc/cstyle.ms.tbl.Z, or cstyle.txt.Z for those without nroff/tbl).
-
-Block comments should be laid out thusly:
-
-/*
- *  Block comments in this style.
- */
-
-We indent statements in K&R style, e.g.,
-	if (test) {
-	  then-part;
-	} else {
-	  else-part;
-	}
-with two spaces per indentation level.  (This indentation convention is
-handled automatically by GNU Emacs and many other text editors.)
-
-Multi-word names should be written in lower case with underscores, e.g.,
-multi_word_name (not multiWordName).  Preprocessor symbols and enum constants
-are similar but upper case (MULTI_WORD_NAME).  Names should be unique within
-the first fifteen characters.  (On some older systems, global names must be
-unique within six characters.  We accommodate this without cluttering the
-source code by using macros to substitute shorter names.)
-
-We use function prototypes everywhere; we rely on automatic source code
-transformation to feed prototype-less C compilers.  Transformation is done
-by the simple and portable tool 'ansi2knr.c' (courtesy of Ghostscript).
-ansi2knr is not very bright, so it imposes a format requirement on function
-declarations: the function name MUST BEGIN IN COLUMN 1.  Thus all functions
-should be written in the following style:
-
-LOCAL(int *)
-function_name (int a, char *b)
-{
-    code...
-}
-
-Note that each function definition must begin with GLOBAL(type), LOCAL(type),
-or METHODDEF(type).  These macros expand to "static type" or just "type" as
-appropriate.  They provide a readable indication of the routine's usage and
-can readily be changed for special needs.  (For instance, special linkage
-keywords can be inserted for use in Windows DLLs.)
-
-ansi2knr does not transform method declarations (function pointers in
-structs).  We handle these with a macro JMETHOD, defined as
-	#ifdef HAVE_PROTOTYPES
-	#define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
-	#else
-	#define JMETHOD(type,methodname,arglist)  type (*methodname) ()
-	#endif
-which is used like this:
-	struct function_pointers {
-	  JMETHOD(void, init_entropy_encoder, (int somearg, jparms *jp));
-	  JMETHOD(void, term_entropy_encoder, (void));
-	};
-Note the set of parentheses surrounding the parameter list.
-
-A similar solution is used for forward and external function declarations
-(see the EXTERN and JPP macros).
-
-If the code is to work on non-ANSI compilers, we cannot rely on a prototype
-declaration to coerce actual parameters into the right types.  Therefore, use
-explicit casts on actual parameters whenever the actual parameter type is not
-identical to the formal parameter.  Beware of implicit conversions to "int".
-
-It seems there are some non-ANSI compilers in which the sizeof() operator
-is defined to return int, yet size_t is defined as long.  Needless to say,
-this is brain-damaged.  Always use the SIZEOF() macro in place of sizeof(),
-so that the result is guaranteed to be of type size_t.
-
-
-The JPEG library is intended to be used within larger programs.  Furthermore,
-we want it to be reentrant so that it can be used by applications that process
-multiple images concurrently.  The following rules support these requirements:
-
-1. Avoid direct use of file I/O, "malloc", error report printouts, etc;
-pass these through the common routines provided.
-
-2. Minimize global namespace pollution.  Functions should be declared static
-wherever possible.  (Note that our method-based calling conventions help this
-a lot: in many modules only the initialization function will ever need to be
-called directly, so only that function need be externally visible.)  All
-global function names should begin with "jpeg_", and should have an
-abbreviated name (unique in the first six characters) substituted by macro
-when NEED_SHORT_EXTERNAL_NAMES is set.
-
-3. Don't use global variables; anything that must be used in another module
-should be in the common data structures.
-
-4. Don't use static variables except for read-only constant tables.  Variables
-that should be private to a module can be placed into private structures (see
-the system architecture document, structure.doc).
-
-5. Source file names should begin with "j" for files that are part of the
-library proper; source files that are not part of the library, such as cjpeg.c
-and djpeg.c, do not begin with "j".  Keep source file names to eight
-characters (plus ".c" or ".h", etc) to make life easy for MS-DOSers.  Keep
-compression and decompression code in separate source files --- some
-applications may want only one half of the library.
-
-Note: these rules (particularly #4) are not followed religiously in the
-modules that are used in cjpeg/djpeg but are not part of the JPEG library
-proper.  Those modules are not really intended to be used in other
-applications.
diff --git a/jpeg/djpeg.c b/jpeg/djpeg.c
deleted file mode 100644
index e099e90aee35..000000000000
--- a/jpeg/djpeg.c
+++ /dev/null
@@ -1,616 +0,0 @@
-/*
- * djpeg.c
- *
- * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains a command-line user interface for the JPEG decompressor.
- * It should work on any system with Unix- or MS-DOS-style command lines.
- *
- * Two different command line styles are permitted, depending on the
- * compile-time switch TWO_FILE_COMMANDLINE:
- *	djpeg [options]  inputfile outputfile
- *	djpeg [options]  [inputfile]
- * In the second style, output is always to standard output, which you'd
- * normally redirect to a file or pipe to some other program.  Input is
- * either from a named file or from standard input (typically redirected).
- * The second style is convenient on Unix but is unhelpful on systems that
- * don't support pipes.  Also, you MUST use the first style if your system
- * doesn't do binary I/O to stdin/stdout.
- * To simplify script writing, the "-outfile" switch is provided.  The syntax
- *	djpeg [options]  -outfile outputfile  inputfile
- * works regardless of which command line style is used.
- */
-
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
-#include "jversion.h"		/* for version message */
-
-#include <ctype.h>		/* to declare isprint() */
-
-#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
-#ifdef __MWERKS__
-#include <SIOUX.h>              /* Metrowerks needs this */
-#include <console.h>		/* ... and this */
-#endif
-#ifdef THINK_C
-#include <console.h>		/* Think declares it here */
-#endif
-#endif
-
-
-/* Create the add-on message string table. */
-
-#define JMESSAGE(code,string)	string ,
-
-static const char * const cdjpeg_message_table[] = {
-#include "cderror.h"
-  NULL
-};
-
-
-/*
- * This list defines the known output image formats
- * (not all of which need be supported by a given version).
- * You can change the default output format by defining DEFAULT_FMT;
- * indeed, you had better do so if you undefine PPM_SUPPORTED.
- */
-
-typedef enum {
-	FMT_BMP,		/* BMP format (Windows flavor) */
-	FMT_GIF,		/* GIF format */
-	FMT_OS2,		/* BMP format (OS/2 flavor) */
-	FMT_PPM,		/* PPM/PGM (PBMPLUS formats) */
-	FMT_RLE,		/* RLE format */
-	FMT_TARGA,		/* Targa format */
-	FMT_TIFF		/* TIFF format */
-} IMAGE_FORMATS;
-
-#ifndef DEFAULT_FMT		/* so can override from CFLAGS in Makefile */
-#define DEFAULT_FMT	FMT_PPM
-#endif
-
-static IMAGE_FORMATS requested_fmt;
-
-
-/*
- * Argument-parsing code.
- * The switch parser is designed to be useful with DOS-style command line
- * syntax, ie, intermixed switches and file names, where only the switches
- * to the left of a given file name affect processing of that file.
- * The main program in this file doesn't actually use this capability...
- */
-
-
-static const char * progname;	/* program name for error messages */
-static char * outfilename;	/* for -outfile switch */
-
-
-LOCAL(void)
-usage (void)
-/* complain about bad command line */
-{
-  fprintf(stderr, "usage: %s [switches] ", progname);
-#ifdef TWO_FILE_COMMANDLINE
-  fprintf(stderr, "inputfile outputfile\n");
-#else
-  fprintf(stderr, "[inputfile]\n");
-#endif
-
-  fprintf(stderr, "Switches (names may be abbreviated):\n");
-  fprintf(stderr, "  -colors N      Reduce image to no more than N colors\n");
-  fprintf(stderr, "  -fast          Fast, low-quality processing\n");
-  fprintf(stderr, "  -grayscale     Force grayscale output\n");
-#ifdef IDCT_SCALING_SUPPORTED
-  fprintf(stderr, "  -scale M/N     Scale output image by fraction M/N, eg, 1/8\n");
-#endif
-#ifdef BMP_SUPPORTED
-  fprintf(stderr, "  -bmp           Select BMP output format (Windows style)%s\n",
-	  (DEFAULT_FMT == FMT_BMP ? " (default)" : ""));
-#endif
-#ifdef GIF_SUPPORTED
-  fprintf(stderr, "  -gif           Select GIF output format%s\n",
-	  (DEFAULT_FMT == FMT_GIF ? " (default)" : ""));
-#endif
-#ifdef BMP_SUPPORTED
-  fprintf(stderr, "  -os2           Select BMP output format (OS/2 style)%s\n",
-	  (DEFAULT_FMT == FMT_OS2 ? " (default)" : ""));
-#endif
-#ifdef PPM_SUPPORTED
-  fprintf(stderr, "  -pnm           Select PBMPLUS (PPM/PGM) output format%s\n",
-	  (DEFAULT_FMT == FMT_PPM ? " (default)" : ""));
-#endif
-#ifdef RLE_SUPPORTED
-  fprintf(stderr, "  -rle           Select Utah RLE output format%s\n",
-	  (DEFAULT_FMT == FMT_RLE ? " (default)" : ""));
-#endif
-#ifdef TARGA_SUPPORTED
-  fprintf(stderr, "  -targa         Select Targa output format%s\n",
-	  (DEFAULT_FMT == FMT_TARGA ? " (default)" : ""));
-#endif
-  fprintf(stderr, "Switches for advanced users:\n");
-#ifdef DCT_ISLOW_SUPPORTED
-  fprintf(stderr, "  -dct int       Use integer DCT method%s\n",
-	  (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
-#endif
-#ifdef DCT_IFAST_SUPPORTED
-  fprintf(stderr, "  -dct fast      Use fast integer DCT (less accurate)%s\n",
-	  (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
-#endif
-#ifdef DCT_FLOAT_SUPPORTED
-  fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
-	  (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
-#endif
-  fprintf(stderr, "  -dither fs     Use F-S dithering (default)\n");
-  fprintf(stderr, "  -dither none   Don't use dithering in quantization\n");
-  fprintf(stderr, "  -dither ordered  Use ordered dither (medium speed, quality)\n");
-#ifdef QUANT_2PASS_SUPPORTED
-  fprintf(stderr, "  -map FILE      Map to colors used in named image file\n");
-#endif
-  fprintf(stderr, "  -nosmooth      Don't use high-quality upsampling\n");
-#ifdef QUANT_1PASS_SUPPORTED
-  fprintf(stderr, "  -onepass       Use 1-pass quantization (fast, low quality)\n");
-#endif
-  fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
-  fprintf(stderr, "  -outfile name  Specify name for output file\n");
-  fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
-  exit(EXIT_FAILURE);
-}
-
-
-LOCAL(int)
-parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
-		int last_file_arg_seen, boolean for_real)
-/* Parse optional switches.
- * Returns argv[] index of first file-name argument (== argc if none).
- * Any file names with indexes <= last_file_arg_seen are ignored;
- * they have presumably been processed in a previous iteration.
- * (Pass 0 for last_file_arg_seen on the first or only iteration.)
- * for_real is FALSE on the first (dummy) pass; we may skip any expensive
- * processing.
- */
-{
-  int argn;
-  char * arg;
-
-  /* Set up default JPEG parameters. */
-  requested_fmt = DEFAULT_FMT;	/* set default output file format */
-  outfilename = NULL;
-  cinfo->err->trace_level = 0;
-
-  /* Scan command line options, adjust parameters */
-
-  for (argn = 1; argn < argc; argn++) {
-    arg = argv[argn];
-    if (*arg != '-') {
-      /* Not a switch, must be a file name argument */
-      if (argn <= last_file_arg_seen) {
-	outfilename = NULL;	/* -outfile applies to just one input file */
-	continue;		/* ignore this name if previously processed */
-      }
-      break;			/* else done parsing switches */
-    }
-    arg++;			/* advance past switch marker character */
-
-    if (keymatch(arg, "bmp", 1)) {
-      /* BMP output format. */
-      requested_fmt = FMT_BMP;
-
-    } else if (keymatch(arg, "colors", 1) || keymatch(arg, "colours", 1) ||
-	       keymatch(arg, "quantize", 1) || keymatch(arg, "quantise", 1)) {
-      /* Do color quantization. */
-      int val;
-
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%d", &val) != 1)
-	usage();
-      cinfo->desired_number_of_colors = val;
-      cinfo->quantize_colors = TRUE;
-
-    } else if (keymatch(arg, "dct", 2)) {
-      /* Select IDCT algorithm. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (keymatch(argv[argn], "int", 1)) {
-	cinfo->dct_method = JDCT_ISLOW;
-      } else if (keymatch(argv[argn], "fast", 2)) {
-	cinfo->dct_method = JDCT_IFAST;
-      } else if (keymatch(argv[argn], "float", 2)) {
-	cinfo->dct_method = JDCT_FLOAT;
-      } else
-	usage();
-
-    } else if (keymatch(arg, "dither", 2)) {
-      /* Select dithering algorithm. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (keymatch(argv[argn], "fs", 2)) {
-	cinfo->dither_mode = JDITHER_FS;
-      } else if (keymatch(argv[argn], "none", 2)) {
-	cinfo->dither_mode = JDITHER_NONE;
-      } else if (keymatch(argv[argn], "ordered", 2)) {
-	cinfo->dither_mode = JDITHER_ORDERED;
-      } else
-	usage();
-
-    } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
-      /* Enable debug printouts. */
-      /* On first -d, print version identification */
-      static boolean printed_version = FALSE;
-
-      if (! printed_version) {
-	fprintf(stderr, "Independent JPEG Group's DJPEG, version %s\n%s\n",
-		JVERSION, JCOPYRIGHT);
-	printed_version = TRUE;
-      }
-      cinfo->err->trace_level++;
-
-    } else if (keymatch(arg, "fast", 1)) {
-      /* Select recommended processing options for quick-and-dirty output. */
-      cinfo->two_pass_quantize = FALSE;
-      cinfo->dither_mode = JDITHER_ORDERED;
-      if (! cinfo->quantize_colors) /* don't override an earlier -colors */
-	cinfo->desired_number_of_colors = 216;
-      cinfo->dct_method = JDCT_FASTEST;
-      cinfo->do_fancy_upsampling = FALSE;
-
-    } else if (keymatch(arg, "gif", 1)) {
-      /* GIF output format. */
-      requested_fmt = FMT_GIF;
-
-    } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
-      /* Force monochrome output. */
-      cinfo->out_color_space = JCS_GRAYSCALE;
-
-    } else if (keymatch(arg, "map", 3)) {
-      /* Quantize to a color map taken from an input file. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (for_real) {		/* too expensive to do twice! */
-#ifdef QUANT_2PASS_SUPPORTED	/* otherwise can't quantize to supplied map */
-	FILE * mapfile;
-
-	if ((mapfile = fopen(argv[argn], READ_BINARY)) == NULL) {
-	  fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]);
-	  exit(EXIT_FAILURE);
-	}
-	read_color_map(cinfo, mapfile);
-	fclose(mapfile);
-	cinfo->quantize_colors = TRUE;
-#else
-	ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-      }
-
-    } else if (keymatch(arg, "maxmemory", 3)) {
-      /* Maximum memory in Kb (or Mb with 'm'). */
-      long lval;
-      char ch = 'x';
-
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
-	usage();
-      if (ch == 'm' || ch == 'M')
-	lval *= 1000L;
-      cinfo->mem->max_memory_to_use = lval * 1000L;
-
-    } else if (keymatch(arg, "nosmooth", 3)) {
-      /* Suppress fancy upsampling */
-      cinfo->do_fancy_upsampling = FALSE;
-
-    } else if (keymatch(arg, "onepass", 3)) {
-      /* Use fast one-pass quantization. */
-      cinfo->two_pass_quantize = FALSE;
-
-    } else if (keymatch(arg, "os2", 3)) {
-      /* BMP output format (OS/2 flavor). */
-      requested_fmt = FMT_OS2;
-
-    } else if (keymatch(arg, "outfile", 4)) {
-      /* Set output file name. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      outfilename = argv[argn];	/* save it away for later use */
-
-    } else if (keymatch(arg, "pnm", 1) || keymatch(arg, "ppm", 1)) {
-      /* PPM/PGM output format. */
-      requested_fmt = FMT_PPM;
-
-    } else if (keymatch(arg, "rle", 1)) {
-      /* RLE output format. */
-      requested_fmt = FMT_RLE;
-
-    } else if (keymatch(arg, "scale", 1)) {
-      /* Scale the output image by a fraction M/N. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%d/%d",
-		 &cinfo->scale_num, &cinfo->scale_denom) != 2)
-	usage();
-
-    } else if (keymatch(arg, "targa", 1)) {
-      /* Targa output format. */
-      requested_fmt = FMT_TARGA;
-
-    } else {
-      usage();			/* bogus switch */
-    }
-  }
-
-  return argn;			/* return index of next arg (file name) */
-}
-
-
-/*
- * Marker processor for COM and interesting APPn markers.
- * This replaces the library's built-in processor, which just skips the marker.
- * We want to print out the marker as text, to the extent possible.
- * Note this code relies on a non-suspending data source.
- */
-
-LOCAL(unsigned int)
-jpeg_getc (j_decompress_ptr cinfo)
-/* Read next byte */
-{
-  struct jpeg_source_mgr * datasrc = cinfo->src;
-
-  if (datasrc->bytes_in_buffer == 0) {
-    if (! (*datasrc->fill_input_buffer) (cinfo))
-      ERREXIT(cinfo, JERR_CANT_SUSPEND);
-  }
-  datasrc->bytes_in_buffer--;
-  return GETJOCTET(*datasrc->next_input_byte++);
-}
-
-
-METHODDEF(boolean)
-print_text_marker (j_decompress_ptr cinfo)
-{
-  boolean traceit = (cinfo->err->trace_level >= 1);
-  INT32 length;
-  unsigned int ch;
-  unsigned int lastch = 0;
-
-  length = jpeg_getc(cinfo) << 8;
-  length += jpeg_getc(cinfo);
-  length -= 2;			/* discount the length word itself */
-
-  if (traceit) {
-    if (cinfo->unread_marker == JPEG_COM)
-      fprintf(stderr, "Comment, length %ld:\n", (long) length);
-    else			/* assume it is an APPn otherwise */
-      fprintf(stderr, "APP%d, length %ld:\n",
-	      cinfo->unread_marker - JPEG_APP0, (long) length);
-  }
-
-  while (--length >= 0) {
-    ch = jpeg_getc(cinfo);
-    if (traceit) {
-      /* Emit the character in a readable form.
-       * Nonprintables are converted to \nnn form,
-       * while \ is converted to \\.
-       * Newlines in CR, CR/LF, or LF form will be printed as one newline.
-       */
-      if (ch == '\r') {
-	fprintf(stderr, "\n");
-      } else if (ch == '\n') {
-	if (lastch != '\r')
-	  fprintf(stderr, "\n");
-      } else if (ch == '\\') {
-	fprintf(stderr, "\\\\");
-      } else if (isprint(ch)) {
-	putc(ch, stderr);
-      } else {
-	fprintf(stderr, "\\%03o", ch);
-      }
-      lastch = ch;
-    }
-  }
-
-  if (traceit)
-    fprintf(stderr, "\n");
-
-  return TRUE;
-}
-
-
-/*
- * The main program.
- */
-
-int
-main (int argc, char **argv)
-{
-  struct jpeg_decompress_struct cinfo;
-  struct jpeg_error_mgr jerr;
-#ifdef PROGRESS_REPORT
-  struct cdjpeg_progress_mgr progress;
-#endif
-  int file_index;
-  djpeg_dest_ptr dest_mgr = NULL;
-  FILE * input_file;
-  FILE * output_file;
-  JDIMENSION num_scanlines;
-
-  /* On Mac, fetch a command line. */
-#ifdef USE_CCOMMAND
-  argc = ccommand(&argv);
-#endif
-
-  progname = argv[0];
-  if (progname == NULL || progname[0] == 0)
-    progname = "djpeg";		/* in case C library doesn't provide it */
-
-  /* Initialize the JPEG decompression object with default error handling. */
-  cinfo.err = jpeg_std_error(&jerr);
-  jpeg_create_decompress(&cinfo);
-  /* Add some application-specific error messages (from cderror.h) */
-  jerr.addon_message_table = cdjpeg_message_table;
-  jerr.first_addon_message = JMSG_FIRSTADDONCODE;
-  jerr.last_addon_message = JMSG_LASTADDONCODE;
-
-  /* Insert custom marker processor for COM and APP12.
-   * APP12 is used by some digital camera makers for textual info,
-   * so we provide the ability to display it as text.
-   * If you like, additional APPn marker types can be selected for display,
-   * but don't try to override APP0 or APP14 this way (see libjpeg.doc).
-   */
-  jpeg_set_marker_processor(&cinfo, JPEG_COM, print_text_marker);
-  jpeg_set_marker_processor(&cinfo, JPEG_APP0+12, print_text_marker);
-
-  /* Now safe to enable signal catcher. */
-#ifdef NEED_SIGNAL_CATCHER
-  enable_signal_catcher((j_common_ptr) &cinfo);
-#endif
-
-  /* Scan command line to find file names. */
-  /* It is convenient to use just one switch-parsing routine, but the switch
-   * values read here are ignored; we will rescan the switches after opening
-   * the input file.
-   * (Exception: tracing level set here controls verbosity for COM markers
-   * found during jpeg_read_header...)
-   */
-
-  file_index = parse_switches(&cinfo, argc, argv, 0, FALSE);
-
-#ifdef TWO_FILE_COMMANDLINE
-  /* Must have either -outfile switch or explicit output file name */
-  if (outfilename == NULL) {
-    if (file_index != argc-2) {
-      fprintf(stderr, "%s: must name one input and one output file\n",
-	      progname);
-      usage();
-    }
-    outfilename = argv[file_index+1];
-  } else {
-    if (file_index != argc-1) {
-      fprintf(stderr, "%s: must name one input and one output file\n",
-	      progname);
-      usage();
-    }
-  }
-#else
-  /* Unix style: expect zero or one file name */
-  if (file_index < argc-1) {
-    fprintf(stderr, "%s: only one input file\n", progname);
-    usage();
-  }
-#endif /* TWO_FILE_COMMANDLINE */
-
-  /* Open the input file. */
-  if (file_index < argc) {
-    if ((input_file = fopen(argv[file_index], READ_BINARY)) == NULL) {
-      fprintf(stderr, "%s: can't open %s\n", progname, argv[file_index]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    /* default input file is stdin */
-    input_file = read_stdin();
-  }
-
-  /* Open the output file. */
-  if (outfilename != NULL) {
-    if ((output_file = fopen(outfilename, WRITE_BINARY)) == NULL) {
-      fprintf(stderr, "%s: can't open %s\n", progname, outfilename);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    /* default output file is stdout */
-    output_file = write_stdout();
-  }
-
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr) &cinfo, &progress);
-#endif
-
-  /* Specify data source for decompression */
-  jpeg_stdio_src(&cinfo, input_file);
-
-  /* Read file header, set default decompression parameters */
-  (void) jpeg_read_header(&cinfo, TRUE);
-
-  /* Adjust default decompression parameters by re-parsing the options */
-  file_index = parse_switches(&cinfo, argc, argv, 0, TRUE);
-
-  /* Initialize the output module now to let it override any crucial
-   * option settings (for instance, GIF wants to force color quantization).
-   */
-  switch (requested_fmt) {
-#ifdef BMP_SUPPORTED
-  case FMT_BMP:
-    dest_mgr = jinit_write_bmp(&cinfo, FALSE);
-    break;
-  case FMT_OS2:
-    dest_mgr = jinit_write_bmp(&cinfo, TRUE);
-    break;
-#endif
-#ifdef GIF_SUPPORTED
-  case FMT_GIF:
-    dest_mgr = jinit_write_gif(&cinfo);
-    break;
-#endif
-#ifdef PPM_SUPPORTED
-  case FMT_PPM:
-    dest_mgr = jinit_write_ppm(&cinfo);
-    break;
-#endif
-#ifdef RLE_SUPPORTED
-  case FMT_RLE:
-    dest_mgr = jinit_write_rle(&cinfo);
-    break;
-#endif
-#ifdef TARGA_SUPPORTED
-  case FMT_TARGA:
-    dest_mgr = jinit_write_targa(&cinfo);
-    break;
-#endif
-  default:
-    ERREXIT(&cinfo, JERR_UNSUPPORTED_FORMAT);
-    break;
-  }
-  dest_mgr->output_file = output_file;
-
-  /* Start decompressor */
-  (void) jpeg_start_decompress(&cinfo);
-
-  /* Write output file header */
-  (*dest_mgr->start_output) (&cinfo, dest_mgr);
-
-  /* Process data */
-  while (cinfo.output_scanline < cinfo.output_height) {
-    num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
-					dest_mgr->buffer_height);
-    (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
-  }
-
-#ifdef PROGRESS_REPORT
-  /* Hack: count final pass as done in case finish_output does an extra pass.
-   * The library won't have updated completed_passes.
-   */
-  progress.pub.completed_passes = progress.pub.total_passes;
-#endif
-
-  /* Finish decompression and release memory.
-   * I must do it in this order because output module has allocated memory
-   * of lifespan JPOOL_IMAGE; it needs to finish before releasing memory.
-   */
-  (*dest_mgr->finish_output) (&cinfo, dest_mgr);
-  (void) jpeg_finish_decompress(&cinfo);
-  jpeg_destroy_decompress(&cinfo);
-
-  /* Close files, if we opened them */
-  if (input_file != stdin)
-    fclose(input_file);
-  if (output_file != stdout)
-    fclose(output_file);
-
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr) &cinfo);
-#endif
-
-  /* All done. */
-  exit(jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
-  return 0;			/* suppress no-return-value warnings */
-}
diff --git a/jpeg/example.c b/jpeg/example.c
deleted file mode 100644
index 7fc354f04d9d..000000000000
--- a/jpeg/example.c
+++ /dev/null
@@ -1,433 +0,0 @@
-/*
- * example.c
- *
- * This file illustrates how to use the IJG code as a subroutine library
- * to read or write JPEG image files.  You should look at this code in
- * conjunction with the documentation file libjpeg.doc.
- *
- * This code will not do anything useful as-is, but it may be helpful as a
- * skeleton for constructing routines that call the JPEG library.  
- *
- * We present these routines in the same coding style used in the JPEG code
- * (ANSI function definitions, etc); but you are of course free to code your
- * routines in a different style if you prefer.
- */
-
-#include <stdio.h>
-
-/*
- * Include file for users of JPEG library.
- * You will need to have included system headers that define at least
- * the typedefs FILE and size_t before you can include jpeglib.h.
- * (stdio.h is sufficient on ANSI-conforming systems.)
- * You may also wish to include "jerror.h".
- */
-
-#include "jpeglib.h"
-
-/*
- * <setjmp.h> is used for the optional error recovery mechanism shown in
- * the second part of the example.
- */
-
-#include <setjmp.h>
-
-
-
-/******************** JPEG COMPRESSION SAMPLE INTERFACE *******************/
-
-/* This half of the example shows how to feed data into the JPEG compressor.
- * We present a minimal version that does not worry about refinements such
- * as error recovery (the JPEG code will just exit() if it gets an error).
- */
-
-
-/*
- * IMAGE DATA FORMATS:
- *
- * The standard input image format is a rectangular array of pixels, with
- * each pixel having the same number of "component" values (color channels).
- * Each pixel row is an array of JSAMPLEs (which typically are unsigned chars).
- * If you are working with color data, then the color values for each pixel
- * must be adjacent in the row; for example, R,G,B,R,G,B,R,G,B,... for 24-bit
- * RGB color.
- *
- * For this example, we'll assume that this data structure matches the way
- * our application has stored the image in memory, so we can just pass a
- * pointer to our image buffer.  In particular, let's say that the image is
- * RGB color and is described by:
- */
-
-extern JSAMPLE * image_buffer;	/* Points to large array of R,G,B-order data */
-extern int image_height;	/* Number of rows in image */
-extern int image_width;		/* Number of columns in image */
-
-
-/*
- * Sample routine for JPEG compression.  We assume that the target file name
- * and a compression quality factor are passed in.
- */
-
-GLOBAL(void)
-write_JPEG_file (char * filename, int quality)
-{
-  /* This struct contains the JPEG compression parameters and pointers to
-   * working space (which is allocated as needed by the JPEG library).
-   * It is possible to have several such structures, representing multiple
-   * compression/decompression processes, in existence at once.  We refer
-   * to any one struct (and its associated working data) as a "JPEG object".
-   */
-  struct jpeg_compress_struct cinfo;
-  /* This struct represents a JPEG error handler.  It is declared separately
-   * because applications often want to supply a specialized error handler
-   * (see the second half of this file for an example).  But here we just
-   * take the easy way out and use the standard error handler, which will
-   * print a message on stderr and call exit() if compression fails.
-   * Note that this struct must live as long as the main JPEG parameter
-   * struct, to avoid dangling-pointer problems.
-   */
-  struct jpeg_error_mgr jerr;
-  /* More stuff */
-  FILE * outfile;		/* target file */
-  JSAMPROW row_pointer[1];	/* pointer to JSAMPLE row[s] */
-  int row_stride;		/* physical row width in image buffer */
-
-  /* Step 1: allocate and initialize JPEG compression object */
-
-  /* We have to set up the error handler first, in case the initialization
-   * step fails.  (Unlikely, but it could happen if you are out of memory.)
-   * This routine fills in the contents of struct jerr, and returns jerr's
-   * address which we place into the link field in cinfo.
-   */
-  cinfo.err = jpeg_std_error(&jerr);
-  /* Now we can initialize the JPEG compression object. */
-  jpeg_create_compress(&cinfo);
-
-  /* Step 2: specify data destination (eg, a file) */
-  /* Note: steps 2 and 3 can be done in either order. */
-
-  /* Here we use the library-supplied code to send compressed data to a
-   * stdio stream.  You can also write your own code to do something else.
-   * VERY IMPORTANT: use "b" option to fopen() if you are on a machine that
-   * requires it in order to write binary files.
-   */
-  if ((outfile = fopen(filename, "wb")) == NULL) {
-    fprintf(stderr, "can't open %s\n", filename);
-    exit(1);
-  }
-  jpeg_stdio_dest(&cinfo, outfile);
-
-  /* Step 3: set parameters for compression */
-
-  /* First we supply a description of the input image.
-   * Four fields of the cinfo struct must be filled in:
-   */
-  cinfo.image_width = image_width; 	/* image width and height, in pixels */
-  cinfo.image_height = image_height;
-  cinfo.input_components = 3;		/* # of color components per pixel */
-  cinfo.in_color_space = JCS_RGB; 	/* colorspace of input image */
-  /* Now use the library's routine to set default compression parameters.
-   * (You must set at least cinfo.in_color_space before calling this,
-   * since the defaults depend on the source color space.)
-   */
-  jpeg_set_defaults(&cinfo);
-  /* Now you can set any non-default parameters you wish to.
-   * Here we just illustrate the use of quality (quantization table) scaling:
-   */
-  jpeg_set_quality(&cinfo, quality, TRUE /* limit to baseline-JPEG values */);
-
-  /* Step 4: Start compressor */
-
-  /* TRUE ensures that we will write a complete interchange-JPEG file.
-   * Pass TRUE unless you are very sure of what you're doing.
-   */
-  jpeg_start_compress(&cinfo, TRUE);
-
-  /* Step 5: while (scan lines remain to be written) */
-  /*           jpeg_write_scanlines(...); */
-
-  /* Here we use the library's state variable cinfo.next_scanline as the
-   * loop counter, so that we don't have to keep track ourselves.
-   * To keep things simple, we pass one scanline per call; you can pass
-   * more if you wish, though.
-   */
-  row_stride = image_width * 3;	/* JSAMPLEs per row in image_buffer */
-
-  while (cinfo.next_scanline < cinfo.image_height) {
-    /* jpeg_write_scanlines expects an array of pointers to scanlines.
-     * Here the array is only one element long, but you could pass
-     * more than one scanline at a time if that's more convenient.
-     */
-    row_pointer[0] = & image_buffer[cinfo.next_scanline * row_stride];
-    (void) jpeg_write_scanlines(&cinfo, row_pointer, 1);
-  }
-
-  /* Step 6: Finish compression */
-
-  jpeg_finish_compress(&cinfo);
-  /* After finish_compress, we can close the output file. */
-  fclose(outfile);
-
-  /* Step 7: release JPEG compression object */
-
-  /* This is an important step since it will release a good deal of memory. */
-  jpeg_destroy_compress(&cinfo);
-
-  /* And we're done! */
-}
-
-
-/*
- * SOME FINE POINTS:
- *
- * In the above loop, we ignored the return value of jpeg_write_scanlines,
- * which is the number of scanlines actually written.  We could get away
- * with this because we were only relying on the value of cinfo.next_scanline,
- * which will be incremented correctly.  If you maintain additional loop
- * variables then you should be careful to increment them properly.
- * Actually, for output to a stdio stream you needn't worry, because
- * then jpeg_write_scanlines will write all the lines passed (or else exit
- * with a fatal error).  Partial writes can only occur if you use a data
- * destination module that can demand suspension of the compressor.
- * (If you don't know what that's for, you don't need it.)
- *
- * If the compressor requires full-image buffers (for entropy-coding
- * optimization or a multi-scan JPEG file), it will create temporary
- * files for anything that doesn't fit within the maximum-memory setting.
- * (Note that temp files are NOT needed if you use the default parameters.)
- * On some systems you may need to set up a signal handler to ensure that
- * temporary files are deleted if the program is interrupted.  See libjpeg.doc.
- *
- * Scanlines MUST be supplied in top-to-bottom order if you want your JPEG
- * files to be compatible with everyone else's.  If you cannot readily read
- * your data in that order, you'll need an intermediate array to hold the
- * image.  See rdtarga.c or rdbmp.c for examples of handling bottom-to-top
- * source data using the JPEG code's internal virtual-array mechanisms.
- */
-
-
-
-/******************** JPEG DECOMPRESSION SAMPLE INTERFACE *******************/
-
-/* This half of the example shows how to read data from the JPEG decompressor.
- * It's a bit more refined than the above, in that we show:
- *   (a) how to modify the JPEG library's standard error-reporting behavior;
- *   (b) how to allocate workspace using the library's memory manager.
- *
- * Just to make this example a little different from the first one, we'll
- * assume that we do not intend to put the whole image into an in-memory
- * buffer, but to send it line-by-line someplace else.  We need a one-
- * scanline-high JSAMPLE array as a work buffer, and we will let the JPEG
- * memory manager allocate it for us.  This approach is actually quite useful
- * because we don't need to remember to deallocate the buffer separately: it
- * will go away automatically when the JPEG object is cleaned up.
- */
-
-
-/*
- * ERROR HANDLING:
- *
- * The JPEG library's standard error handler (jerror.c) is divided into
- * several "methods" which you can override individually.  This lets you
- * adjust the behavior without duplicating a lot of code, which you might
- * have to update with each future release.
- *
- * Our example here shows how to override the "error_exit" method so that
- * control is returned to the library's caller when a fatal error occurs,
- * rather than calling exit() as the standard error_exit method does.
- *
- * We use C's setjmp/longjmp facility to return control.  This means that the
- * routine which calls the JPEG library must first execute a setjmp() call to
- * establish the return point.  We want the replacement error_exit to do a
- * longjmp().  But we need to make the setjmp buffer accessible to the
- * error_exit routine.  To do this, we make a private extension of the
- * standard JPEG error handler object.  (If we were using C++, we'd say we
- * were making a subclass of the regular error handler.)
- *
- * Here's the extended error handler struct:
- */
-
-struct my_error_mgr {
-  struct jpeg_error_mgr pub;	/* "public" fields */
-
-  jmp_buf setjmp_buffer;	/* for return to caller */
-};
-
-typedef struct my_error_mgr * my_error_ptr;
-
-/*
- * Here's the routine that will replace the standard error_exit method:
- */
-
-METHODDEF(void)
-my_error_exit (j_common_ptr cinfo)
-{
-  /* cinfo->err really points to a my_error_mgr struct, so coerce pointer */
-  my_error_ptr myerr = (my_error_ptr) cinfo->err;
-
-  /* Always display the message. */
-  /* We could postpone this until after returning, if we chose. */
-  (*cinfo->err->output_message) (cinfo);
-
-  /* Return control to the setjmp point */
-  longjmp(myerr->setjmp_buffer, 1);
-}
-
-
-/*
- * Sample routine for JPEG decompression.  We assume that the source file name
- * is passed in.  We want to return 1 on success, 0 on error.
- */
-
-
-GLOBAL(int)
-read_JPEG_file (char * filename)
-{
-  /* This struct contains the JPEG decompression parameters and pointers to
-   * working space (which is allocated as needed by the JPEG library).
-   */
-  struct jpeg_decompress_struct cinfo;
-  /* We use our private extension JPEG error handler.
-   * Note that this struct must live as long as the main JPEG parameter
-   * struct, to avoid dangling-pointer problems.
-   */
-  struct my_error_mgr jerr;
-  /* More stuff */
-  FILE * infile;		/* source file */
-  JSAMPARRAY buffer;		/* Output row buffer */
-  int row_stride;		/* physical row width in output buffer */
-
-  /* In this example we want to open the input file before doing anything else,
-   * so that the setjmp() error recovery below can assume the file is open.
-   * VERY IMPORTANT: use "b" option to fopen() if you are on a machine that
-   * requires it in order to read binary files.
-   */
-
-  if ((infile = fopen(filename, "rb")) == NULL) {
-    fprintf(stderr, "can't open %s\n", filename);
-    return 0;
-  }
-
-  /* Step 1: allocate and initialize JPEG decompression object */
-
-  /* We set up the normal JPEG error routines, then override error_exit. */
-  cinfo.err = jpeg_std_error(&jerr.pub);
-  jerr.pub.error_exit = my_error_exit;
-  /* Establish the setjmp return context for my_error_exit to use. */
-  if (setjmp(jerr.setjmp_buffer)) {
-    /* If we get here, the JPEG code has signaled an error.
-     * We need to clean up the JPEG object, close the input file, and return.
-     */
-    jpeg_destroy_decompress(&cinfo);
-    fclose(infile);
-    return 0;
-  }
-  /* Now we can initialize the JPEG decompression object. */
-  jpeg_create_decompress(&cinfo);
-
-  /* Step 2: specify data source (eg, a file) */
-
-  jpeg_stdio_src(&cinfo, infile);
-
-  /* Step 3: read file parameters with jpeg_read_header() */
-
-  (void) jpeg_read_header(&cinfo, TRUE);
-  /* We can ignore the return value from jpeg_read_header since
-   *   (a) suspension is not possible with the stdio data source, and
-   *   (b) we passed TRUE to reject a tables-only JPEG file as an error.
-   * See libjpeg.doc for more info.
-   */
-
-  /* Step 4: set parameters for decompression */
-
-  /* In this example, we don't need to change any of the defaults set by
-   * jpeg_read_header(), so we do nothing here.
-   */
-
-  /* Step 5: Start decompressor */
-
-  (void) jpeg_start_decompress(&cinfo);
-  /* We can ignore the return value since suspension is not possible
-   * with the stdio data source.
-   */
-
-  /* We may need to do some setup of our own at this point before reading
-   * the data.  After jpeg_start_decompress() we have the correct scaled
-   * output image dimensions available, as well as the output colormap
-   * if we asked for color quantization.
-   * In this example, we need to make an output work buffer of the right size.
-   */ 
-  /* JSAMPLEs per row in output buffer */
-  row_stride = cinfo.output_width * cinfo.output_components;
-  /* Make a one-row-high sample array that will go away when done with image */
-  buffer = (*cinfo.mem->alloc_sarray)
-		((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1);
-
-  /* Step 6: while (scan lines remain to be read) */
-  /*           jpeg_read_scanlines(...); */
-
-  /* Here we use the library's state variable cinfo.output_scanline as the
-   * loop counter, so that we don't have to keep track ourselves.
-   */
-  while (cinfo.output_scanline < cinfo.output_height) {
-    /* jpeg_read_scanlines expects an array of pointers to scanlines.
-     * Here the array is only one element long, but you could ask for
-     * more than one scanline at a time if that's more convenient.
-     */
-    (void) jpeg_read_scanlines(&cinfo, buffer, 1);
-    /* Assume put_scanline_someplace wants a pointer and sample count. */
-    put_scanline_someplace(buffer[0], row_stride);
-  }
-
-  /* Step 7: Finish decompression */
-
-  (void) jpeg_finish_decompress(&cinfo);
-  /* We can ignore the return value since suspension is not possible
-   * with the stdio data source.
-   */
-
-  /* Step 8: Release JPEG decompression object */
-
-  /* This is an important step since it will release a good deal of memory. */
-  jpeg_destroy_decompress(&cinfo);
-
-  /* After finish_decompress, we can close the input file.
-   * Here we postpone it until after no more JPEG errors are possible,
-   * so as to simplify the setjmp error logic above.  (Actually, I don't
-   * think that jpeg_destroy can do an error exit, but why assume anything...)
-   */
-  fclose(infile);
-
-  /* At this point you may want to check to see whether any corrupt-data
-   * warnings occurred (test whether jerr.pub.num_warnings is nonzero).
-   */
-
-  /* And we're done! */
-  return 1;
-}
-
-
-/*
- * SOME FINE POINTS:
- *
- * In the above code, we ignored the return value of jpeg_read_scanlines,
- * which is the number of scanlines actually read.  We could get away with
- * this because we asked for only one line at a time and we weren't using
- * a suspending data source.  See libjpeg.doc for more info.
- *
- * We cheated a bit by calling alloc_sarray() after jpeg_start_decompress();
- * we should have done it beforehand to ensure that the space would be
- * counted against the JPEG max_memory setting.  In some systems the above
- * code would risk an out-of-memory error.  However, in general we don't
- * know the output image dimensions before jpeg_start_decompress(), unless we
- * call jpeg_calc_output_dimensions().  See libjpeg.doc for more about this.
- *
- * Scanlines are returned in the same order as they appear in the JPEG file,
- * which is standardly top-to-bottom.  If you must emit data bottom-to-top,
- * you can use one of the virtual arrays provided by the JPEG memory manager
- * to invert the data.  See wrbmp.c for an example.
- *
- * As with compression, some operating modes may require temporary files.
- * On some systems you may need to set up a signal handler to ensure that
- * temporary files are deleted if the program is interrupted.  See libjpeg.doc.
- */
diff --git a/jpeg/filelist.doc b/jpeg/filelist.doc
deleted file mode 100644
index e14982ca55b9..000000000000
--- a/jpeg/filelist.doc
+++ /dev/null
@@ -1,210 +0,0 @@
-IJG JPEG LIBRARY:  FILE LIST
-
-Copyright (C) 1994-1998, Thomas G. Lane.
-This file is part of the Independent JPEG Group's software.
-For conditions of distribution and use, see the accompanying README file.
-
-
-Here is a road map to the files in the IJG JPEG distribution.  The
-distribution includes the JPEG library proper, plus two application
-programs ("cjpeg" and "djpeg") which use the library to convert JPEG
-files to and from some other popular image formats.  A third application
-"jpegtran" uses the library to do lossless conversion between different
-variants of JPEG.  There are also two stand-alone applications,
-"rdjpgcom" and "wrjpgcom".
-
-
-THE JPEG LIBRARY
-================
-
-Include files:
-
-jpeglib.h	JPEG library's exported data and function declarations.
-jconfig.h	Configuration declarations.  Note: this file is not present
-		in the distribution; it is generated during installation.
-jmorecfg.h	Additional configuration declarations; need not be changed
-		for a standard installation.
-jerror.h	Declares JPEG library's error and trace message codes.
-jinclude.h	Central include file used by all IJG .c files to reference
-		system include files.
-jpegint.h	JPEG library's internal data structures.
-jchuff.h	Private declarations for Huffman encoder modules.
-jdhuff.h	Private declarations for Huffman decoder modules.
-jdct.h		Private declarations for forward & reverse DCT subsystems.
-jmemsys.h	Private declarations for memory management subsystem.
-jversion.h	Version information.
-
-Applications using the library should include jpeglib.h (which in turn
-includes jconfig.h and jmorecfg.h).  Optionally, jerror.h may be included
-if the application needs to reference individual JPEG error codes.  The
-other include files are intended for internal use and would not normally
-be included by an application program.  (cjpeg/djpeg/etc do use jinclude.h,
-since its function is to improve portability of the whole IJG distribution.
-Most other applications will directly include the system include files they
-want, and hence won't need jinclude.h.)
-
-
-C source code files:
-
-These files contain most of the functions intended to be called directly by
-an application program:
-
-jcapimin.c	Application program interface: core routines for compression.
-jcapistd.c	Application program interface: standard compression.
-jdapimin.c	Application program interface: core routines for decompression.
-jdapistd.c	Application program interface: standard decompression.
-jcomapi.c	Application program interface routines common to compression
-		and decompression.
-jcparam.c	Compression parameter setting helper routines.
-jctrans.c	API and library routines for transcoding compression.
-jdtrans.c	API and library routines for transcoding decompression.
-
-Compression side of the library:
-
-jcinit.c	Initialization: determines which other modules to use.
-jcmaster.c	Master control: setup and inter-pass sequencing logic.
-jcmainct.c	Main buffer controller (preprocessor => JPEG compressor).
-jcprepct.c	Preprocessor buffer controller.
-jccoefct.c	Buffer controller for DCT coefficient buffer.
-jccolor.c	Color space conversion.
-jcsample.c	Downsampling.
-jcdctmgr.c	DCT manager (DCT implementation selection & control).
-jfdctint.c	Forward DCT using slow-but-accurate integer method.
-jfdctfst.c	Forward DCT using faster, less accurate integer method.
-jfdctflt.c	Forward DCT using floating-point arithmetic.
-jchuff.c	Huffman entropy coding for sequential JPEG.
-jcphuff.c	Huffman entropy coding for progressive JPEG.
-jcmarker.c	JPEG marker writing.
-jdatadst.c	Data destination manager for stdio output.
-
-Decompression side of the library:
-
-jdmaster.c	Master control: determines which other modules to use.
-jdinput.c	Input controller: controls input processing modules.
-jdmainct.c	Main buffer controller (JPEG decompressor => postprocessor).
-jdcoefct.c	Buffer controller for DCT coefficient buffer.
-jdpostct.c	Postprocessor buffer controller.
-jdmarker.c	JPEG marker reading.
-jdhuff.c	Huffman entropy decoding for sequential JPEG.
-jdphuff.c	Huffman entropy decoding for progressive JPEG.
-jddctmgr.c	IDCT manager (IDCT implementation selection & control).
-jidctint.c	Inverse DCT using slow-but-accurate integer method.
-jidctfst.c	Inverse DCT using faster, less accurate integer method.
-jidctflt.c	Inverse DCT using floating-point arithmetic.
-jidctred.c	Inverse DCTs with reduced-size outputs.
-jdsample.c	Upsampling.
-jdcolor.c	Color space conversion.
-jdmerge.c	Merged upsampling/color conversion (faster, lower quality).
-jquant1.c	One-pass color quantization using a fixed-spacing colormap.
-jquant2.c	Two-pass color quantization using a custom-generated colormap.
-		Also handles one-pass quantization to an externally given map.
-jdatasrc.c	Data source manager for stdio input.
-
-Support files for both compression and decompression:
-
-jerror.c	Standard error handling routines (application replaceable).
-jmemmgr.c	System-independent (more or less) memory management code.
-jutils.c	Miscellaneous utility routines.
-
-jmemmgr.c relies on a system-dependent memory management module.  The IJG
-distribution includes the following implementations of the system-dependent
-module:
-
-jmemnobs.c	"No backing store": assumes adequate virtual memory exists.
-jmemansi.c	Makes temporary files with ANSI-standard routine tmpfile().
-jmemname.c	Makes temporary files with program-generated file names.
-jmemdos.c	Custom implementation for MS-DOS (16-bit environment only):
-		can use extended and expanded memory as well as temp files.
-jmemmac.c	Custom implementation for Apple Macintosh.
-
-Exactly one of the system-dependent modules should be configured into an
-installed JPEG library (see install.doc for hints about which one to use).
-On unusual systems you may find it worthwhile to make a special
-system-dependent memory manager.
-
-
-Non-C source code files:
-
-jmemdosa.asm	80x86 assembly code support for jmemdos.c; used only in
-		MS-DOS-specific configurations of the JPEG library.
-
-
-CJPEG/DJPEG/JPEGTRAN
-====================
-
-Include files:
-
-cdjpeg.h	Declarations shared by cjpeg/djpeg/jpegtran modules.
-cderror.h	Additional error and trace message codes for cjpeg et al.
-transupp.h	Declarations for jpegtran support routines in transupp.c.
-
-C source code files:
-
-cjpeg.c		Main program for cjpeg.
-djpeg.c		Main program for djpeg.
-jpegtran.c	Main program for jpegtran.
-cdjpeg.c	Utility routines used by all three programs.
-rdcolmap.c	Code to read a colormap file for djpeg's "-map" switch.
-rdswitch.c	Code to process some of cjpeg's more complex switches.
-		Also used by jpegtran.
-transupp.c	Support code for jpegtran: lossless image manipulations.
-
-Image file reader modules for cjpeg:
-
-rdbmp.c		BMP file input.
-rdgif.c		GIF file input (now just a stub).
-rdppm.c		PPM/PGM file input.
-rdrle.c		Utah RLE file input.
-rdtarga.c	Targa file input.
-
-Image file writer modules for djpeg:
-
-wrbmp.c		BMP file output.
-wrgif.c		GIF file output (a mere shadow of its former self).
-wrppm.c		PPM/PGM file output.
-wrrle.c		Utah RLE file output.
-wrtarga.c	Targa file output.
-
-
-RDJPGCOM/WRJPGCOM
-=================
-
-C source code files:
-
-rdjpgcom.c	Stand-alone rdjpgcom application.
-wrjpgcom.c	Stand-alone wrjpgcom application.
-
-These programs do not depend on the IJG library.  They do use
-jconfig.h and jinclude.h, only to improve portability.
-
-
-ADDITIONAL FILES
-================
-
-Documentation (see README for a guide to the documentation files):
-
-README		Master documentation file.
-*.doc		Other documentation files.
-*.1		Documentation in Unix man page format.
-change.log	Version-to-version change highlights.
-example.c	Sample code for calling JPEG library.
-
-Configuration/installation files and programs (see install.doc for more info):
-
-configure	Unix shell script to perform automatic configuration.
-ltconfig	Support scripts for configure (from GNU libtool).
-ltmain.sh
-config.guess
-config.sub
-install-sh	Install shell script for those Unix systems lacking one.
-ckconfig.c	Program to generate jconfig.h on non-Unix systems.
-jconfig.doc	Template for making jconfig.h by hand.
-makefile.*	Sample makefiles for particular systems.
-jconfig.*	Sample jconfig.h for particular systems.
-ansi2knr.c	De-ANSIfier for pre-ANSI C compilers (courtesy of
-		L. Peter Deutsch and Aladdin Enterprises).
-
-Test files (see install.doc for test procedure):
-
-test*.*		Source and comparison files for confidence test.
-		These are binary image files, NOT text files.
diff --git a/jpeg/install.doc b/jpeg/install.doc
deleted file mode 100644
index 3702b986b6f9..000000000000
--- a/jpeg/install.doc
+++ /dev/null
@@ -1,1063 +0,0 @@
-INSTALLATION INSTRUCTIONS for the Independent JPEG Group's JPEG software
-
-Copyright (C) 1991-1998, Thomas G. Lane.
-This file is part of the Independent JPEG Group's software.
-For conditions of distribution and use, see the accompanying README file.
-
-
-This file explains how to configure and install the IJG software.  We have
-tried to make this software extremely portable and flexible, so that it can be
-adapted to almost any environment.  The downside of this decision is that the
-installation process is complicated.  We have provided shortcuts to simplify
-the task on common systems.  But in any case, you will need at least a little
-familiarity with C programming and program build procedures for your system.
-
-If you are only using this software as part of a larger program, the larger
-program's installation procedure may take care of configuring the IJG code.
-For example, Ghostscript's installation script will configure the IJG code.
-You don't need to read this file if you just want to compile Ghostscript.
-
-If you are on a Unix machine, you may not need to read this file at all.
-Try doing
-	./configure
-	make
-	make test
-If that doesn't complain, do
-	make install
-(better do "make -n install" first to see if the makefile will put the files
-where you want them).  Read further if you run into snags or want to customize
-the code for your system.
-
-
-TABLE OF CONTENTS
------------------
-
-Before you start
-Configuring the software:
-	using the automatic "configure" script
-	using one of the supplied jconfig and makefile files
-	by hand
-Building the software
-Testing the software
-Installing the software
-Optional stuff
-Optimization
-Hints for specific systems
-
-
-BEFORE YOU START
-================
-
-Before installing the software you must unpack the distributed source code.
-Since you are reading this file, you have probably already succeeded in this
-task.  However, there is a potential for error if you needed to convert the
-files to the local standard text file format (for example, if you are on
-MS-DOS you may have converted LF end-of-line to CR/LF).  You must apply
-such conversion to all the files EXCEPT those whose names begin with "test".
-The test files contain binary data; if you change them in any way then the
-self-test will give bad results.
-
-Please check the last section of this file to see if there are hints for the
-specific machine or compiler you are using.
-
-
-CONFIGURING THE SOFTWARE
-========================
-
-To configure the IJG code for your system, you need to create two files:
-  * jconfig.h: contains values for system-dependent #define symbols.
-  * Makefile: controls the compilation process.
-(On a non-Unix machine, you may create "project files" or some other
-substitute for a Makefile.  jconfig.h is needed in any environment.)
-
-We provide three different ways to generate these files:
-  * On a Unix system, you can just run the "configure" script.
-  * We provide sample jconfig files and makefiles for popular machines;
-    if your machine matches one of the samples, just copy the right sample
-    files to jconfig.h and Makefile.
-  * If all else fails, read the instructions below and make your own files.
-
-
-Configuring the software using the automatic "configure" script
----------------------------------------------------------------
-
-If you are on a Unix machine, you can just type
-	./configure
-and let the configure script construct appropriate configuration files.
-If you're using "csh" on an old version of System V, you might need to type
-	sh configure
-instead to prevent csh from trying to execute configure itself.
-Expect configure to run for a few minutes, particularly on slower machines;
-it works by compiling a series of test programs.
-
-Configure was created with GNU Autoconf and it follows the usual conventions
-for GNU configure scripts.  It makes a few assumptions that you may want to
-override.  You can do this by providing optional switches to configure:
-
-* If you want to build libjpeg as a shared library, say
-	./configure --enable-shared
-To get both shared and static libraries, say
-	./configure --enable-shared --enable-static
-Note that these switches invoke GNU libtool to take care of system-dependent
-shared library building methods.  If things don't work this way, please try
-running configure without either switch; that should build a static library
-without using libtool.  If that works, your problem is probably with libtool
-not with the IJG code.  libtool is fairly new and doesn't support all flavors
-of Unix yet.  (You might be able to find a newer version of libtool than the
-one included with libjpeg; see ftp.gnu.org.  Report libtool problems to
-bug-libtool@gnu.org.)
-
-* Configure will use gcc (GNU C compiler) if it's available, otherwise cc.
-To force a particular compiler to be selected, use the CC option, for example
-	./configure CC='cc'
-The same method can be used to include any unusual compiler switches.
-For example, on HP-UX you probably want to say
-	./configure CC='cc -Aa'
-to get HP's compiler to run in ANSI mode.
-
-* The default CFLAGS setting is "-O" for non-gcc compilers, "-O2" for gcc.
-You can override this by saying, for example,
-	./configure CFLAGS='-g'
-if you want to compile with debugging support.
-
-* Configure will set up the makefile so that "make install" will install files
-into /usr/local/bin, /usr/local/man, etc.  You can specify an installation
-prefix other than "/usr/local" by giving configure the option "--prefix=PATH".
-
-* If you don't have a lot of swap space, you may need to enable the IJG
-software's internal virtual memory mechanism.  To do this, give the option
-"--enable-maxmem=N" where N is the default maxmemory limit in megabytes.
-This is discussed in more detail under "Selecting a memory manager", below.
-You probably don't need to worry about this on reasonably-sized Unix machines,
-unless you plan to process very large images.
-
-Configure has some other features that are useful if you are cross-compiling
-or working in a network of multiple machine types; but if you need those
-features, you probably already know how to use them.
-
-
-Configuring the software using one of the supplied jconfig and makefile files
------------------------------------------------------------------------------
-
-If you have one of these systems, you can just use the provided configuration
-files:
-
-Makefile	jconfig file	System and/or compiler
-
-makefile.manx	jconfig.manx	Amiga, Manx Aztec C
-makefile.sas	jconfig.sas	Amiga, SAS C
-makeproj.mac	jconfig.mac	Apple Macintosh, Metrowerks CodeWarrior
-mak*jpeg.st	jconfig.st	Atari ST/STE/TT, Pure C or Turbo C
-makefile.bcc	jconfig.bcc	MS-DOS or OS/2, Borland C
-makefile.dj	jconfig.dj	MS-DOS, DJGPP (Delorie's port of GNU C)
-makefile.mc6	jconfig.mc6	MS-DOS, Microsoft C (16-bit only)
-makefile.wat	jconfig.wat	MS-DOS, OS/2, or Windows NT, Watcom C
-makefile.vc	jconfig.vc	Windows NT/95, MS Visual C++
-make*.ds	jconfig.vc	Windows NT/95, MS Developer Studio
-makefile.mms	jconfig.vms	Digital VMS, with MMS software
-makefile.vms	jconfig.vms	Digital VMS, without MMS software
-
-Copy the proper jconfig file to jconfig.h and the makefile to Makefile (or
-whatever your system uses as the standard makefile name).  For more info see
-the appropriate system-specific hints section near the end of this file.
-
-
-Configuring the software by hand
---------------------------------
-
-First, generate a jconfig.h file.  If you are moderately familiar with C,
-the comments in jconfig.doc should be enough information to do this; just
-copy jconfig.doc to jconfig.h and edit it appropriately.  Otherwise, you may
-prefer to use the ckconfig.c program.  You will need to compile and execute
-ckconfig.c by hand --- we hope you know at least enough to do that.
-ckconfig.c may not compile the first try (in fact, the whole idea is for it
-to fail if anything is going to).  If you get compile errors, fix them by
-editing ckconfig.c according to the directions given in ckconfig.c.  Once
-you get it to run, it will write a suitable jconfig.h file, and will also
-print out some advice about which makefile to use.
-
-You may also want to look at the canned jconfig files, if there is one for a
-system similar to yours.
-
-Second, select a makefile and copy it to Makefile (or whatever your system
-uses as the standard makefile name).  The most generic makefiles we provide
-are
-	makefile.ansi:	if your C compiler supports function prototypes
-	makefile.unix:	if not.
-(You have function prototypes if ckconfig.c put "#define HAVE_PROTOTYPES"
-in jconfig.h.)  You may want to start from one of the other makefiles if
-there is one for a system similar to yours.
-
-Look over the selected Makefile and adjust options as needed.  In particular
-you may want to change the CC and CFLAGS definitions.  For instance, if you
-are using GCC, set CC=gcc.  If you had to use any compiler switches to get
-ckconfig.c to work, make sure the same switches are in CFLAGS.
-
-If you are on a system that doesn't use makefiles, you'll need to set up
-project files (or whatever you do use) to compile all the source files and
-link them into executable files cjpeg, djpeg, jpegtran, rdjpgcom, and wrjpgcom.
-See the file lists in any of the makefiles to find out which files go into
-each program.  Note that the provided makefiles all make a "library" file
-libjpeg first, but you don't have to do that if you don't want to; the file
-lists identify which source files are actually needed for compression,
-decompression, or both.  As a last resort, you can make a batch script that
-just compiles everything and links it all together; makefile.vms is an example
-of this (it's for VMS systems that have no make-like utility).
-
-Here are comments about some specific configuration decisions you'll
-need to make:
-
-Command line style
-------------------
-
-These programs can use a Unix-like command line style which supports
-redirection and piping, like this:
-	cjpeg inputfile >outputfile
-	cjpeg <inputfile >outputfile
-	source program | cjpeg >outputfile
-The simpler "two file" command line style is just
-	cjpeg inputfile outputfile
-You may prefer the two-file style, particularly if you don't have pipes.
-
-You MUST use two-file style on any system that doesn't cope well with binary
-data fed through stdin/stdout; this is true for some MS-DOS compilers, for
-example.  If you're not on a Unix system, it's safest to assume you need
-two-file style.  (But if your compiler provides either the Posix-standard
-fdopen() library routine or a Microsoft-compatible setmode() routine, you
-can safely use the Unix command line style, by defining USE_FDOPEN or
-USE_SETMODE respectively.)
-
-To use the two-file style, make jconfig.h say "#define TWO_FILE_COMMANDLINE".
-
-Selecting a memory manager
---------------------------
-
-The IJG code is capable of working on images that are too big to fit in main
-memory; data is swapped out to temporary files as necessary.  However, the
-code to do this is rather system-dependent.  We provide five different
-memory managers:
-
-* jmemansi.c	This version uses the ANSI-standard library routine tmpfile(),
-		which not all non-ANSI systems have.  On some systems
-		tmpfile() may put the temporary file in a non-optimal
-		location; if you don't like what it does, use jmemname.c.
-
-* jmemname.c	This version creates named temporary files.  For anything
-		except a Unix machine, you'll need to configure the
-		select_file_name() routine appropriately; see the comments
-		near the head of jmemname.c.  If you use this version, define
-		NEED_SIGNAL_CATCHER in jconfig.h to make sure the temp files
-		are removed if the program is aborted.
-
-* jmemnobs.c	(That stands for No Backing Store :-).)  This will compile on
-		almost any system, but it assumes you have enough main memory
-		or virtual memory to hold the biggest images you work with.
-
-* jmemdos.c	This should be used with most 16-bit MS-DOS compilers.
-		See the system-specific notes about MS-DOS for more info.
-		IMPORTANT: if you use this, define USE_MSDOS_MEMMGR in
-		jconfig.h, and include the assembly file jmemdosa.asm in the
-		programs.  The supplied makefiles and jconfig files for
-		16-bit MS-DOS compilers already do both.
-
-* jmemmac.c	Custom version for Apple Macintosh; see the system-specific
-		notes for Macintosh for more info.
-
-To use a particular memory manager, change the SYSDEPMEM variable in your
-makefile to equal the corresponding object file name (for example, jmemansi.o
-or jmemansi.obj for jmemansi.c).
-
-If you have plenty of (real or virtual) main memory, just use jmemnobs.c.
-"Plenty" means about ten bytes for every pixel in the largest images
-you plan to process, so a lot of systems don't meet this criterion.
-If yours doesn't, try jmemansi.c first.  If that doesn't compile, you'll have
-to use jmemname.c; be sure to adjust select_file_name() for local conditions.
-You may also need to change unlink() to remove() in close_backing_store().
-
-Except with jmemnobs.c or jmemmac.c, you need to adjust the DEFAULT_MAX_MEM
-setting to a reasonable value for your system (either by adding a #define for
-DEFAULT_MAX_MEM to jconfig.h, or by adding a -D switch to the Makefile).
-This value limits the amount of data space the program will attempt to
-allocate.  Code and static data space isn't counted, so the actual memory
-needs for cjpeg or djpeg are typically 100 to 150Kb more than the max-memory
-setting.  Larger max-memory settings reduce the amount of I/O needed to
-process a large image, but too large a value can result in "insufficient
-memory" failures.  On most Unix machines (and other systems with virtual
-memory), just set DEFAULT_MAX_MEM to several million and forget it.  At the
-other end of the spectrum, for MS-DOS machines you probably can't go much
-above 300K to 400K.  (On MS-DOS the value refers to conventional memory only.
-Extended/expanded memory is handled separately by jmemdos.c.)
-
-
-BUILDING THE SOFTWARE
-=====================
-
-Now you should be able to compile the software.  Just say "make" (or
-whatever's necessary to start the compilation).  Have a cup of coffee.
-
-Here are some things that could go wrong:
-
-If your compiler complains about undefined structures, you should be able to
-shut it up by putting "#define INCOMPLETE_TYPES_BROKEN" in jconfig.h.
-
-If you have trouble with missing system include files or inclusion of the
-wrong ones, read jinclude.h.  This shouldn't happen if you used configure
-or ckconfig.c to set up jconfig.h.
-
-There are a fair number of routines that do not use all of their parameters;
-some compilers will issue warnings about this, which you can ignore.  There
-are also a few configuration checks that may give "unreachable code" warnings.
-Any other warning deserves investigation.
-
-If you don't have a getenv() library routine, define NO_GETENV.
-
-Also see the system-specific hints, below.
-
-
-TESTING THE SOFTWARE
-====================
-
-As a quick test of functionality we've included a small sample image in
-several forms:
-	testorig.jpg	Starting point for the djpeg tests.
-	testimg.ppm	The output of djpeg testorig.jpg
-	testimg.bmp	The output of djpeg -bmp -colors 256 testorig.jpg
-	testimg.jpg	The output of cjpeg testimg.ppm
-	testprog.jpg	Progressive-mode equivalent of testorig.jpg.
-	testimgp.jpg	The output of cjpeg -progressive -optimize testimg.ppm
-(The first- and second-generation .jpg files aren't identical since JPEG is
-lossy.)  If you can generate duplicates of the testimg* files then you
-probably have working programs.
-
-With most of the makefiles, "make test" will perform the necessary
-comparisons.
-
-If you're using a makefile that doesn't provide the test option, run djpeg
-and cjpeg by hand and compare the output files to testimg* with whatever
-binary file comparison tool you have.  The files should be bit-for-bit
-identical.
-
-If the programs complain "MAX_ALLOC_CHUNK is wrong, please fix", then you
-need to reduce MAX_ALLOC_CHUNK to a value that fits in type size_t.
-Try adding "#define MAX_ALLOC_CHUNK 65520L" to jconfig.h.  A less likely
-configuration error is "ALIGN_TYPE is wrong, please fix": defining ALIGN_TYPE
-as long should take care of that one.
-
-If the cjpeg test run fails with "Missing Huffman code table entry", it's a
-good bet that you needed to define RIGHT_SHIFT_IS_UNSIGNED.  Go back to the
-configuration step and run ckconfig.c.  (This is a good plan for any other
-test failure, too.)
-
-If you are using Unix (one-file) command line style on a non-Unix system,
-it's a good idea to check that binary I/O through stdin/stdout actually
-works.  You should get the same results from "djpeg <testorig.jpg >out.ppm"
-as from "djpeg -outfile out.ppm testorig.jpg".  Note that the makefiles all
-use the latter style and therefore do not exercise stdin/stdout!  If this
-check fails, try recompiling with USE_SETMODE or USE_FDOPEN defined.
-If it still doesn't work, better use two-file style.
-
-If you chose a memory manager other than jmemnobs.c, you should test that
-temporary-file usage works.  Try "djpeg -bmp -colors 256 -max 0 testorig.jpg"
-and make sure its output matches testimg.bmp.  If you have any really large
-images handy, try compressing them with -optimize and/or decompressing with
--colors 256 to make sure your DEFAULT_MAX_MEM setting is not too large.
-
-NOTE: this is far from an exhaustive test of the JPEG software; some modules,
-such as 1-pass color quantization, are not exercised at all.  It's just a
-quick test to give you some confidence that you haven't missed something
-major.
-
-
-INSTALLING THE SOFTWARE
-=======================
-
-Once you're done with the above steps, you can install the software by
-copying the executable files (cjpeg, djpeg, jpegtran, rdjpgcom, and wrjpgcom)
-to wherever you normally install programs.  On Unix systems, you'll also want
-to put the man pages (cjpeg.1, djpeg.1, jpegtran.1, rdjpgcom.1, wrjpgcom.1)
-in the man-page directory.  The pre-fab makefiles don't support this step
-since there's such a wide variety of installation procedures on different
-systems.
-
-If you generated a Makefile with the "configure" script, you can just say
-	make install
-to install the programs and their man pages into the standard places.
-(You'll probably need to be root to do this.)  We recommend first saying
-	make -n install
-to see where configure thought the files should go.  You may need to edit
-the Makefile, particularly if your system's conventions for man page
-filenames don't match what configure expects.
-
-If you want to install the IJG library itself, for use in compiling other
-programs besides ours, then you need to put the four include files
-	jpeglib.h jerror.h jconfig.h jmorecfg.h
-into your include-file directory, and put the library file libjpeg.a
-(extension may vary depending on system) wherever library files go.
-If you generated a Makefile with "configure", it will do what it thinks
-is the right thing if you say
-	make install-lib
-
-
-OPTIONAL STUFF
-==============
-
-Progress monitor:
-
-If you like, you can #define PROGRESS_REPORT (in jconfig.h) to enable display
-of percent-done progress reports.  The routine provided in cdjpeg.c merely
-prints percentages to stderr, but you can customize it to do something
-fancier.
-
-Utah RLE file format support:
-
-We distribute the software with support for RLE image files (Utah Raster
-Toolkit format) disabled, because the RLE support won't compile without the
-Utah library.  If you have URT version 3.1 or later, you can enable RLE
-support as follows:
-	1.  #define RLE_SUPPORTED in jconfig.h.
-	2.  Add a -I option to CFLAGS in the Makefile for the directory
-	    containing the URT .h files (typically the "include"
-	    subdirectory of the URT distribution).
-	3.  Add -L... -lrle to LDLIBS in the Makefile, where ... specifies
-	    the directory containing the URT "librle.a" file (typically the
-	    "lib" subdirectory of the URT distribution).
-
-Support for 12-bit-deep pixel data:
-
-The JPEG standard allows either 8-bit or 12-bit data precision.  (For color,
-this means 8 or 12 bits per channel, of course.)  If you need to work with
-deeper than 8-bit data, you can compile the IJG code for 12-bit operation.
-To do so:
-  1. In jmorecfg.h, define BITS_IN_JSAMPLE as 12 rather than 8.
-  2. In jconfig.h, undefine BMP_SUPPORTED, RLE_SUPPORTED, and TARGA_SUPPORTED,
-     because the code for those formats doesn't handle 12-bit data and won't
-     even compile.  (The PPM code does work, as explained below.  The GIF
-     code works too; it scales 8-bit GIF data to and from 12-bit depth
-     automatically.)
-  3. Compile.  Don't expect "make test" to pass, since the supplied test
-     files are for 8-bit data.
-
-Currently, 12-bit support does not work on 16-bit-int machines.
-
-Note that a 12-bit version will not read 8-bit JPEG files, nor vice versa;
-so you'll want to keep around a regular 8-bit compilation as well.
-(Run-time selection of data depth, to allow a single copy that does both,
-is possible but would probably slow things down considerably; it's very low
-on our to-do list.)
-
-The PPM reader (rdppm.c) can read 12-bit data from either text-format or
-binary-format PPM and PGM files.  Binary-format PPM/PGM files which have a
-maxval greater than 255 are assumed to use 2 bytes per sample, LSB first
-(little-endian order).  As of early 1995, 2-byte binary format is not
-officially supported by the PBMPLUS library, but it is expected that a
-future release of PBMPLUS will support it.  Note that the PPM reader will
-read files of any maxval regardless of the BITS_IN_JSAMPLE setting; incoming
-data is automatically rescaled to either maxval=255 or maxval=4095 as
-appropriate for the cjpeg bit depth.
-
-The PPM writer (wrppm.c) will normally write 2-byte binary PPM or PGM
-format, maxval 4095, when compiled with BITS_IN_JSAMPLE=12.  Since this
-format is not yet widely supported, you can disable it by compiling wrppm.c
-with PPM_NORAWWORD defined; then the data is scaled down to 8 bits to make a
-standard 1-byte/sample PPM or PGM file.  (Yes, this means still another copy
-of djpeg to keep around.  But hopefully you won't need it for very long.
-Poskanzer's supposed to get that new PBMPLUS release out Real Soon Now.)
-
-Of course, if you are working with 12-bit data, you probably have it stored
-in some other, nonstandard format.  In that case you'll probably want to
-write your own I/O modules to read and write your format.
-
-Note that a 12-bit version of cjpeg always runs in "-optimize" mode, in
-order to generate valid Huffman tables.  This is necessary because our
-default Huffman tables only cover 8-bit data.
-
-Removing code:
-
-If you need to make a smaller version of the JPEG software, some optional
-functions can be removed at compile time.  See the xxx_SUPPORTED #defines in
-jconfig.h and jmorecfg.h.  If at all possible, we recommend that you leave in
-decoder support for all valid JPEG files, to ensure that you can read anyone's
-output.  Taking out support for image file formats that you don't use is the
-most painless way to make the programs smaller.  Another possibility is to
-remove some of the DCT methods: in particular, the "IFAST" method may not be
-enough faster than the others to be worth keeping on your machine.  (If you
-do remove ISLOW or IFAST, be sure to redefine JDCT_DEFAULT or JDCT_FASTEST
-to a supported method, by adding a #define in jconfig.h.)
-
-
-OPTIMIZATION
-============
-
-Unless you own a Cray, you'll probably be interested in making the JPEG
-software go as fast as possible.  This section covers some machine-dependent
-optimizations you may want to try.  We suggest that before trying any of
-this, you first get the basic installation to pass the self-test step.
-Repeat the self-test after any optimization to make sure that you haven't
-broken anything.
-
-The integer DCT routines perform a lot of multiplications.  These
-multiplications must yield 32-bit results, but none of their input values
-are more than 16 bits wide.  On many machines, notably the 680x0 and 80x86
-CPUs, a 16x16=>32 bit multiply instruction is faster than a full 32x32=>32
-bit multiply.  Unfortunately there is no portable way to specify such a
-multiplication in C, but some compilers can generate one when you use the
-right combination of casts.  See the MULTIPLYxxx macro definitions in
-jdct.h.  If your compiler makes "int" be 32 bits and "short" be 16 bits,
-defining SHORTxSHORT_32 is fairly likely to work.  When experimenting with
-alternate definitions, be sure to test not only whether the code still works
-(use the self-test), but also whether it is actually faster --- on some
-compilers, alternate definitions may compute the right answer, yet be slower
-than the default.  Timing cjpeg on a large PGM (grayscale) input file is the
-best way to check this, as the DCT will be the largest fraction of the runtime
-in that mode.  (Note: some of the distributed compiler-specific jconfig files
-already contain #define switches to select appropriate MULTIPLYxxx
-definitions.)
-
-If your machine has sufficiently fast floating point hardware, you may find
-that the float DCT method is faster than the integer DCT methods, even
-after tweaking the integer multiply macros.  In that case you may want to
-make the float DCT be the default method.  (The only objection to this is
-that float DCT results may vary slightly across machines.)  To do that, add
-"#define JDCT_DEFAULT JDCT_FLOAT" to jconfig.h.  Even if you don't change
-the default, you should redefine JDCT_FASTEST, which is the method selected
-by djpeg's -fast switch.  Don't forget to update the documentation files
-(usage.doc and/or cjpeg.1, djpeg.1) to agree with what you've done.
-
-If access to "short" arrays is slow on your machine, it may be a win to
-define type JCOEF as int rather than short.  This will cost a good deal of
-memory though, particularly in some multi-pass modes, so don't do it unless
-you have memory to burn and short is REALLY slow.
-
-If your compiler can compile function calls in-line, make sure the INLINE
-macro in jmorecfg.h is defined as the keyword that marks a function
-inline-able.  Some compilers have a switch that tells the compiler to inline
-any function it thinks is profitable (e.g., -finline-functions for gcc).
-Enabling such a switch is likely to make the compiled code bigger but faster.
-
-In general, it's worth trying the maximum optimization level of your compiler,
-and experimenting with any optional optimizations such as loop unrolling.
-(Unfortunately, far too many compilers have optimizer bugs ... be prepared to
-back off if the code fails self-test.)  If you do any experimentation along
-these lines, please report the optimal settings to jpeg-info@uunet.uu.net so
-we can mention them in future releases.  Be sure to specify your machine and
-compiler version.
-
-
-HINTS FOR SPECIFIC SYSTEMS
-==========================
-
-We welcome reports on changes needed for systems not mentioned here.  Submit
-'em to jpeg-info@uunet.uu.net.  Also, if configure or ckconfig.c is wrong
-about how to configure the JPEG software for your system, please let us know.
-
-
-Acorn RISC OS:
-
-(Thanks to Simon Middleton for these hints on compiling with Desktop C.)
-After renaming the files according to Acorn conventions, take a copy of
-makefile.ansi, change all occurrences of 'libjpeg.a' to 'libjpeg.o' and
-change these definitions as indicated:
-
-CFLAGS= -throwback -IC: -Wn
-LDLIBS=C:o.Stubs
-SYSDEPMEM=jmemansi.o
-LN=Link
-AR=LibFile -c -o
-
-Also add a new line '.c.o:; $(cc) $< $(cflags) -c -o $@'.  Remove the
-lines '$(RM) libjpeg.o' and '$(AR2) libjpeg.o' and the 'jconfig.h'
-dependency section.
-
-Copy jconfig.doc to jconfig.h.  Edit jconfig.h to define TWO_FILE_COMMANDLINE
-and CHAR_IS_UNSIGNED.
-
-Run the makefile using !AMU not !Make.  If you want to use the 'clean' and
-'test' makefile entries then you will have to fiddle with the syntax a bit
-and rename the test files.
-
-
-Amiga:
-
-SAS C 6.50 reportedly is too buggy to compile the IJG code properly.
-A patch to update to 6.51 is available from SAS or AmiNet FTP sites.
-
-The supplied config files are set up to use jmemname.c as the memory
-manager, with temporary files being created on the device named by
-"JPEGTMP:".
-
-
-Atari ST/STE/TT:
- 
-Copy the project files makcjpeg.st, makdjpeg.st, maktjpeg.st, and makljpeg.st
-to cjpeg.prj, djpeg.prj, jpegtran.prj, and libjpeg.prj respectively.  The
-project files should work as-is with Pure C.  For Turbo C, change library
-filenames "pc..." to "tc..." in each project file.  Note that libjpeg.prj
-selects jmemansi.c as the recommended memory manager.  You'll probably want to
-adjust the DEFAULT_MAX_MEM setting --- you want it to be a couple hundred K
-less than your normal free memory.  Put "#define DEFAULT_MAX_MEM nnnn" into
-jconfig.h to do this.
-
-To use the 68881/68882 coprocessor for the floating point DCT, add the
-compiler option "-8" to the project files and replace pcfltlib.lib with
-pc881lib.lib in cjpeg.prj and djpeg.prj.  Or if you don't have a
-coprocessor, you may prefer to remove the float DCT code by undefining
-DCT_FLOAT_SUPPORTED in jmorecfg.h (since without a coprocessor, the float
-code will be too slow to be useful).  In that case, you can delete
-pcfltlib.lib from the project files.
-
-Note that you must make libjpeg.lib before making cjpeg.ttp, djpeg.ttp,
-or jpegtran.ttp.  You'll have to perform the self-test by hand.
-
-We haven't bothered to include project files for rdjpgcom and wrjpgcom.
-Those source files should just be compiled by themselves; they don't
-depend on the JPEG library.
-
-There is a bug in some older versions of the Turbo C library which causes the
-space used by temporary files created with "tmpfile()" not to be freed after
-an abnormal program exit.  If you check your disk afterwards, you will find
-cluster chains that are allocated but not used by a file.  This should not
-happen in cjpeg/djpeg/jpegtran, since we enable a signal catcher to explicitly
-close temp files before exiting.  But if you use the JPEG library with your
-own code, be sure to supply a signal catcher, or else use a different
-system-dependent memory manager.
-
-
-Cray:
-
-Should you be so fortunate as to be running JPEG on a Cray YMP, there is a
-compiler bug in old versions of Cray's Standard C (prior to 3.1).  If you
-still have an old compiler, you'll need to insert a line reading
-"#pragma novector" just before the loop	
-    for (i = 1; i <= (int) htbl->bits[l]; i++)
-      huffsize[p++] = (char) l;
-in fix_huff_tbl (in V5beta1, line 204 of jchuff.c and line 176 of jdhuff.c).
-[This bug may or may not still occur with the current IJG code, but it's
-probably a dead issue anyway...]
-
-
-HP-UX:
-
-If you have HP-UX 7.05 or later with the "software development" C compiler,
-you should run the compiler in ANSI mode.  If using the configure script,
-say
-	./configure CC='cc -Aa'
-(or -Ae if you prefer).  If configuring by hand, use makefile.ansi and add
-"-Aa" to the CFLAGS line in the makefile.
-
-If you have a pre-7.05 system, or if you are using the non-ANSI C compiler
-delivered with a minimum HP-UX system, then you must use makefile.unix
-(and do NOT add -Aa); or just run configure without the CC option.
-
-On HP 9000 series 800 machines, the HP C compiler is buggy in revisions prior
-to A.08.07.  If you get complaints about "not a typedef name", you'll have to
-use makefile.unix, or run configure without the CC option.
-
-
-Macintosh, generic comments:
-
-The supplied user-interface files (cjpeg.c, djpeg.c, etc) are set up to
-provide a Unix-style command line interface.  You can use this interface on
-the Mac by means of the ccommand() library routine provided by Metrowerks
-CodeWarrior or Think C.  This is only appropriate for testing the library,
-however; to make a user-friendly equivalent of cjpeg/djpeg you'd really want
-to develop a Mac-style user interface.  There isn't a complete example
-available at the moment, but there are some helpful starting points:
-1. Sam Bushell's free "To JPEG" applet provides drag-and-drop conversion to
-JPEG under System 7 and later.  This only illustrates how to use the
-compression half of the library, but it does a very nice job of that part.
-The CodeWarrior source code is available from http://www.pobox.com/~jsam.
-2. Jim Brunner prepared a Mac-style user interface for both compression and
-decompression.  Unfortunately, it hasn't been updated since IJG v4, and
-the library's API has changed considerably since then.  Still it may be of
-some help, particularly as a guide to compiling the IJG code under Think C.
-Jim's code is available from the Info-Mac archives, at sumex-aim.stanford.edu
-or mirrors thereof; see file /info-mac/dev/src/jpeg-convert-c.hqx.
-
-jmemmac.c is the recommended memory manager back end for Macintosh.  It uses
-NewPtr/DisposePtr instead of malloc/free, and has a Mac-specific
-implementation of jpeg_mem_available().  It also creates temporary files that
-follow Mac conventions.  (That part of the code relies on System-7-or-later OS
-functions.  See the comments in jmemmac.c if you need to run it on System 6.)
-NOTE that USE_MAC_MEMMGR must be defined in jconfig.h to use jmemmac.c.
-
-You can also use jmemnobs.c, if you don't care about handling images larger
-than available memory.  If you use any memory manager back end other than
-jmemmac.c, we recommend replacing "malloc" and "free" by "NewPtr" and
-"DisposePtr", because Mac C libraries often have peculiar implementations of
-malloc/free.  (For instance, free() may not return the freed space to the
-Mac Memory Manager.  This is undesirable for the IJG code because jmemmgr.c
-already clumps space requests.)
-
-
-Macintosh, Metrowerks CodeWarrior:
-
-The Unix-command-line-style interface can be used by defining USE_CCOMMAND.
-You'll also need to define TWO_FILE_COMMANDLINE to avoid stdin/stdout.
-This means that when using the cjpeg/djpeg programs, you'll have to type the
-input and output file names in the "Arguments" text-edit box, rather than
-using the file radio buttons.  (Perhaps USE_FDOPEN or USE_SETMODE would
-eliminate the problem, but I haven't heard from anyone who's tried it.)
-
-On 680x0 Macs, Metrowerks defines type "double" as a 10-byte IEEE extended
-float.  jmemmgr.c won't like this: it wants sizeof(ALIGN_TYPE) to be a power
-of 2.  Add "#define ALIGN_TYPE long" to jconfig.h to eliminate the complaint.
-
-The supplied configuration file jconfig.mac can be used for your jconfig.h;
-it includes all the recommended symbol definitions.  If you have AppleScript
-installed, you can run the supplied script makeproj.mac to create CodeWarrior
-project files for the library and the testbed applications, then build the
-library and applications.  (Thanks to Dan Sears and Don Agro for this nifty
-hack, which saves us from trying to maintain CodeWarrior project files as part
-of the IJG distribution...)
-
-
-Macintosh, Think C:
-
-The documentation in Jim Brunner's "JPEG Convert" source code (see above)
-includes detailed build instructions for Think C; it's probably somewhat
-out of date for the current release, but may be helpful.
-
-If you want to build the minimal command line version, proceed as follows.
-You'll have to prepare project files for the programs; we don't include any
-in the distribution since they are not text files.  Use the file lists in
-any of the supplied makefiles as a guide.  Also add the ANSI and Unix C
-libraries in a separate segment.  You may need to divide the JPEG files into
-more than one segment; we recommend dividing compression and decompression
-modules.  Define USE_CCOMMAND in jconfig.h so that the ccommand() routine is
-called.  You must also define TWO_FILE_COMMANDLINE because stdin/stdout
-don't handle binary data correctly.
-
-On 680x0 Macs, Think C defines type "double" as a 12-byte IEEE extended float.
-jmemmgr.c won't like this: it wants sizeof(ALIGN_TYPE) to be a power of 2.
-Add "#define ALIGN_TYPE long" to jconfig.h to eliminate the complaint.
-
-jconfig.mac should work as a jconfig.h configuration file for Think C,
-but the makeproj.mac AppleScript script is specific to CodeWarrior.  Sorry.
-
-
-MIPS R3000:
-
-MIPS's cc version 1.31 has a rather nasty optimization bug.  Don't use -O
-if you have that compiler version.  (Use "cc -V" to check the version.)
-Note that the R3000 chip is found in workstations from DEC and others.
-
-
-MS-DOS, generic comments for 16-bit compilers:
-
-The IJG code is designed to work well in 80x86 "small" or "medium" memory
-models (i.e., data pointers are 16 bits unless explicitly declared "far";
-code pointers can be either size).  You may be able to use small model to
-compile cjpeg or djpeg by itself, but you will probably have to use medium
-model for any larger application.  This won't make much difference in
-performance.  You *will* take a noticeable performance hit if you use a
-large-data memory model, and you should avoid "huge" model if at all
-possible.  Be sure that NEED_FAR_POINTERS is defined in jconfig.h if you use
-a small-data memory model; be sure it is NOT defined if you use a large-data
-model.  (The supplied makefiles and jconfig files for Borland and Microsoft C
-compile in medium model and define NEED_FAR_POINTERS.)
-
-The DOS-specific memory manager, jmemdos.c, should be used if possible.
-It needs some assembly-code routines which are in jmemdosa.asm; make sure
-your makefile assembles that file and includes it in the library.  If you
-don't have a suitable assembler, you can get pre-assembled object files for
-jmemdosa by FTP from ftp.uu.net:/graphics/jpeg/jdosaobj.zip.  (DOS-oriented
-distributions of the IJG source code often include these object files.)
-
-When using jmemdos.c, jconfig.h must define USE_MSDOS_MEMMGR and must set
-MAX_ALLOC_CHUNK to less than 64K (65520L is a typical value).  If your
-C library's far-heap malloc() can't allocate blocks that large, reduce
-MAX_ALLOC_CHUNK to whatever it can handle.
-
-If you can't use jmemdos.c for some reason --- for example, because you
-don't have an assembler to assemble jmemdosa.asm --- you'll have to fall
-back to jmemansi.c or jmemname.c.  You'll probably still need to set
-MAX_ALLOC_CHUNK in jconfig.h, because most DOS C libraries won't malloc()
-more than 64K at a time.  IMPORTANT: if you use jmemansi.c or jmemname.c,
-you will have to compile in a large-data memory model in order to get the
-right stdio library.  Too bad.
-
-wrjpgcom needs to be compiled in large model, because it malloc()s a 64KB
-work area to hold the comment text.  If your C library's malloc can't
-handle that, reduce MAX_COM_LENGTH as necessary in wrjpgcom.c.
-
-Most MS-DOS compilers treat stdin/stdout as text files, so you must use
-two-file command line style.  But if your compiler has either fdopen() or
-setmode(), you can use one-file style if you like.  To do this, define
-USE_SETMODE or USE_FDOPEN so that stdin/stdout will be set to binary mode.
-(USE_SETMODE seems to work with more DOS compilers than USE_FDOPEN.)  You
-should test that I/O through stdin/stdout produces the same results as I/O
-to explicitly named files... the "make test" procedures in the supplied
-makefiles do NOT use stdin/stdout.
-
-
-MS-DOS, generic comments for 32-bit compilers:
-
-None of the above comments about memory models apply if you are using a
-32-bit flat-memory-space environment, such as DJGPP or Watcom C.  (And you
-should use one if you have it, as performance will be much better than
-8086-compatible code!)  For flat-memory-space compilers, do NOT define
-NEED_FAR_POINTERS, and do NOT use jmemdos.c.  Use jmemnobs.c if the
-environment supplies adequate virtual memory, otherwise use jmemansi.c or
-jmemname.c.
-
-You'll still need to be careful about binary I/O through stdin/stdout.
-See the last paragraph of the previous section.
-
-
-MS-DOS, Borland C:
-
-Be sure to convert all the source files to DOS text format (CR/LF newlines).
-Although Borland C will often work OK with unmodified Unix (LF newlines)
-source files, sometimes it will give bogus compile errors.
-"Illegal character '#'" is the most common such error.  (This is true with
-Borland C 3.1, but perhaps is fixed in newer releases.)
-
-If you want one-file command line style, just undefine TWO_FILE_COMMANDLINE.
-jconfig.bcc already includes #define USE_SETMODE to make this work.
-(fdopen does not work correctly.)
-
-
-MS-DOS, Microsoft C:
-
-makefile.mc6 works with Microsoft C, DOS Visual C++, etc.  It should only
-be used if you want to build a 16-bit (small or medium memory model) program.
-
-If you want one-file command line style, just undefine TWO_FILE_COMMANDLINE.
-jconfig.mc6 already includes #define USE_SETMODE to make this work.
-(fdopen does not work correctly.)
-
-Note that this makefile assumes that the working copy of itself is called
-"makefile".  If you want to call it something else, say "makefile.mak",
-be sure to adjust the dependency line that reads "$(RFILE) : makefile".
-Otherwise the make will fail because it doesn't know how to create "makefile".
-Worse, some releases of Microsoft's make utilities give an incorrect error
-message in this situation.
-
-Old versions of MS C fail with an "out of macro expansion space" error
-because they can't cope with the macro TRACEMS8 (defined in jerror.h).
-If this happens to you, the easiest solution is to change TRACEMS8 to
-expand to nothing.  You'll lose the ability to dump out JPEG coefficient
-tables with djpeg -debug -debug, but at least you can compile.
-
-Original MS C 6.0 is very buggy; it compiles incorrect code unless you turn
-off optimization entirely (remove -O from CFLAGS).  6.00A is better, but it
-still generates bad code if you enable loop optimizations (-Ol or -Ox).
-
-MS C 8.0 crashes when compiling jquant1.c with optimization switch /Oo ...
-which is on by default.  To work around this bug, compile that one file
-with /Oo-.
-
-
-Microsoft Windows (all versions), generic comments:
-
-Some Windows system include files define typedef boolean as "unsigned char".
-The IJG code also defines typedef boolean, but we make it "int" by default.
-This doesn't affect the IJG programs because we don't import those Windows
-include files.  But if you use the JPEG library in your own program, and some
-of your program's files import one definition of boolean while some import the
-other, you can get all sorts of mysterious problems.  A good preventive step
-is to make the IJG library use "unsigned char" for boolean.  To do that,
-add something like this to your jconfig.h file:
-	/* Define "boolean" as unsigned char, not int, per Windows custom */
-	#ifndef __RPCNDR_H__	/* don't conflict if rpcndr.h already read */
-	typedef unsigned char boolean;
-	#endif
-	#define HAVE_BOOLEAN	/* prevent jmorecfg.h from redefining it */
-(This is already in jconfig.vc, by the way.)
-
-windef.h contains the declarations
-	#define far
-	#define FAR far
-Since jmorecfg.h tries to define FAR as empty, you may get a compiler
-warning if you include both jpeglib.h and windef.h (which windows.h
-includes).  To suppress the warning, you can put "#ifndef FAR"/"#endif"
-around the line "#define FAR" in jmorecfg.h.
-
-When using the library in a Windows application, you will almost certainly
-want to modify or replace the error handler module jerror.c, since our
-default error handler does a couple of inappropriate things:
-  1. it tries to write error and warning messages on stderr;
-  2. in event of a fatal error, it exits by calling exit().
-
-A simple stopgap solution for problem 1 is to replace the line
-	fprintf(stderr, "%s\n", buffer);
-(in output_message in jerror.c) with
-	MessageBox(GetActiveWindow(),buffer,"JPEG Error",MB_OK|MB_ICONERROR);
-It's highly recommended that you at least do that much, since otherwise
-error messages will disappear into nowhere.  (Beginning with IJG v6b, this
-code is already present in jerror.c; just define USE_WINDOWS_MESSAGEBOX in
-jconfig.h to enable it.)
-
-The proper solution for problem 2 is to return control to your calling
-application after a library error.  This can be done with the setjmp/longjmp
-technique discussed in libjpeg.doc and illustrated in example.c.  (NOTE:
-some older Windows C compilers provide versions of setjmp/longjmp that
-don't actually work under Windows.  You may need to use the Windows system
-functions Catch and Throw instead.)
-
-The recommended memory manager under Windows is jmemnobs.c; in other words,
-let Windows do any virtual memory management needed.  You should NOT use
-jmemdos.c nor jmemdosa.asm under Windows.
-
-For Windows 3.1, we recommend compiling in medium or large memory model;
-for newer Windows versions, use a 32-bit flat memory model.  (See the MS-DOS
-sections above for more info about memory models.)  In the 16-bit memory
-models only, you'll need to put
-	#define MAX_ALLOC_CHUNK 65520L	/* Maximum request to malloc() */
-into jconfig.h to limit allocation chunks to 64Kb.  (Without that, you'd
-have to use huge memory model, which slows things down unnecessarily.)
-jmemnobs.c works without modification in large or flat memory models, but to
-use medium model, you need to modify its jpeg_get_large and jpeg_free_large
-routines to allocate far memory.  In any case, you might like to replace
-its calls to malloc and free with direct calls on Windows memory allocation
-functions.
-
-You may also want to modify jdatasrc.c and jdatadst.c to use Windows file
-operations rather than fread/fwrite.  This is only necessary if your C
-compiler doesn't provide a competent implementation of C stdio functions.
-
-You might want to tweak the RGB_xxx macros in jmorecfg.h so that the library
-will accept or deliver color pixels in BGR sample order, not RGB; BGR order
-is usually more convenient under Windows.  Note that this change will break
-the sample applications cjpeg/djpeg, but the library itself works fine.
-
-
-Many people want to convert the IJG library into a DLL.  This is reasonably
-straightforward, but watch out for the following:
-
-  1. Don't try to compile as a DLL in small or medium memory model; use
-large model, or even better, 32-bit flat model.  Many places in the IJG code
-assume the address of a local variable is an ordinary (not FAR) pointer;
-that isn't true in a medium-model DLL.
-
-  2. Microsoft C cannot pass file pointers between applications and DLLs.
-(See Microsoft Knowledge Base, PSS ID Number Q50336.)  So jdatasrc.c and
-jdatadst.c don't work if you open a file in your application and then pass
-the pointer to the DLL.  One workaround is to make jdatasrc.c/jdatadst.c
-part of your main application rather than part of the DLL.
-
-  3. You'll probably need to modify the macros GLOBAL() and EXTERN() to
-attach suitable linkage keywords to the exported routine names.  Similarly,
-you'll want to modify METHODDEF() and JMETHOD() to ensure function pointers
-are declared in a way that lets application routines be called back through
-the function pointers.  These macros are in jmorecfg.h.  Typical definitions
-for a 16-bit DLL are:
-	#define GLOBAL(type)		type _far _pascal _loadds _export
-	#define EXTERN(type)		extern type _far _pascal _loadds
-	#define METHODDEF(type)		static type _far _pascal
-	#define JMETHOD(type,methodname,arglist)  \
-		type (_far _pascal *methodname) arglist
-For a 32-bit DLL you may want something like
-	#define GLOBAL(type)		__declspec(dllexport) type
-	#define EXTERN(type)		extern __declspec(dllexport) type
-Although not all the GLOBAL routines are actually intended to be called by
-the application, the performance cost of making them all DLL entry points is
-negligible.
-
-The unmodified IJG library presents a very C-specific application interface,
-so the resulting DLL is only usable from C or C++ applications.  There has
-been some talk of writing wrapper code that would present a simpler interface
-usable from other languages, such as Visual Basic.  This is on our to-do list
-but hasn't been very high priority --- any volunteers out there?
-
-
-Microsoft Windows, Borland C:
-
-The provided jconfig.bcc should work OK in a 32-bit Windows environment,
-but you'll need to tweak it in a 16-bit environment (you'd need to define
-NEED_FAR_POINTERS and MAX_ALLOC_CHUNK).  Beware that makefile.bcc will need
-alteration if you want to use it for Windows --- in particular, you should
-use jmemnobs.c not jmemdos.c under Windows.
-
-Borland C++ 4.5 fails with an internal compiler error when trying to compile
-jdmerge.c in 32-bit mode.  If enough people complain, perhaps Borland will fix
-it.  In the meantime, the simplest known workaround is to add a redundant
-definition of the variable range_limit in h2v1_merged_upsample(), at the head
-of the block that handles odd image width (about line 268 in v6 jdmerge.c):
-  /* If image width is odd, do the last output column separately */
-  if (cinfo->output_width & 1) {
-    register JSAMPLE * range_limit = cinfo->sample_range_limit; /* ADD THIS */
-    cb = GETJSAMPLE(*inptr1);
-Pretty bizarre, especially since the very similar routine h2v2_merged_upsample
-doesn't trigger the bug.
-Recent reports suggest that this bug does not occur with "bcc32a" (the
-Pentium-optimized version of the compiler).
-
-Another report from a user of Borland C 4.5 was that incorrect code (leading
-to a color shift in processed images) was produced if any of the following
-optimization switch combinations were used: 
-	-Ot -Og
-	-Ot -Op
-	-Ot -Om
-So try backing off on optimization if you see such a problem.  (Are there
-several different releases all numbered "4.5"??)
-
-
-Microsoft Windows, Microsoft Visual C++:
-
-jconfig.vc should work OK with any Microsoft compiler for a 32-bit memory
-model.  makefile.vc is intended for command-line use.  (If you are using
-the Developer Studio environment, you may prefer the DevStudio project
-files; see below.)
-
-Some users feel that it's easier to call the library from C++ code if you
-force VC++ to treat the library as C++ code, which you can do by renaming
-all the *.c files to *.cpp (and adjusting the makefile to match).  This
-avoids the need to put extern "C" { ... } around #include "jpeglib.h" in
-your C++ application.
-
-
-Microsoft Windows, Microsoft Developer Studio:
-
-We include makefiles that should work as project files in DevStudio 4.2 or
-later.  There is a library makefile that builds the IJG library as a static
-Win32 library, and an application makefile that builds the sample applications
-as Win32 console applications.  (Even if you only want the library, we
-recommend building the applications so that you can run the self-test.)
-
-To use:
-1. Copy jconfig.vc to jconfig.h, makelib.ds to jpeg.mak, and
-   makeapps.ds to apps.mak.  (Note that the renaming is critical!)
-2. Click on the .mak files to construct project workspaces.
-   (If you are using DevStudio more recent than 4.2, you'll probably
-   get a message saying that the makefiles are being updated.)
-3. Build the library project, then the applications project.
-4. Move the application .exe files from `app`\Release to an
-   appropriate location on your path.
-5. To perform the self-test, execute the command line
-	NMAKE /f makefile.vc  test
-
-
-OS/2, Borland C++:
-
-Watch out for optimization bugs in older Borland compilers; you may need
-to back off the optimization switch settings.  See the comments in
-makefile.bcc.
-
-
-SGI:
-
-On some SGI systems, you may need to set "AR2= ar -ts" in the Makefile.
-If you are using configure, you can do this by saying
-	./configure RANLIB='ar -ts'
-This change is not needed on all SGIs.  Use it only if the make fails at the
-stage of linking the completed programs.
-
-On the MIPS R4000 architecture (Indy, etc.), the compiler option "-mips2"
-reportedly speeds up the float DCT method substantially, enough to make it
-faster than the default int method (but still slower than the fast int
-method).  If you use -mips2, you may want to alter the default DCT method to
-be float.  To do this, put "#define JDCT_DEFAULT JDCT_FLOAT" in jconfig.h.
-
-
-VMS:
-
-On an Alpha/VMS system with MMS, be sure to use the "/Marco=Alpha=1"
-qualifier with MMS when building the JPEG package.
-
-VAX/VMS v5.5-1 may have problems with the test step of the build procedure
-reporting differences when it compares the original and test images.  If the
-error points to the last block of the files, it is most likely bogus and may
-be safely ignored.  It seems to be because the files are Stream_LF and
-Backup/Compare has difficulty with the (presumably) null padded files.
-This problem was not observed on VAX/VMS v6.1 or AXP/VMS v6.1.
diff --git a/jpeg/jaricom.c b/jpeg/jaricom.c
new file mode 100644
index 000000000000..2195a9733153
--- /dev/null
+++ b/jpeg/jaricom.c
@@ -0,0 +1,153 @@
+/*
+ * jaricom.c
+ *
+ * Developed 1997-2009 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains probability estimation tables for common use in
+ * arithmetic entropy encoding and decoding routines.
+ *
+ * This data represents Table D.2 in the JPEG spec (ISO/IEC IS 10918-1
+ * and CCITT Recommendation ITU-T T.81) and Table 24 in the JBIG spec
+ * (ISO/IEC IS 11544 and CCITT Recommendation ITU-T T.82).
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+/* The following #define specifies the packing of the four components
+ * into the compact INT32 representation.
+ * Note that this formula must match the actual arithmetic encoder
+ * and decoder implementation.  The implementation has to be changed
+ * if this formula is changed.
+ * The current organization is modeled on Markus Kuhn's JBIG
+ * implementation (jbig_tab.c).
+ */
+
+#define V(i,a,b,c,d) (((INT32)a << 16) | ((INT32)c << 8) | ((INT32)d << 7) | b)
+
+const INT32 jpeg_aritab[113+1] = {
+/*
+ * Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS
+ */
+  V(   0, 0x5a1d,   1,   1, 1 ),
+  V(   1, 0x2586,  14,   2, 0 ),
+  V(   2, 0x1114,  16,   3, 0 ),
+  V(   3, 0x080b,  18,   4, 0 ),
+  V(   4, 0x03d8,  20,   5, 0 ),
+  V(   5, 0x01da,  23,   6, 0 ),
+  V(   6, 0x00e5,  25,   7, 0 ),
+  V(   7, 0x006f,  28,   8, 0 ),
+  V(   8, 0x0036,  30,   9, 0 ),
+  V(   9, 0x001a,  33,  10, 0 ),
+  V(  10, 0x000d,  35,  11, 0 ),
+  V(  11, 0x0006,   9,  12, 0 ),
+  V(  12, 0x0003,  10,  13, 0 ),
+  V(  13, 0x0001,  12,  13, 0 ),
+  V(  14, 0x5a7f,  15,  15, 1 ),
+  V(  15, 0x3f25,  36,  16, 0 ),
+  V(  16, 0x2cf2,  38,  17, 0 ),
+  V(  17, 0x207c,  39,  18, 0 ),
+  V(  18, 0x17b9,  40,  19, 0 ),
+  V(  19, 0x1182,  42,  20, 0 ),
+  V(  20, 0x0cef,  43,  21, 0 ),
+  V(  21, 0x09a1,  45,  22, 0 ),
+  V(  22, 0x072f,  46,  23, 0 ),
+  V(  23, 0x055c,  48,  24, 0 ),
+  V(  24, 0x0406,  49,  25, 0 ),
+  V(  25, 0x0303,  51,  26, 0 ),
+  V(  26, 0x0240,  52,  27, 0 ),
+  V(  27, 0x01b1,  54,  28, 0 ),
+  V(  28, 0x0144,  56,  29, 0 ),
+  V(  29, 0x00f5,  57,  30, 0 ),
+  V(  30, 0x00b7,  59,  31, 0 ),
+  V(  31, 0x008a,  60,  32, 0 ),
+  V(  32, 0x0068,  62,  33, 0 ),
+  V(  33, 0x004e,  63,  34, 0 ),
+  V(  34, 0x003b,  32,  35, 0 ),
+  V(  35, 0x002c,  33,   9, 0 ),
+  V(  36, 0x5ae1,  37,  37, 1 ),
+  V(  37, 0x484c,  64,  38, 0 ),
+  V(  38, 0x3a0d,  65,  39, 0 ),
+  V(  39, 0x2ef1,  67,  40, 0 ),
+  V(  40, 0x261f,  68,  41, 0 ),
+  V(  41, 0x1f33,  69,  42, 0 ),
+  V(  42, 0x19a8,  70,  43, 0 ),
+  V(  43, 0x1518,  72,  44, 0 ),
+  V(  44, 0x1177,  73,  45, 0 ),
+  V(  45, 0x0e74,  74,  46, 0 ),
+  V(  46, 0x0bfb,  75,  47, 0 ),
+  V(  47, 0x09f8,  77,  48, 0 ),
+  V(  48, 0x0861,  78,  49, 0 ),
+  V(  49, 0x0706,  79,  50, 0 ),
+  V(  50, 0x05cd,  48,  51, 0 ),
+  V(  51, 0x04de,  50,  52, 0 ),
+  V(  52, 0x040f,  50,  53, 0 ),
+  V(  53, 0x0363,  51,  54, 0 ),
+  V(  54, 0x02d4,  52,  55, 0 ),
+  V(  55, 0x025c,  53,  56, 0 ),
+  V(  56, 0x01f8,  54,  57, 0 ),
+  V(  57, 0x01a4,  55,  58, 0 ),
+  V(  58, 0x0160,  56,  59, 0 ),
+  V(  59, 0x0125,  57,  60, 0 ),
+  V(  60, 0x00f6,  58,  61, 0 ),
+  V(  61, 0x00cb,  59,  62, 0 ),
+  V(  62, 0x00ab,  61,  63, 0 ),
+  V(  63, 0x008f,  61,  32, 0 ),
+  V(  64, 0x5b12,  65,  65, 1 ),
+  V(  65, 0x4d04,  80,  66, 0 ),
+  V(  66, 0x412c,  81,  67, 0 ),
+  V(  67, 0x37d8,  82,  68, 0 ),
+  V(  68, 0x2fe8,  83,  69, 0 ),
+  V(  69, 0x293c,  84,  70, 0 ),
+  V(  70, 0x2379,  86,  71, 0 ),
+  V(  71, 0x1edf,  87,  72, 0 ),
+  V(  72, 0x1aa9,  87,  73, 0 ),
+  V(  73, 0x174e,  72,  74, 0 ),
+  V(  74, 0x1424,  72,  75, 0 ),
+  V(  75, 0x119c,  74,  76, 0 ),
+  V(  76, 0x0f6b,  74,  77, 0 ),
+  V(  77, 0x0d51,  75,  78, 0 ),
+  V(  78, 0x0bb6,  77,  79, 0 ),
+  V(  79, 0x0a40,  77,  48, 0 ),
+  V(  80, 0x5832,  80,  81, 1 ),
+  V(  81, 0x4d1c,  88,  82, 0 ),
+  V(  82, 0x438e,  89,  83, 0 ),
+  V(  83, 0x3bdd,  90,  84, 0 ),
+  V(  84, 0x34ee,  91,  85, 0 ),
+  V(  85, 0x2eae,  92,  86, 0 ),
+  V(  86, 0x299a,  93,  87, 0 ),
+  V(  87, 0x2516,  86,  71, 0 ),
+  V(  88, 0x5570,  88,  89, 1 ),
+  V(  89, 0x4ca9,  95,  90, 0 ),
+  V(  90, 0x44d9,  96,  91, 0 ),
+  V(  91, 0x3e22,  97,  92, 0 ),
+  V(  92, 0x3824,  99,  93, 0 ),
+  V(  93, 0x32b4,  99,  94, 0 ),
+  V(  94, 0x2e17,  93,  86, 0 ),
+  V(  95, 0x56a8,  95,  96, 1 ),
+  V(  96, 0x4f46, 101,  97, 0 ),
+  V(  97, 0x47e5, 102,  98, 0 ),
+  V(  98, 0x41cf, 103,  99, 0 ),
+  V(  99, 0x3c3d, 104, 100, 0 ),
+  V( 100, 0x375e,  99,  93, 0 ),
+  V( 101, 0x5231, 105, 102, 0 ),
+  V( 102, 0x4c0f, 106, 103, 0 ),
+  V( 103, 0x4639, 107, 104, 0 ),
+  V( 104, 0x415e, 103,  99, 0 ),
+  V( 105, 0x5627, 105, 106, 1 ),
+  V( 106, 0x50e7, 108, 107, 0 ),
+  V( 107, 0x4b85, 109, 103, 0 ),
+  V( 108, 0x5597, 110, 109, 0 ),
+  V( 109, 0x504f, 111, 107, 0 ),
+  V( 110, 0x5a10, 110, 111, 1 ),
+  V( 111, 0x5522, 112, 109, 0 ),
+  V( 112, 0x59eb, 112, 111, 1 ),
+/*
+ * This last entry is used for fixed probability estimate of 0.5
+ * as recommended in Section 10.3 Table 5 of ITU-T Rec. T.851.
+ */
+  V( 113, 0x5a1d, 113, 113, 0 )
+};
diff --git a/jpeg/jcapimin.c b/jpeg/jcapimin.c
index 54fb8c58c565..20ba9e99bf32 100644
--- a/jpeg/jcapimin.c
+++ b/jpeg/jcapimin.c
@@ -2,6 +2,7 @@
  * jcapimin.c
  *
  * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2003-2010 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -63,14 +64,25 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
 
   cinfo->comp_info = NULL;
 
-  for (i = 0; i < NUM_QUANT_TBLS; i++)
+  for (i = 0; i < NUM_QUANT_TBLS; i++) {
     cinfo->quant_tbl_ptrs[i] = NULL;
+#if JPEG_LIB_VERSION >= 70
+    cinfo->q_scale_factor[i] = 100;
+#endif
+  }
 
   for (i = 0; i < NUM_HUFF_TBLS; i++) {
     cinfo->dc_huff_tbl_ptrs[i] = NULL;
     cinfo->ac_huff_tbl_ptrs[i] = NULL;
   }
 
+#if JPEG_LIB_VERSION >= 80
+  /* Must do it here for emit_dqt in case jpeg_write_tables is used */
+  cinfo->block_size = DCTSIZE;
+  cinfo->natural_order = jpeg_natural_order;
+  cinfo->lim_Se = DCTSIZE2-1;
+#endif
+
   cinfo->script_space = NULL;
 
   cinfo->input_gamma = 1.0;	/* in case application forgets */
diff --git a/jpeg/jcarith.c b/jpeg/jcarith.c
new file mode 100644
index 000000000000..a9ca1c338c62
--- /dev/null
+++ b/jpeg/jcarith.c
@@ -0,0 +1,925 @@
+/*
+ * jcarith.c
+ *
+ * Developed 1997-2009 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains portable arithmetic entropy encoding routines for JPEG
+ * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ *
+ * Both sequential and progressive modes are supported in this single module.
+ *
+ * Suspension is not currently supported in this module.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Expanded entropy encoder object for arithmetic encoding. */
+
+typedef struct {
+  struct jpeg_entropy_encoder pub; /* public fields */
+
+  INT32 c; /* C register, base of coding interval, layout as in sec. D.1.3 */
+  INT32 a;               /* A register, normalized size of coding interval */
+  INT32 sc;        /* counter for stacked 0xFF values which might overflow */
+  INT32 zc;          /* counter for pending 0x00 output values which might *
+                          * be discarded at the end ("Pacman" termination) */
+  int ct;  /* bit shift counter, determines when next byte will be written */
+  int buffer;   /* most recent output byte != 0xFF; set to -1 when empty */
+
+  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
+
+  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+  int next_restart_num;		/* next restart number to write (0-7) */
+
+  /* Pointers to statistics areas (these workspaces have image lifespan) */
+  unsigned char * dc_stats[NUM_ARITH_TBLS]; /* DC_STAT_BINS bytes each; NULL until start_pass allocates */
+  unsigned char * ac_stats[NUM_ARITH_TBLS]; /* AC_STAT_BINS bytes each; NULL until start_pass allocates */
+
+  /* Statistics bin for coding with fixed probability 0.5 */
+  unsigned char fixed_bin[4]; /* jinit_arith_encoder sets only element [0] (to 113) */
+} arith_entropy_encoder;
+
+typedef arith_entropy_encoder * arith_entropy_ptr; /* cinfo->entropy is cast to this type */
+
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
+
+/* NOTE: Uncomment the following #define if you want to use the
+ * given formula for calculating the AC conditioning parameter Kx
+ * for spectral selection progressive coding in section G.1.3.2
+ * of the spec (Kx = Kmin + SRL (8 + Se - Kmin) 4).
+ * Although the spec and P&M authors claim that this "has proven
+ * to give good results for 8 bit precision samples", I'm not
+ * convinced yet that this is really beneficial.
+ * Early tests gave only very marginal compression enhancements
+ * (a few - around 5 or so - bytes even for very large files),
+ * which would turn out rather negative if we'd suppress the
+ * DAC (Define Arithmetic Conditioning) marker segments for
+ * the default parameters in the future.
+ * Note that currently the marker writing module emits 12-byte
+ * DAC segments for a full-component scan in a color image.
+ * This is not worth worrying about IMHO. However, since the
+ * spec defines the default values to be used if the tables
+ * are omitted (unlike Huffman tables, which are required
+ * anyway), one might optimize this behaviour in the future,
+ * and then it would be disadvantageous to use custom tables if
+ * they don't provide sufficient gain to exceed the DAC size.
+ *
+ * On the other hand, I'd consider it as a reasonable result
+ * that the conditioning has no significant influence on the
+ * compression performance. This means that the basic
+ * statistical model is already rather stable.
+ *
+ * Thus, at the moment, we use the default conditioning values
+ * anyway, and do not use the custom formula.
+ *
+#define CALCULATE_SPECTRAL_CONDITIONING
+ */
+
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
+ * We assume that int right shift is unsigned if INT32 right shift is,
+ * which should be safe.  NOTE(review): the fill mask below is built with a
+ * width of 16, so presumably operands must fit in 16 bits -- confirm. */
+
+#ifdef RIGHT_SHIFT_IS_UNSIGNED
+#define ISHIFT_TEMPS	int ishift_temp;
+#define IRIGHT_SHIFT(x,shft)  \
+	((ishift_temp = (x)) < 0 ? \
+	 (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
+	 (ishift_temp >> (shft)))
+#else
+#define ISHIFT_TEMPS
+#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
+#endif
+
+
+LOCAL(void)
+emit_byte (int val, j_compress_ptr cinfo)
+/* Write next output byte; we do not support suspension in this module. */
+{
+  struct jpeg_destination_mgr * dest = cinfo->dest;
+
+  *dest->next_output_byte++ = (JOCTET) val; /* store val as a JOCTET, advance write pointer */
+  if (--dest->free_in_buffer == 0) /* that byte used the last free slot */
+    if (! (*dest->empty_output_buffer) (cinfo)) /* a false return is treated as fatal */
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);
+}
+
+
+/*
+ * Finish up at the end of an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+finish_pass (j_compress_ptr cinfo) /* installed as pub.finish_pass; also called by emit_restart */
+{
+  arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  INT32 temp;
+
+  /* Section D.1.8: Termination of encoding */
+
+  /* Find the e->c in the coding interval with the largest
+   * number of trailing zero bits */
+  if ((temp = (e->a - 1 + e->c) & 0xFFFF0000L) < e->c) /* top of interval with low 16 bits cleared */
+    e->c = temp + 0x8000L; /* rounded value fell below the base: add 0x8000 back */
+  else
+    e->c = temp;
+  /* Send remaining bytes to output */
+  e->c <<= e->ct; /* apply the shifts still outstanding for the current byte */
+  if (e->c & 0xF8000000L) { /* any of bits 27..31 set */
+    /* One final overflow has to be handled */
+    if (e->buffer >= 0) { /* a byte is being held back (buffer is -1 when empty) */
+      if (e->zc)
+	do emit_byte(0x00, cinfo); /* flush pending zero bytes first */
+	while (--e->zc);
+      emit_byte(e->buffer + 1, cinfo); /* held byte with the carry added */
+      if (e->buffer + 1 == 0xFF)
+	emit_byte(0x00, cinfo); /* stuff a zero after an 0xFF data byte */
+    }
+    e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
+    e->sc = 0;
+  } else {
+    if (e->buffer == 0)
+      ++e->zc; /* a held 0x00 is only counted, not written */
+    else if (e->buffer >= 0) {
+      if (e->zc)
+	do emit_byte(0x00, cinfo);
+	while (--e->zc);
+      emit_byte(e->buffer, cinfo);
+    }
+    if (e->sc) { /* stacked 0xFF bytes are written out, each followed by a 0x00 */
+      if (e->zc)
+	do emit_byte(0x00, cinfo);
+	while (--e->zc);
+      do {
+	emit_byte(0xFF, cinfo);
+	emit_byte(0x00, cinfo);
+      } while (--e->sc);
+    }
+  }
+  /* Output final bytes only if they are not 0x00 */
+  if (e->c & 0x7FFF800L) { /* bits 11..26: the two byte positions emitted below */
+    if (e->zc)  /* output final pending zero bytes */
+      do emit_byte(0x00, cinfo);
+      while (--e->zc);
+    emit_byte((e->c >> 19) & 0xFF, cinfo); /* bits 19..26 */
+    if (((e->c >> 19) & 0xFF) == 0xFF)
+      emit_byte(0x00, cinfo);
+    if (e->c & 0x7F800L) { /* bits 11..18; skipped when zero */
+      emit_byte((e->c >> 11) & 0xFF, cinfo);
+      if (((e->c >> 11) & 0xFF) == 0xFF)
+	emit_byte(0x00, cinfo);
+    }
+  }
+}
+
+
+/*
+ * The core arithmetic encoding routine (common in JPEG and JBIG).
+ * This needs to go as fast as possible.
+ * Machine-dependent optimization facilities
+ * are not utilized in this portable implementation.
+ * However, this code should be fairly efficient and
+ * may be a good base for further optimizations anyway.
+ *
+ * Parameter 'val' to be encoded may be 0 or 1 (binary decision).
+ *
+ * Note: I've added full "Pacman" termination support to the
+ * byte output routines, which is equivalent to the optional
+ * Discard_final_zeros procedure (Figure D.15) in the spec.
+ * Thus, we always produce the shortest possible output
+ * stream compliant with the spec (no trailing zero bytes,
+ * except for FF stuffing).
+ *
+ * I've also introduced a new scheme for accessing
+ * the probability estimation state machine table,
+ * derived from Markus Kuhn's JBIG implementation.
+ */
+
+LOCAL(void)
+arith_encode (j_compress_ptr cinfo, unsigned char *st, int val) 
+{ /* Codes one binary decision 'val' using the statistics bin *st and updates that bin. */
+  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register unsigned char nl, nm;
+  register INT32 qe, temp;
+  register int sv;
+
+  /* Fetch values from our compact representation of Table D.2:
+   * Qe values and probability estimation state machine
+   */
+  sv = *st; /* bit 7 = current MPS value, bits 0..6 = index into jpeg_aritab */
+  qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
+  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
+
+  /* Encode & estimation procedures per sections D.1.4 & D.1.5 */
+  e->a -= qe; /* a now holds the MPS sub-interval size, qe the LPS one */
+  if (val != (sv >> 7)) {
+    /* Encode the less probable symbol */
+    if (e->a >= qe) {
+      /* If the interval size (qe) for the less probable symbol (LPS)
+       * is larger than the interval size for the MPS, then exchange
+       * the two symbols for coding efficiency, otherwise code the LPS
+       * as usual: */
+      e->c += e->a;
+      e->a = qe;
+    }
+    *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+  } else {
+    /* Encode the more probable symbol */
+    if (e->a >= 0x8000L)
+      return;  /* A >= 0x8000 -> ready, no renormalization required */
+    if (e->a < qe) {
+      /* If the interval size (qe) for the less probable symbol (LPS)
+       * is larger than the interval size for the MPS, then exchange
+       * the two symbols for coding efficiency: */
+      e->c += e->a;
+      e->a = qe;
+    }
+    *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+  }
+
+  /* Renormalization & data output per section D.1.6 */
+  do { /* shift a and c left together until a >= 0x8000 */
+    e->a <<= 1;
+    e->c <<= 1;
+    if (--e->ct == 0) {
+      /* Another byte is ready for output */
+      temp = e->c >> 19; /* candidate byte; a value above 0xFF means a carry occurred */
+      if (temp > 0xFF) {
+	/* Handle overflow over all stacked 0xFF bytes */
+	if (e->buffer >= 0) { /* a byte is being held back (buffer is -1 when empty) */
+	  if (e->zc)
+	    do emit_byte(0x00, cinfo);
+	    while (--e->zc);
+	  emit_byte(e->buffer + 1, cinfo); /* held byte with the carry added */
+	  if (e->buffer + 1 == 0xFF)
+	    emit_byte(0x00, cinfo); /* stuff a zero after an 0xFF data byte */
+	}
+	e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
+	e->sc = 0;
+	/* Note: The 3 spacer bits in the C register guarantee
+	 * that the new buffer byte can't be 0xFF here
+	 * (see page 160 in the P&M JPEG book). */
+	e->buffer = temp & 0xFF;  /* new output byte, might overflow later */
+      } else if (temp == 0xFF) {
+	++e->sc;  /* stack 0xFF byte (which might overflow later) */
+      } else {
+	/* Output all stacked 0xFF bytes, they will not overflow any more */
+	if (e->buffer == 0)
+	  ++e->zc; /* a held 0x00 is only counted, not written yet */
+	else if (e->buffer >= 0) {
+	  if (e->zc)
+	    do emit_byte(0x00, cinfo);
+	    while (--e->zc);
+	  emit_byte(e->buffer, cinfo);
+	}
+	if (e->sc) {
+	  if (e->zc)
+	    do emit_byte(0x00, cinfo);
+	    while (--e->zc);
+	  do {
+	    emit_byte(0xFF, cinfo);
+	    emit_byte(0x00, cinfo);
+	  } while (--e->sc);
+	}
+	e->buffer = temp & 0xFF;  /* new output byte (can still overflow) */
+      }
+      e->c &= 0x7FFFFL; /* keep only the low 19 bits of c */
+      e->ct += 8; /* the next byte is due after 8 more shifts */
+    }
+  } while (e->a < 0x8000L);
+}
+
+
+/*
+ * Emit a restart marker & resynchronize predictions.
+ */
+
+LOCAL(void)
+emit_restart (j_compress_ptr cinfo, int restart_num)
+{ /* Terminates the current coded segment, writes marker JPEG_RST0 + restart_num, resets coder state. */
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci;
+  jpeg_component_info * compptr;
+
+  finish_pass(cinfo); /* flush everything the coder is still holding */
+
+  emit_byte(0xFF, cinfo); /* marker prefix byte */
+  emit_byte(JPEG_RST0 + restart_num, cinfo);
+
+  /* Re-initialize statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    /* DC needs no table for refinement scan */
+    if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+      /* Reset DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    /* AC needs no table when not present */
+    if (cinfo->progressive_mode == 0 || cinfo->Se) {
+      MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+    }
+  }
+
+  /* Reset arithmetic encoding variables */
+  entropy->c = 0; /* same initial values as start_pass uses */
+  entropy->a = 0x10000L;
+  entropy->sc = 0;
+  entropy->zc = 0;
+  entropy->ct = 11;
+  entropy->buffer = -1;  /* empty */
+}
+
+
+/*
+ * MCU encoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{ /* Codes the point-transformed DC difference of every block in the MCU; always returns TRUE. */
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl;
+  int v, v2, m;
+  ISHIFT_TEMPS
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7; /* restart numbers wrap within 0..7 */
+    }
+    entropy->restarts_to_go--; /* count this MCU against the interval */
+  }
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn]; /* index of this block's component within the scan */
+    tbl = cinfo->cur_comp_info[ci]->dc_tbl_no;
+
+    /* Compute the DC value after the required point transform by Al.
+     * This is simply an arithmetic right shift.
+     */
+    m = IRIGHT_SHIFT((int) ((*block)[0]), cinfo->Al);
+
+    /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.4: Encode_DC_DIFF */
+    if ((v = m - entropy->last_dc_val[ci]) == 0) { /* v = difference from the previous DC value */
+      arith_encode(cinfo, st, 0);
+      entropy->dc_context[ci] = 0;	/* zero diff category */
+    } else {
+      entropy->last_dc_val[ci] = m; /* prediction only needs updating when the value changed */
+      arith_encode(cinfo, st, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+	arith_encode(cinfo, st + 1, 0);	/* Table F.4: SS = S0 + 1 */
+	st += 2;			/* Table F.4: SP = S0 + 2 */
+	entropy->dc_context[ci] = 4;	/* small positive diff category */
+      } else {
+	v = -v; /* from here on v is the magnitude */
+	arith_encode(cinfo, st + 1, 1);	/* Table F.4: SS = S0 + 1 */
+	st += 3;			/* Table F.4: SN = S0 + 3 */
+	entropy->dc_context[ci] = 8;	/* small negative diff category */
+      }
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) { /* v becomes magnitude - 1; zero means magnitude 1 */
+	arith_encode(cinfo, st, 1);
+	m = 1;
+	v2 = v;
+	st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+	while (v2 >>= 1) { /* one '1' decision per further doubling; m tracks the top bit of v */
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st += 1;
+	}
+      }
+      arith_encode(cinfo, st, 0); /* terminating '0' of the category code */
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;	/* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] += 8;	/* large diff category */
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14; /* bin for the magnitude bits sits 14 past the last category bin used */
+      while (m >>= 1) /* bits of v below its top bit, most significant first */
+	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{ /* Codes coefficients Ss..Se (after point transform by Al) of block MCU_data[0]; always returns TRUE. */
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, k, ke;
+  int v, v2, m;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7; /* restart numbers wrap within 0..7 */
+    }
+    entropy->restarts_to_go--; /* count this MCU against the interval */
+  }
+
+  /* Encode the MCU data block */
+  block = MCU_data[0]; /* only the first block and first scan component are used here */
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
+
+  /* Establish EOB (end-of-block) index */
+  for (ke = cinfo->Se; ke > 0; ke--) /* walk back from Se to the last coef still nonzero after the shift */
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value.
+     */
+    if ((v = (*block)[jpeg_natural_order[ke]]) >= 0) {
+      if (v >>= cinfo->Al) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Al) break;
+    }
+
+  /* Figure F.5: Encode_AC_Coefficients */
+  for (k = cinfo->Ss; k <= ke; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1); /* three bins per index k: st = EOB, st+1 = zero/nonzero, st+2 = magnitude */
+    arith_encode(cinfo, st, 0);		/* EOB decision */
+    for (;;) { /* skip zero coefficients; k <= ke guarantees a nonzero one is reached */
+      if ((v = (*block)[jpeg_natural_order[k]]) >= 0) {
+	if (v >>= cinfo->Al) {
+	  arith_encode(cinfo, st + 1, 1);
+	  arith_encode(cinfo, entropy->fixed_bin, 0); /* sign: 0 for positive */
+	  break;
+	}
+      } else {
+	v = -v;
+	if (v >>= cinfo->Al) {
+	  arith_encode(cinfo, st + 1, 1);
+	  arith_encode(cinfo, entropy->fixed_bin, 1); /* sign: 1 for negative */
+	  break;
+	}
+      }
+      arith_encode(cinfo, st + 1, 0); st += 3; k++; /* coefficient is zero: code that, move to the next index */
+    }
+    st += 2;
+    /* Figure F.8: Encoding the magnitude category of v */
+    m = 0;
+    if (v -= 1) { /* v becomes magnitude - 1; zero means magnitude 1 */
+      arith_encode(cinfo, st, 1);
+      m = 1;
+      v2 = v;
+      if (v2 >>= 1) {
+	arith_encode(cinfo, st, 1);
+	m <<= 1;
+	st = entropy->ac_stats[tbl] +
+	     (k <= cinfo->arith_ac_K[tbl] ? 189 : 217); /* bin offset chosen by comparing k with arith_ac_K */
+	while (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st += 1;
+	}
+      }
+    }
+    arith_encode(cinfo, st, 0); /* terminating '0' of the category code */
+    /* Figure F.9: Encoding the magnitude bit pattern of v */
+    st += 14;
+    while (m >>= 1) /* bits of v below its top bit, most significant first */
+      arith_encode(cinfo, st, (m & v) ? 1 : 0);
+  }
+  /* Encode EOB decision only if k <= cinfo->Se */
+  if (k <= cinfo->Se) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    arith_encode(cinfo, st, 1);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for DC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{ /* Codes bit Al of each block's DC coefficient with the fixed-probability bin; always returns TRUE. */
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  unsigned char *st;
+  int Al, blkn;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7; /* restart numbers wrap within 0..7 */
+    }
+    entropy->restarts_to_go--; /* count this MCU against the interval */
+  }
+
+  st = entropy->fixed_bin;	/* use fixed probability estimation */
+  Al = cinfo->Al; /* local copy of the bit position to emit */
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    /* We simply emit the Al'th bit of the DC coefficient value. */
+    arith_encode(cinfo, st, (MCU_data[blkn][0][0] >> Al) & 1);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{ /* Codes one more bit (position Al) of coefficients Ss..Se of block MCU_data[0]; always returns TRUE. */
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, k, ke, kex;
+  int v;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7; /* restart numbers wrap within 0..7 */
+    }
+    entropy->restarts_to_go--; /* count this MCU against the interval */
+  }
+
+  /* Encode the MCU data block */
+  block = MCU_data[0]; /* only the first block and first scan component are used here */
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Section G.1.3.3: Encoding of AC coefficients */
+
+  /* Establish EOB (end-of-block) index */
+  for (ke = cinfo->Se; ke > 0; ke--) /* last coef still nonzero after shifting by Al */
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value.
+     */
+    if ((v = (*block)[jpeg_natural_order[ke]]) >= 0) {
+      if (v >>= cinfo->Al) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Al) break;
+    }
+
+  /* Establish EOBx (previous stage end-of-block) index */
+  for (kex = ke; kex > 0; kex--) /* same backward scan, but shifting by Ah instead of Al */
+    if ((v = (*block)[jpeg_natural_order[kex]]) >= 0) {
+      if (v >>= cinfo->Ah) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Ah) break;
+    }
+
+  /* Figure G.10: Encode_AC_Coefficients_SA */
+  for (k = cinfo->Ss; k <= ke; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1); /* three bins per index k: st, st+1, st+2 */
+    if (k > kex) /* no EOB decision is coded at or before the previous-stage EOB index */
+      arith_encode(cinfo, st, 0);	/* EOB decision */
+    for (;;) { /* skip coefficients that are zero after the shift; k <= ke guarantees termination */
+      if ((v = (*block)[jpeg_natural_order[k]]) >= 0) {
+	if (v >>= cinfo->Al) {
+	  if (v >> 1)			/* previously nonzero coef */
+	    arith_encode(cinfo, st + 2, (v & 1)); /* code only the new low bit */
+	  else {			/* newly nonzero coef */
+	    arith_encode(cinfo, st + 1, 1);
+	    arith_encode(cinfo, entropy->fixed_bin, 0); /* sign: 0 for positive */
+	  }
+	  break;
+	}
+      } else {
+	v = -v;
+	if (v >>= cinfo->Al) {
+	  if (v >> 1)			/* previously nonzero coef */
+	    arith_encode(cinfo, st + 2, (v & 1)); /* code only the new low bit */
+	  else {			/* newly nonzero coef */
+	    arith_encode(cinfo, st + 1, 1);
+	    arith_encode(cinfo, entropy->fixed_bin, 1); /* sign: 1 for negative */
+	  }
+	  break;
+	}
+      }
+      arith_encode(cinfo, st + 1, 0); st += 3; k++; /* still zero: code that, move to the next index */
+    }
+  }
+  /* Encode EOB decision only if k <= cinfo->Se */
+  if (k <= cinfo->Se) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    arith_encode(cinfo, st, 1);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Encode and output one MCU's worth of arithmetic-compressed coefficients.
+ */
+
+METHODDEF(boolean)
+encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{ /* Non-progressive path: codes the DC difference and all 63 AC coefs of every block; always returns TRUE. */
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * compptr;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, k, ke;
+  int v, v2, m;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7; /* restart numbers wrap within 0..7 */
+    }
+    entropy->restarts_to_go--; /* count this MCU against the interval */
+  }
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn]; /* index of this block's component within the scan */
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
+
+    tbl = compptr->dc_tbl_no;
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.4: Encode_DC_DIFF */
+    if ((v = (*block)[0] - entropy->last_dc_val[ci]) == 0) { /* v = difference from the previous DC value */
+      arith_encode(cinfo, st, 0);
+      entropy->dc_context[ci] = 0;	/* zero diff category */
+    } else {
+      entropy->last_dc_val[ci] = (*block)[0];
+      arith_encode(cinfo, st, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+	arith_encode(cinfo, st + 1, 0);	/* Table F.4: SS = S0 + 1 */
+	st += 2;			/* Table F.4: SP = S0 + 2 */
+	entropy->dc_context[ci] = 4;	/* small positive diff category */
+      } else {
+	v = -v; /* from here on v is the magnitude */
+	arith_encode(cinfo, st + 1, 1);	/* Table F.4: SS = S0 + 1 */
+	st += 3;			/* Table F.4: SN = S0 + 3 */
+	entropy->dc_context[ci] = 8;	/* small negative diff category */
+      }
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) { /* v becomes magnitude - 1; zero means magnitude 1 */
+	arith_encode(cinfo, st, 1);
+	m = 1;
+	v2 = v;
+	st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+	while (v2 >>= 1) { /* one '1' decision per further doubling; m tracks the top bit of v */
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st += 1;
+	}
+      }
+      arith_encode(cinfo, st, 0); /* terminating '0' of the category code */
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;	/* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] += 8;	/* large diff category */
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1) /* bits of v below its top bit, most significant first */
+	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+
+    /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
+
+    tbl = compptr->ac_tbl_no;
+
+    /* Establish EOB (end-of-block) index */
+    for (ke = DCTSIZE2 - 1; ke > 0; ke--) /* last nonzero coef in jpeg_natural_order scan order; 0 if none */
+      if ((*block)[jpeg_natural_order[ke]]) break;
+
+    /* Figure F.5: Encode_AC_Coefficients */
+    for (k = 1; k <= ke; k++) {
+      st = entropy->ac_stats[tbl] + 3 * (k - 1); /* three bins per index k: st = EOB, st+1 = zero/nonzero, st+2 = magnitude */
+      arith_encode(cinfo, st, 0);	/* EOB decision */
+      while ((v = (*block)[jpeg_natural_order[k]]) == 0) { /* k <= ke guarantees a nonzero coef is reached */
+	arith_encode(cinfo, st + 1, 0); st += 3; k++;
+      }
+      arith_encode(cinfo, st + 1, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+	arith_encode(cinfo, entropy->fixed_bin, 0); /* sign: 0 for positive */
+      } else {
+	v = -v;
+	arith_encode(cinfo, entropy->fixed_bin, 1); /* sign: 1 for negative */
+      }
+      st += 2;
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) { /* v becomes magnitude - 1; zero means magnitude 1 */
+	arith_encode(cinfo, st, 1);
+	m = 1;
+	v2 = v;
+	if (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st = entropy->ac_stats[tbl] +
+	       (k <= cinfo->arith_ac_K[tbl] ? 189 : 217); /* bin offset chosen by comparing k with arith_ac_K */
+	  while (v2 >>= 1) {
+	    arith_encode(cinfo, st, 1);
+	    m <<= 1;
+	    st += 1;
+	  }
+	}
+      }
+      arith_encode(cinfo, st, 0); /* terminating '0' of the category code */
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1) /* bits of v below its top bit, most significant first */
+	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+    /* Encode EOB decision only if k <= DCTSIZE2 - 1 */
+    if (k <= DCTSIZE2 - 1) {
+      st = entropy->ac_stats[tbl] + 3 * (k - 1);
+      arith_encode(cinfo, st, 1);
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Initialize for an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass (j_compress_ptr cinfo, boolean gather_statistics)
+{ /* Picks the encode_mcu routine for this scan, prepares its statistics areas, resets coder state. */
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci, tbl;
+  jpeg_component_info * compptr;
+
+  if (gather_statistics)
+    /* Make sure to avoid that in the master control logic!
+     * We are fully adaptive here and need no extra
+     * statistics gathering pass!
+     */
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+
+  /* We assume jcmaster.c already validated the progressive scan parameters. */
+
+  /* Select execution routines */
+  if (cinfo->progressive_mode) {
+    if (cinfo->Ah == 0) { /* Ah == 0: first pass for this coefficient band */
+      if (cinfo->Ss == 0)
+	entropy->pub.encode_mcu = encode_mcu_DC_first;
+      else
+	entropy->pub.encode_mcu = encode_mcu_AC_first;
+    } else { /* Ah != 0: refinement pass */
+      if (cinfo->Ss == 0)
+	entropy->pub.encode_mcu = encode_mcu_DC_refine;
+      else
+	entropy->pub.encode_mcu = encode_mcu_AC_refine;
+    }
+  } else
+    entropy->pub.encode_mcu = encode_mcu;
+
+  /* Allocate & initialize requested statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    /* DC needs no table for refinement scan */
+    if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      tbl = compptr->dc_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS) /* reject table numbers outside the dc_stats array */
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->dc_stats[tbl] == NULL) /* allocate on first use, reuse afterwards */
+	entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+      /* Initialize DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    /* AC needs no table when not present */
+    if (cinfo->progressive_mode == 0 || cinfo->Se) {
+      tbl = compptr->ac_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS) /* reject table numbers outside the ac_stats array */
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->ac_stats[tbl] == NULL) /* allocate on first use, reuse afterwards */
+	entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+#ifdef CALCULATE_SPECTRAL_CONDITIONING
+      if (cinfo->progressive_mode)
+	/* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
+	cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4);
+#endif
+    }
+  }
+
+  /* Initialize arithmetic encoding variables */
+  entropy->c = 0; /* emit_restart resets to these same values */
+  entropy->a = 0x10000L;
+  entropy->sc = 0;
+  entropy->zc = 0;
+  entropy->ct = 11;
+  entropy->buffer = -1;  /* empty */
+
+  /* Initialize restart stuff */
+  entropy->restarts_to_go = cinfo->restart_interval;
+  entropy->next_restart_num = 0;
+}
+
+
+/*
+ * Module initialization routine for arithmetic entropy encoding.
+ */
+
+GLOBAL(void)
+jinit_arith_encoder (j_compress_ptr cinfo)
+{ /* Allocates the encoder object from the JPOOL_IMAGE pool and installs it as cinfo->entropy. */
+  arith_entropy_ptr entropy;
+  int i;
+
+  entropy = (arith_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				SIZEOF(arith_entropy_encoder));
+  cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+  entropy->pub.start_pass = start_pass; /* pub.encode_mcu is chosen later, in start_pass */
+  entropy->pub.finish_pass = finish_pass;
+
+  /* Mark tables unallocated */
+  for (i = 0; i < NUM_ARITH_TBLS; i++) {
+    entropy->dc_stats[i] = NULL; /* start_pass allocates entries that are still NULL */
+    entropy->ac_stats[i] = NULL;
+  }
+
+  /* Initialize index for fixed probability estimation */
+  entropy->fixed_bin[0] = 113; /* elements [1]..[3] are left unset */
+}
diff --git a/jpeg/jccolor.c b/jpeg/jccolor.c
index 0a8a4b5d13c3..73969901dcd8 100644
--- a/jpeg/jccolor.c
+++ b/jpeg/jccolor.c
@@ -2,6 +2,8 @@
  * jccolor.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -11,6 +13,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"
 
 
 /* Private subobject */
@@ -78,6 +81,74 @@ typedef my_color_converter * my_cconvert_ptr;
 #define TABLE_SIZE	(8*(MAXJSAMPLE+1))
 
 
+#if BITS_IN_JSAMPLE == 8
+
+static const unsigned char red_lut[256] = { /* values look like round(0.299 * i) -- TODO confirm */
+  0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
+  5 , 5 , 5 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 8 , 8 , 8 , 9 , 9 , 9 ,
+  10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14,
+  14, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19,
+  19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 24,
+  24, 24, 25, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 28,
+  29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33,
+  33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38,
+  38, 39, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 42, 43,
+  43, 43, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48,
+  48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52,
+  53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 56, 56, 56, 57, 57, 57,
+  57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 60, 61, 61, 61, 62, 62,
+  62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 67,
+  67, 67, 68, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71,
+  72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 76, 76, 76
+};
+
+static const unsigned char green_lut[256] = { /* values look like round(0.587 * i) -- TODO confirm */
+  0  , 1  , 1  , 2  , 2  , 3  , 4  , 4  , 5  , 5  , 6  , 6  ,
+  7  , 8  , 8  , 9  , 9  , 10 , 11 , 11 , 12 , 12 , 13 , 14 ,
+  14 , 15 , 15 , 16 , 16 , 17 , 18 , 18 , 19 , 19 , 20 , 21 ,
+  21 , 22 , 22 , 23 , 23 , 24 , 25 , 25 , 26 , 26 , 27 , 28 ,
+  28 , 29 , 29 , 30 , 31 , 31 , 32 , 32 , 33 , 33 , 34 , 35 ,
+  35 , 36 , 36 , 37 , 38 , 38 , 39 , 39 , 40 , 41 , 41 , 42 ,
+  42 , 43 , 43 , 44 , 45 , 45 , 46 , 46 , 47 , 48 , 48 , 49 ,
+  49 , 50 , 50 , 51 , 52 , 52 , 53 , 53 , 54 , 55 , 55 , 56 ,
+  56 , 57 , 58 , 58 , 59 , 59 , 60 , 60 , 61 , 62 , 62 , 63 ,
+  63 , 64 , 65 , 65 , 66 , 66 , 67 , 68 , 68 , 69 , 69 , 70 ,
+  70 , 71 , 72 , 72 , 73 , 73 , 74 , 75 , 75 , 76 , 76 , 77 ,
+  77 , 78 , 79 , 79 , 80 , 80 , 81 , 82 , 82 , 83 , 83 , 84 ,
+  85 , 85 , 86 , 86 , 87 , 87 , 88 , 89 , 89 , 90 , 90 , 91 ,
+  92 , 92 , 93 , 93 , 94 , 95 , 95 , 96 , 96 , 97 , 97 , 98 ,
+  99 , 99 , 100, 100, 101, 102, 102, 103, 103, 104, 104, 105,
+  106, 106, 107, 107, 108, 109, 109, 110, 110, 111, 112, 112,
+  113, 113, 114, 114, 115, 116, 116, 117, 117, 118, 119, 119,
+  120, 120, 121, 122, 122, 123, 123, 124, 124, 125, 126, 126,
+  127, 127, 128, 129, 129, 130, 130, 131, 131, 132, 133, 133,
+  134, 134, 135, 136, 136, 137, 137, 138, 139, 139, 140, 140,
+  141, 141, 142, 143, 143, 144, 144, 145, 146, 146, 147, 147,
+  148, 149, 149, 150
+};
+
+static const unsigned char blue_lut[256] = { /* values look like round(0.114 * i) -- TODO confirm */
+  0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 ,
+  2 , 2 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 ,
+  4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 ,
+  5 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 7 , 7 ,
+  7 , 7 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 9 ,
+  9 , 9 , 9 , 9 , 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
+  11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
+  18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
+  20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+  22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24,
+  24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+  26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27,
+  27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29
+};
+
+#endif
+
+
 /*
  * Initialize for RGB->YCC colorspace conversion.
  */
@@ -146,10 +217,10 @@ rgb_ycc_convert (j_compress_ptr cinfo,
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
-      inptr += RGB_PIXELSIZE;
+      r = GETJSAMPLE(inptr[rgb_red[cinfo->in_color_space]]);
+      g = GETJSAMPLE(inptr[rgb_green[cinfo->in_color_space]]);
+      b = GETJSAMPLE(inptr[rgb_blue[cinfo->in_color_space]]);
+      inptr += rgb_pixelsize[cinfo->in_color_space];
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
        * Hence the value being shifted is never negative, and we don't
@@ -187,27 +258,35 @@ rgb_gray_convert (j_compress_ptr cinfo,
 		  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
 		  JDIMENSION output_row, int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  register int r, g, b;
+  #if BITS_IN_JSAMPLE != 8  /* NOTE(review): this patch removes the local cconvert declaration, so the use below looks undeclared here -- confirm */
   register INT32 * ctab = cconvert->rgb_ycc_tab;
+  #endif  /* BITS_IN_JSAMPLE != 8 */
   register JSAMPROW inptr;
   register JSAMPROW outptr;
-  register JDIMENSION col;
+  JSAMPLE *maxoutptr;
   JDIMENSION num_cols = cinfo->image_width;
+  int rindex = rgb_red[cinfo->in_color_space];
+  int gindex = rgb_green[cinfo->in_color_space];
+  int bindex = rgb_blue[cinfo->in_color_space];
+  int rgbstride = rgb_pixelsize[cinfo->in_color_space];
 
   while (--num_rows >= 0) {
     inptr = *input_buf++;
     outptr = output_buf[0][output_row];
+    maxoutptr = &outptr[num_cols];
     output_row++;
-    for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
-      inptr += RGB_PIXELSIZE;
+    for (; outptr < maxoutptr; outptr++, inptr += rgbstride) {
       /* Y */
-      outptr[col] = (JSAMPLE)
-		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-		 >> SCALEBITS);
+      #if BITS_IN_JSAMPLE == 8
+      *outptr = red_lut[inptr[rindex]] + green_lut[inptr[gindex]]
+	    + blue_lut[inptr[bindex]];
+      #else
+      *outptr = (JSAMPLE)
+	    ((ctab[GETJSAMPLE(inptr[rindex])+R_Y_OFF]
+	     + ctab[GETJSAMPLE(inptr[gindex])+G_Y_OFF]
+	     + ctab[GETJSAMPLE(inptr[bindex])+B_Y_OFF])
+	     >> SCALEBITS);
+      #endif
     }
   }
 }
@@ -368,11 +447,15 @@ jinit_color_converter (j_compress_ptr cinfo)
     break;
 
   case JCS_RGB:
-#if RGB_PIXELSIZE != 3
-    if (cinfo->input_components != RGB_PIXELSIZE)
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+    if (cinfo->input_components != rgb_pixelsize[cinfo->in_color_space])
       ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
     break;
-#endif /* else share code with YCbCr */
 
   case JCS_YCbCr:
     if (cinfo->input_components != 3)
@@ -398,7 +481,13 @@ jinit_color_converter (j_compress_ptr cinfo)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
     if (cinfo->in_color_space == JCS_GRAYSCALE)
       cconvert->pub.color_convert = grayscale_convert;
-    else if (cinfo->in_color_space == JCS_RGB) {
+    else if (cinfo->in_color_space == JCS_RGB ||
+             cinfo->in_color_space == JCS_EXT_RGB ||
+             cinfo->in_color_space == JCS_EXT_RGBX ||
+             cinfo->in_color_space == JCS_EXT_BGR ||
+             cinfo->in_color_space == JCS_EXT_BGRX ||
+             cinfo->in_color_space == JCS_EXT_XBGR ||
+             cinfo->in_color_space == JCS_EXT_XRGB) {
       cconvert->pub.start_pass = rgb_ycc_start;
       cconvert->pub.color_convert = rgb_gray_convert;
     } else if (cinfo->in_color_space == JCS_YCbCr)
@@ -408,9 +497,16 @@ jinit_color_converter (j_compress_ptr cinfo)
     break;
 
   case JCS_RGB:
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
     if (cinfo->num_components != 3)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == JCS_RGB && RGB_PIXELSIZE == 3)
+    if (cinfo->in_color_space == cinfo->jpeg_color_space &&
+      rgb_pixelsize[cinfo->in_color_space] == 3)
       cconvert->pub.color_convert = null_convert;
     else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
@@ -419,9 +515,19 @@ jinit_color_converter (j_compress_ptr cinfo)
   case JCS_YCbCr:
     if (cinfo->num_components != 3)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == JCS_RGB) {
-      cconvert->pub.start_pass = rgb_ycc_start;
-      cconvert->pub.color_convert = rgb_ycc_convert;
+    if (cinfo->in_color_space == JCS_RGB ||
+        cinfo->in_color_space == JCS_EXT_RGB ||
+        cinfo->in_color_space == JCS_EXT_RGBX ||
+        cinfo->in_color_space == JCS_EXT_BGR ||
+        cinfo->in_color_space == JCS_EXT_BGRX ||
+        cinfo->in_color_space == JCS_EXT_XBGR ||
+        cinfo->in_color_space == JCS_EXT_XRGB) {
+      if (jsimd_can_rgb_ycc())
+        cconvert->pub.color_convert = jsimd_rgb_ycc_convert;
+      else {
+        cconvert->pub.start_pass = rgb_ycc_start;
+        cconvert->pub.color_convert = rgb_ycc_convert;
+      }
     } else if (cinfo->in_color_space == JCS_YCbCr)
       cconvert->pub.color_convert = null_convert;
     else
diff --git a/jpeg/jcdctmgr.c b/jpeg/jcdctmgr.c
index 61fa79b9e68b..711f9dab6290 100644
--- a/jpeg/jcdctmgr.c
+++ b/jpeg/jcdctmgr.c
@@ -2,6 +2,9 @@
  * jcdctmgr.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011 D. R. Commander
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -15,15 +18,37 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jsimddct.h"
 
 
 /* Private subobject for this module */
 
+typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
+typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
+
+typedef JMETHOD(void, convsamp_method_ptr,
+                (JSAMPARRAY sample_data, JDIMENSION start_col,
+                 DCTELEM * workspace));
+typedef JMETHOD(void, float_convsamp_method_ptr,
+                (JSAMPARRAY sample_data, JDIMENSION start_col,
+                 FAST_FLOAT *workspace));
+
+typedef JMETHOD(void, quantize_method_ptr,
+                (JCOEFPTR coef_block, DCTELEM * divisors,
+                 DCTELEM * workspace));
+typedef JMETHOD(void, float_quantize_method_ptr,
+                (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                 FAST_FLOAT * workspace));
+
+METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
+
 typedef struct {
   struct jpeg_forward_dct pub;	/* public fields */
 
   /* Pointer to the DCT routine actually in use */
-  forward_DCT_method_ptr do_dct;
+  forward_DCT_method_ptr dct;
+  convsamp_method_ptr convsamp;
+  quantize_method_ptr quantize;
 
   /* The actual post-DCT divisors --- not identical to the quant table
    * entries, because of scaling (especially for an unnormalized DCT).
@@ -31,10 +56,16 @@ typedef struct {
    */
   DCTELEM * divisors[NUM_QUANT_TBLS];
 
+  /* work area for FDCT subroutine */
+  DCTELEM * workspace;
+
 #ifdef DCT_FLOAT_SUPPORTED
   /* Same as above for the floating-point case. */
-  float_DCT_method_ptr do_float_dct;
+  float_DCT_method_ptr float_dct;
+  float_convsamp_method_ptr float_convsamp;
+  float_quantize_method_ptr float_quantize;
   FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
+  FAST_FLOAT * float_workspace;
 #endif
 } my_fdct_controller;
 
@@ -42,6 +73,131 @@ typedef my_fdct_controller * my_fdct_ptr;
 
 
 /*
+ * Find the highest bit in an integer through binary search.
+ */
+LOCAL(int)
+flss (UINT16 val)  /* returns the 1-based position of the highest set bit, or 0 if val == 0 */
+{
+  int bit;
+
+  bit = 16;  /* assume the top bit is set; each step below lowers this */
+
+  if (!val)
+    return 0;
+
+  if (!(val & 0xff00)) {  /* upper 8 bits clear: move the low byte up */
+    bit -= 8;
+    val <<= 8;
+  }
+  if (!(val & 0xf000)) {  /* upper 4 bits clear */
+    bit -= 4;
+    val <<= 4;
+  }
+  if (!(val & 0xc000)) {  /* upper 2 bits clear */
+    bit -= 2;
+    val <<= 2;
+  }
+  if (!(val & 0x8000)) {  /* top bit clear */
+    bit -= 1;
+    val <<= 1;  /* val is not read again after this */
+  }
+
+  return bit;
+}
+
+/*
+ * Compute values to do a division using reciprocal.
+ *
+ * This implementation is based on an algorithm described in
+ *   "How to optimize for the Pentium family of microprocessors"
+ *   (http://www.agner.org/assem/).
+ * More information about the basic algorithm can be found in
+ * the paper "Integer Division Using Reciprocals" by Robert Alverson.
+ *
+ * The basic idea is to replace x/d by x * d^-1. In order to store
+ * d^-1 with enough precision we shift it left a few places. It turns
+ * out that this algorithm gives just enough precision, and also fits
+ * into DCTELEM:
+ *
+ *   b = (the number of significant bits in divisor) - 1
+ *   r = (word size) + b
+ *   f = 2^r / divisor
+ *
+ * f will not be an integer for most cases, so we need to compensate
+ * for the rounding error introduced:
+ *
+ *   no fractional part:
+ *
+ *       result = input >> r
+ *
+ *   fractional part of f < 0.5:
+ *
+ *       round f down to nearest integer
+ *       result = ((input + 1) * f) >> r
+ *
+ *   fractional part of f > 0.5:
+ *
+ *       round f up to nearest integer
+ *       result = (input * f) >> r
+ *
+ * This is the original algorithm that gives truncated results. But we
+ * want properly rounded results, so we replace "input" with
+ * "input + divisor/2".
+ *
+ * In order to allow SIMD implementations we also tweak the values to
+ * allow the same calculation to be made at all times:
+ * 
+ *   dctbl[0] = f rounded to nearest integer
+ *   dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
+ *   dctbl[2] = 1 << ((word size) * 2 - r)
+ *   dctbl[3] = r - (word size)
+ *
+ * dctbl[2] is for stupid instruction sets where the shift operation
+ * isn't member wise (e.g. MMX).
+ *
+ * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
+ * is that most SIMD implementations have a "multiply and store top
+ * half" operation.
+ *
+ * Lastly, we store each of the values in their own table instead
+ * of in a consecutive manner, yet again in order to allow SIMD
+ * routines.
+ */
+LOCAL(int)
+compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)  /* fills 4 entries of dtbl; returns 0 if r <= 16, else 1 */
+{
+  UDCTELEM2 fq, fr;  /* quotient and remainder of 2^r / divisor */
+  UDCTELEM c;        /* value stored as the correction/rounding term */
+  int b, r;
+
+  b = flss(divisor) - 1;  /* NOTE(review): divisor == 0 gives b == -1 and a division by zero below; presumably never passed -- confirm */
+  r  = sizeof(DCTELEM) * 8 + b;
+
+  fq = ((UDCTELEM2)1 << r) / divisor;
+  fr = ((UDCTELEM2)1 << r) % divisor;
+
+  c = divisor / 2; /* for rounding */
+
+  if (fr == 0) { /* divisor is power of two */
+    /* fq will be one bit too large to fit in DCTELEM, so adjust */
+    fq >>= 1;
+    r--;
+  } else if (fr <= (divisor / 2)) { /* fractional part is < 0.5 */
+    c++;
+  } else { /* fractional part is > 0.5 */
+    fq++;
+  }
+
+  dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;      /* reciprocal */
+  dtbl[DCTSIZE2 * 1] = (DCTELEM) c;       /* correction + roundfactor */
+  dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r));  /* scale */
+  dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift; negative when r is below the word size */
+
+  if(r <= 16) return 0;  /* on 0, start_pass_fdctmgr replaces jsimd_quantize with quantize */
+  else return 1;
+}
+
+/*
  * Initialize for a processing pass.
  * Verify that all referenced Q-tables are present, and set up
  * the divisor table for each one.
@@ -78,11 +234,13 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
       if (fdct->divisors[qtblno] == NULL) {
 	fdct->divisors[qtblno] = (DCTELEM *)
 	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      DCTSIZE2 * SIZEOF(DCTELEM));
+				      (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
       }
       dtbl = fdct->divisors[qtblno];
       for (i = 0; i < DCTSIZE2; i++) {
-	dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
+	if(!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i])
+	  && fdct->quantize == jsimd_quantize)
+	  fdct->quantize = quantize;
       }
       break;
 #endif
@@ -112,14 +270,16 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 	if (fdct->divisors[qtblno] == NULL) {
 	  fdct->divisors[qtblno] = (DCTELEM *)
 	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-					DCTSIZE2 * SIZEOF(DCTELEM));
+					(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
 	}
 	dtbl = fdct->divisors[qtblno];
 	for (i = 0; i < DCTSIZE2; i++) {
-	  dtbl[i] = (DCTELEM)
+	  if(!compute_reciprocal(
 	    DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
 				  (INT32) aanscales[i]),
-		    CONST_BITS-3);
+		    CONST_BITS-3), &dtbl[i])
+	    && fdct->quantize == jsimd_quantize)
+	    fdct->quantize = quantize;
 	}
       }
       break;
@@ -169,6 +329,77 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 
 
 /*
+ * Load data into workspace, applying unsigned->signed conversion.
+ */
+
+METHODDEF(void)
+convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
+{
+  register DCTELEM *workspaceptr;  /* write cursor into workspace */
+  register JSAMPROW elemptr;       /* read cursor into the current sample row */
+  register int elemr;
+
+  workspaceptr = workspace;
+  for (elemr = 0; elemr < DCTSIZE; elemr++) {  /* one pass per row of the block */
+    elemptr = sample_data[elemr] + start_col;  /* row elemr, beginning at column start_col */
+
+#if DCTSIZE == 8		/* unroll the inner loop */
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;  /* subtract CENTERJSAMPLE from each sample */
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+#else
+    {
+      register int elemc;
+      for (elemc = DCTSIZE; elemc > 0; elemc--)  /* generic path: DCTSIZE samples per row */
+        *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    }
+#endif
+  }
+}
+
+
+/*
+ * Quantize workspace[] via compute_reciprocal() tables into coef_block[].
+ */
+
+METHODDEF(void)
+quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
+{
+  int i, shift;  /* shift is signed: the stored value is -1 for a divisor of 1 */
+  DCTELEM temp;
+  UDCTELEM recip, corr;
+  UDCTELEM2 product;
+  JCOEFPTR output_ptr = coef_block;
+
+  for (i = 0; i < DCTSIZE2; i++) {
+    temp = workspace[i];
+    recip = divisors[i + DCTSIZE2 * 0];  /* reciprocal sub-table */
+    corr =  divisors[i + DCTSIZE2 * 1];  /* correction + rounding sub-table */
+    shift = divisors[i + DCTSIZE2 * 3];  /* r - (word size); read unsigned, -1 became 65535 */
+
+    if (temp < 0) {  /* work on the magnitude, then restore the sign */
+      temp = -temp;
+      product = (UDCTELEM2)(temp + corr) * recip;
+      product >>= shift + (int) (sizeof(DCTELEM)*8);  /* total shift is r, always in range */
+      temp = product;
+      temp = -temp;
+    } else {
+      product = (UDCTELEM2)(temp + corr) * recip;
+      product >>= shift + (int) (sizeof(DCTELEM)*8);  /* total shift is r, always in range */
+      temp = product;
+    }
+
+    output_ptr[i] = (JCOEF) temp;
+  }
+}
+
+
+/*
  * Perform forward DCT on one or more blocks of a component.
  *
  * The input samples are taken from the sample_data[] array starting at
@@ -185,86 +416,86 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  forward_DCT_method_ptr do_dct = fdct->do_dct;
   DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
-  DCTELEM workspace[DCTSIZE2];	/* work area for FDCT subroutine */
+  DCTELEM * workspace;
   JDIMENSION bi;
 
+  /* Make sure the compiler doesn't look up these every pass */
+  forward_DCT_method_ptr do_dct = fdct->dct;
+  convsamp_method_ptr do_convsamp = fdct->convsamp;
+  quantize_method_ptr do_quantize = fdct->quantize;
+  workspace = fdct->workspace;
+
   sample_data += start_row;	/* fold in the vertical offset once */
 
   for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
     /* Load data into workspace, applying unsigned->signed conversion */
-    { register DCTELEM *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	  }
-	}
-#endif
-      }
-    }
+    (*do_convsamp) (sample_data, start_col, workspace);
 
     /* Perform the DCT */
     (*do_dct) (workspace);
 
     /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register DCTELEM temp, qval;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
+    (*do_quantize) (coef_blocks[bi], divisors, workspace);
+  }
+}
 
-      for (i = 0; i < DCTSIZE2; i++) {
-	qval = divisors[i];
-	temp = workspace[i];
-	/* Divide the coefficient value by qval, ensuring proper rounding.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 *
-	 * In most files, at least half of the output values will be zero
-	 * (at default quantization settings, more like three-quarters...)
-	 * so we should ensure that this case is fast.  On many machines,
-	 * a comparison is enough cheaper than a divide to make a special test
-	 * a win.  Since both inputs will be nonnegative, we need only test
-	 * for a < b to discover whether a/b is 0.
-	 * If your machine's division is fast enough, define FAST_DIVIDE.
-	 */
-#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b)	a /= b
+
+#ifdef DCT_FLOAT_SUPPORTED
+
+
+METHODDEF(void)
+convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)
+{
+  register FAST_FLOAT *workspaceptr;
+  register JSAMPROW elemptr;
+  register int elemr;
+
+  workspaceptr = workspace;
+  for (elemr = 0; elemr < DCTSIZE; elemr++) {
+    elemptr = sample_data[elemr] + start_col;
+#if DCTSIZE == 8		/* unroll the inner loop */
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
 #else
-#define DIVIDE_BY(a,b)	if (a >= b) a /= b; else a = 0
-#endif
-	if (temp < 0) {
-	  temp = -temp;
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	  temp = -temp;
-	} else {
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	}
-	output_ptr[i] = (JCOEF) temp;
-      }
+    {
+      register int elemc;
+      for (elemc = DCTSIZE; elemc > 0; elemc--)
+        *workspaceptr++ = (FAST_FLOAT)
+                          (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
     }
+#endif
   }
 }
 
 
-#ifdef DCT_FLOAT_SUPPORTED
+METHODDEF(void)
+quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)
+{
+  register FAST_FLOAT temp;
+  register int i;
+  register JCOEFPTR output_ptr = coef_block;
+
+  for (i = 0; i < DCTSIZE2; i++) {
+    /* Apply the quantization and scaling factor */
+    temp = workspace[i] * divisors[i];
+
+    /* Round to nearest integer.
+     * Since C does not specify the direction of rounding for negative
+     * quotients, we have to force the dividend positive for portability.
+     * The maximum coefficient size is +-16K (for 12-bit data), so this
+     * code should work for either 16-bit or 32-bit ints.
+     */
+    output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
+  }
+}
+
 
 METHODDEF(void)
 forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
@@ -275,62 +506,28 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  float_DCT_method_ptr do_dct = fdct->do_float_dct;
   FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
-  FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
+  FAST_FLOAT * workspace;
   JDIMENSION bi;
 
+
+  /* Make sure the compiler doesn't look up these every pass */
+  float_DCT_method_ptr do_dct = fdct->float_dct;
+  float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
+  float_quantize_method_ptr do_quantize = fdct->float_quantize;
+  workspace = fdct->float_workspace;
+
   sample_data += start_row;	/* fold in the vertical offset once */
 
   for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
     /* Load data into workspace, applying unsigned->signed conversion */
-    { register FAST_FLOAT *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = (FAST_FLOAT)
-	      (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	  }
-	}
-#endif
-      }
-    }
+    (*do_convsamp) (sample_data, start_col, workspace);
 
     /* Perform the DCT */
     (*do_dct) (workspace);
 
     /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register FAST_FLOAT temp;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	/* Apply the quantization and scaling factor */
-	temp = workspace[i] * divisors[i];
-	/* Round to nearest integer.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 * The maximum coefficient size is +-16K (for 12-bit data), so this
-	 * code should work for either 16-bit or 32-bit ints.
-	 */
-	output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
-      }
-    }
+    (*do_quantize) (coef_blocks[bi], divisors, workspace);
   }
 }
 
@@ -353,23 +550,33 @@ jinit_forward_dct (j_compress_ptr cinfo)
   cinfo->fdct = (struct jpeg_forward_dct *) fdct;
   fdct->pub.start_pass = start_pass_fdctmgr;
 
+  /* First determine the DCT... */
   switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
   case JDCT_ISLOW:
     fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_islow;
+    if (jsimd_can_fdct_islow())
+      fdct->dct = jsimd_fdct_islow;
+    else
+      fdct->dct = jpeg_fdct_islow;
     break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
   case JDCT_IFAST:
     fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_ifast;
+    if (jsimd_can_fdct_ifast())
+      fdct->dct = jsimd_fdct_ifast;
+    else
+      fdct->dct = jpeg_fdct_ifast;
     break;
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
   case JDCT_FLOAT:
     fdct->pub.forward_DCT = forward_DCT_float;
-    fdct->do_float_dct = jpeg_fdct_float;
+    if (jsimd_can_fdct_float())
+      fdct->float_dct = jsimd_fdct_float;
+    else
+      fdct->float_dct = jpeg_fdct_float;
     break;
 #endif
   default:
@@ -377,6 +584,54 @@ jinit_forward_dct (j_compress_ptr cinfo)
     break;
   }
 
+  /* ...then the supporting stages. */
+  switch (cinfo->dct_method) {
+#ifdef DCT_ISLOW_SUPPORTED
+  case JDCT_ISLOW:
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  case JDCT_IFAST:
+#endif
+#if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
+    if (jsimd_can_convsamp())
+      fdct->convsamp = jsimd_convsamp;
+    else
+      fdct->convsamp = convsamp;
+    if (jsimd_can_quantize())
+      fdct->quantize = jsimd_quantize;
+    else
+      fdct->quantize = quantize;
+    break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  case JDCT_FLOAT:
+    if (jsimd_can_convsamp_float())
+      fdct->float_convsamp = jsimd_convsamp_float;
+    else
+      fdct->float_convsamp = convsamp_float;
+    if (jsimd_can_quantize_float())
+      fdct->float_quantize = jsimd_quantize_float;
+    else
+      fdct->float_quantize = quantize_float;
+    break;
+#endif
+  default:
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+    break;
+  }
+
+  /* Allocate workspace memory */
+#ifdef DCT_FLOAT_SUPPORTED
+  if (cinfo->dct_method == JDCT_FLOAT)
+    fdct->float_workspace = (FAST_FLOAT *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  SIZEOF(FAST_FLOAT) * DCTSIZE2);
+  else
+#endif
+    fdct->workspace = (DCTELEM *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  SIZEOF(DCTELEM) * DCTSIZE2);
+
   /* Mark divisor tables unallocated */
   for (i = 0; i < NUM_QUANT_TBLS; i++) {
     fdct->divisors[i] = NULL;
diff --git a/jpeg/jchuff.c b/jpeg/jchuff.c
index f23525054867..f9fba90b6d42 100644
--- a/jpeg/jchuff.c
+++ b/jpeg/jchuff.c
@@ -19,7 +19,6 @@
 #include "jpeglib.h"
 #include "jchuff.h"		/* Declarations shared with jcphuff.c */
 
-
 /* Expanded entropy encoder object for Huffman encoding.
  *
  * The savable_state subrecord contains fields that change within an MCU,
diff --git a/jpeg/jcinit.c b/jpeg/jcinit.c
index 5efffe33166b..de0ade2a73e9 100644
--- a/jpeg/jcinit.c
+++ b/jpeg/jcinit.c
@@ -42,7 +42,11 @@ jinit_compress_master (j_compress_ptr cinfo)
   jinit_forward_dct(cinfo);
   /* Entropy encoding: either Huffman or arithmetic coding. */
   if (cinfo->arith_code) {
+#ifdef C_ARITH_CODING_SUPPORTED
+    jinit_arith_encoder(cinfo);
+#else
     ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+#endif
   } else {
     if (cinfo->progressive_mode) {
 #ifdef C_PROGRESSIVE_SUPPORTED
diff --git a/jpeg/jcmarker.c b/jpeg/jcmarker.c
index 2ae188136fbd..b1c1e4581e5c 100644
--- a/jpeg/jcmarker.c
+++ b/jpeg/jcmarker.c
@@ -2,6 +2,7 @@
  * jcmarker.c
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Copyright (C) 2010, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -11,6 +12,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"
 
 
 typedef enum {			/* JPEG marker codes */
@@ -75,7 +77,9 @@ typedef enum {			/* JPEG marker codes */
   M_JPG13 = 0xfd,
   M_COM   = 0xfe,
   
-  M_TEM   = 0x01
+  M_TEM   = 0x01,
+  
+  M_ERROR = 0x100
 } JPEG_MARKER;
 
 
@@ -103,7 +107,7 @@ typedef my_marker_writer * my_marker_ptr;
  */
 
 LOCAL(void)
-emit_byte (j_compress_ptr cinfo, int16 val)
+emit_byte (j_compress_ptr cinfo, int val)
 /* Emit a byte */
 {
   struct jpeg_destination_mgr * dest = cinfo->dest;
@@ -121,12 +125,12 @@ emit_marker (j_compress_ptr cinfo, JPEG_MARKER mark)
 /* Emit a marker code */
 {
   emit_byte(cinfo, 0xFF);
-  emit_byte(cinfo, (int16) mark);
+  emit_byte(cinfo, (int) mark);
 }
 
 
 LOCAL(void)
-emit_2bytes (j_compress_ptr cinfo, int16 value)
+emit_2bytes (j_compress_ptr cinfo, int value)
 /* Emit a 2-byte integer; these are always MSB first in JPEG files */
 {
   emit_byte(cinfo, (value >> 8) & 0xFF);
@@ -138,14 +142,14 @@ emit_2bytes (j_compress_ptr cinfo, int16 value)
  * Routines to write specific marker types.
  */
 
-LOCAL(int16)
-emit_dqt (j_compress_ptr cinfo, int16 index)
+LOCAL(int)
+emit_dqt (j_compress_ptr cinfo, int index)
 /* Emit a DQT marker */
 /* Returns the precision used (0 = 8bits, 1 = 16bits) for baseline checking */
 {
   JQUANT_TBL * qtbl = cinfo->quant_tbl_ptrs[index];
-  int16 prec;
-  int16 i;
+  int prec;
+  int i;
 
   if (qtbl == NULL)
     ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, index);
@@ -167,8 +171,8 @@ emit_dqt (j_compress_ptr cinfo, int16 index)
       /* The table entries must be emitted in zigzag order. */
       unsigned int qval = qtbl->quantval[jpeg_natural_order[i]];
       if (prec)
-	emit_byte(cinfo, (int16) (qval >> 8));
-      emit_byte(cinfo, (int16) (qval & 0xFF));
+	emit_byte(cinfo, (int) (qval >> 8));
+      emit_byte(cinfo, (int) (qval & 0xFF));
     }
 
     qtbl->sent_table = TRUE;
@@ -179,11 +183,11 @@ emit_dqt (j_compress_ptr cinfo, int16 index)
 
 
 LOCAL(void)
-emit_dht (j_compress_ptr cinfo, int16 index, boolean is_ac)
+emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
 /* Emit a DHT marker */
 {
   JHUFF_TBL * htbl;
-  int16 length, i;
+  int length, i;
   
   if (is_ac) {
     htbl = cinfo->ac_huff_tbl_ptrs[index];
@@ -225,7 +229,7 @@ emit_dac (j_compress_ptr cinfo)
 #ifdef C_ARITH_CODING_SUPPORTED
   char dc_in_use[NUM_ARITH_TBLS];
   char ac_in_use[NUM_ARITH_TBLS];
-  int16 length, i;
+  int length, i;
   jpeg_component_info *compptr;
   
   for (i = 0; i < NUM_ARITH_TBLS; i++)
@@ -267,7 +271,7 @@ emit_dri (j_compress_ptr cinfo)
   
   emit_2bytes(cinfo, 4);	/* fixed length */
 
-  emit_2bytes(cinfo, (int16) cinfo->restart_interval);
+  emit_2bytes(cinfo, (int) cinfo->restart_interval);
 }
 
 
@@ -275,7 +279,7 @@ LOCAL(void)
 emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
 /* Emit a SOF marker */
 {
-  int16 ci;
+  int ci;
   jpeg_component_info *compptr;
   
   emit_marker(cinfo, code);
@@ -283,13 +287,13 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
   emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */
 
   /* Make sure image isn't bigger than SOF field can handle */
-  if ((long) cinfo->image_height > 65535L ||
-      (long) cinfo->image_width > 65535L)
+  if ((long) cinfo->_jpeg_height > 65535L ||
+      (long) cinfo->_jpeg_width > 65535L)
     ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535);
 
   emit_byte(cinfo, cinfo->data_precision);
-  emit_2bytes(cinfo, (int16) cinfo->image_height);
-  emit_2bytes(cinfo, (int16) cinfo->image_width);
+  emit_2bytes(cinfo, (int) cinfo->_jpeg_height);
+  emit_2bytes(cinfo, (int) cinfo->_jpeg_width);
 
   emit_byte(cinfo, cinfo->num_components);
 
@@ -306,7 +310,7 @@ LOCAL(void)
 emit_sos (j_compress_ptr cinfo)
 /* Emit a SOS marker */
 {
-  int16 i, td, ta;
+  int i, td, ta;
   jpeg_component_info *compptr;
   
   emit_marker(cinfo, M_SOS);
@@ -371,8 +375,8 @@ emit_jfif_app0 (j_compress_ptr cinfo)
   emit_byte(cinfo, cinfo->JFIF_major_version); /* Version fields */
   emit_byte(cinfo, cinfo->JFIF_minor_version);
   emit_byte(cinfo, cinfo->density_unit); /* Pixel size information */
-  emit_2bytes(cinfo, (int16) cinfo->X_density);
-  emit_2bytes(cinfo, (int16) cinfo->Y_density);
+  emit_2bytes(cinfo, (int) cinfo->X_density);
+  emit_2bytes(cinfo, (int) cinfo->Y_density);
   emit_byte(cinfo, 0);		/* No thumbnail image */
   emit_byte(cinfo, 0);
 }
@@ -441,14 +445,14 @@ write_marker_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
 
   emit_marker(cinfo, (JPEG_MARKER) marker);
 
-  emit_2bytes(cinfo, (int16) (datalen + 2));	/* total length */
+  emit_2bytes(cinfo, (int) (datalen + 2));	/* total length */
 }
 
 METHODDEF(void)
 write_marker_byte (j_compress_ptr cinfo, int val)
 /* Emit one byte of marker parameters following write_marker_header */
 {
-  emit_byte(cinfo, (int16) val);
+  emit_byte(cinfo, val);
 }
 
 
@@ -491,7 +495,7 @@ write_file_header (j_compress_ptr cinfo)
 METHODDEF(void)
 write_frame_header (j_compress_ptr cinfo)
 {
-  int16 ci, prec;
+  int ci, prec;
   boolean is_baseline;
   jpeg_component_info *compptr;
   
@@ -549,7 +553,7 @@ METHODDEF(void)
 write_scan_header (j_compress_ptr cinfo)
 {
   my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
-  int16 i;
+  int i;
   jpeg_component_info *compptr;
 
   if (cinfo->arith_code) {
@@ -613,7 +617,7 @@ write_file_trailer (j_compress_ptr cinfo)
 METHODDEF(void)
 write_tables_only (j_compress_ptr cinfo)
 {
-  int16 i;
+  int i;
 
   emit_marker(cinfo, M_SOI);
 
diff --git a/jpeg/jcmaster.c b/jpeg/jcmaster.c
index aab4020b8796..74df5556ce2d 100644
--- a/jpeg/jcmaster.c
+++ b/jpeg/jcmaster.c
@@ -2,6 +2,8 @@
  * jcmaster.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2003-2010 by Guido Vollbeding.
+ * Copyright (C) 2010, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -14,6 +16,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"
 
 
 /* Private state */
@@ -42,8 +45,28 @@ typedef my_comp_master * my_master_ptr;
  * Support routines that do various essential calculations.
  */
 
+#if JPEG_LIB_VERSION >= 70
+/*
+ * Compute JPEG image dimensions and related values.
+ * NOTE: this is exported for possible use by application.
+ * Hence it mustn't do anything that can't be done twice.
+ */
+
+GLOBAL(void)
+jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
+/* Copy the input image size into the JPEG size fields; reset min DCT sizes */
+{
+  /* Hardwire it to "no scaling": JPEG dimensions equal the input dimensions */
+  cinfo->jpeg_width = cinfo->image_width;
+  cinfo->jpeg_height = cinfo->image_height;
+  cinfo->min_DCT_h_scaled_size = DCTSIZE;
+  cinfo->min_DCT_v_scaled_size = DCTSIZE;
+}
+#endif
+
+
 LOCAL(void)
-initial_setup (j_compress_ptr cinfo)
+initial_setup (j_compress_ptr cinfo, boolean transcode_only)
 /* Do computations that are needed before master selection phase */
 {
   int ci;
@@ -51,14 +74,19 @@ initial_setup (j_compress_ptr cinfo)
   long samplesperrow;
   JDIMENSION jd_samplesperrow;
 
+#if JPEG_LIB_VERSION >= 70
+  if (!transcode_only)
+    jpeg_calc_jpeg_dimensions(cinfo);
+#endif
+
   /* Sanity check on image dimensions */
-  if (cinfo->image_height <= 0 || cinfo->image_width <= 0
+  if (cinfo->_jpeg_height <= 0 || cinfo->_jpeg_width <= 0
       || cinfo->num_components <= 0 || cinfo->input_components <= 0)
     ERREXIT(cinfo, JERR_EMPTY_IMAGE);
 
   /* Make sure image isn't bigger than I can handle */
-  if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION ||
-      (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION)
+  if ((long) cinfo->_jpeg_height > (long) JPEG_MAX_DIMENSION ||
+      (long) cinfo->_jpeg_width > (long) JPEG_MAX_DIMENSION)
     ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
 
   /* Width of an input scanline must be representable as JDIMENSION. */
@@ -96,20 +124,24 @@ initial_setup (j_compress_ptr cinfo)
     /* Fill in the correct component_index value; don't rely on application */
     compptr->component_index = ci;
     /* For compression, we never do DCT scaling. */
+#if JPEG_LIB_VERSION >= 70
+    compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = DCTSIZE;
+#else
     compptr->DCT_scaled_size = DCTSIZE;
+#endif
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
+      jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
 		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
     compptr->height_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
+      jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
 		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
     /* Size in samples */
     compptr->downsampled_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
+      jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
 		    (long) cinfo->max_h_samp_factor);
     compptr->downsampled_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
+      jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
 		    (long) cinfo->max_v_samp_factor);
     /* Mark component needed (this flag isn't actually used for compression) */
     compptr->component_needed = TRUE;
@@ -119,7 +151,7 @@ initial_setup (j_compress_ptr cinfo)
    * main controller will call coefficient controller).
    */
   cinfo->total_iMCU_rows = (JDIMENSION)
-    jdiv_round_up((long) cinfo->image_height,
+    jdiv_round_up((long) cinfo->_jpeg_height,
 		  (long) (cinfo->max_v_samp_factor*DCTSIZE));
 }
 
@@ -347,10 +379,10 @@ per_scan_setup (j_compress_ptr cinfo)
     
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width,
+      jdiv_round_up((long) cinfo->_jpeg_width,
 		    (long) (cinfo->max_h_samp_factor*DCTSIZE));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height,
+      jdiv_round_up((long) cinfo->_jpeg_height,
 		    (long) (cinfo->max_v_samp_factor*DCTSIZE));
     
     cinfo->blocks_in_MCU = 0;
@@ -554,7 +586,7 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
   master->pub.is_last_pass = FALSE;
 
   /* Validate parameters, determine derived values */
-  initial_setup(cinfo);
+  initial_setup(cinfo, transcode_only);
 
   if (cinfo->scan_info != NULL) {
 #ifdef C_MULTISCAN_FILES_SUPPORTED
diff --git a/jpeg/jconfig-mac-cw.h b/jpeg/jconfig-mac-cw.h
deleted file mode 100644
index a0ef5529fbfc..000000000000
--- a/jpeg/jconfig-mac-cw.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* jconfig.h --- generated by ckconfig.c */
-/* see jconfig.doc for explanations */
-
-#define ALIGN_TYPE long /* memory alignment */
-#define NO_GETENV /* we do have the function, but it's dead */
-#ifdef __cplusplus
-#define INLINE inline /* we have them in C++ */
-#endif
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#undef TWO_FILE_COMMANDLINE	/* You may need this on non-Unix systems */
-#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */
-#undef DONT_USE_B_MODE
-/* #define PROGRESS_REPORT */	/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jpeg/jconfig.doc b/jpeg/jconfig.doc
deleted file mode 100644
index c18d1c064b77..000000000000
--- a/jpeg/jconfig.doc
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * jconfig.doc
- *
- * Copyright (C) 1991-1994, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file documents the configuration options that are required to
- * customize the JPEG software for a particular system.
- *
- * The actual configuration options for a particular installation are stored
- * in jconfig.h.  On many machines, jconfig.h can be generated automatically
- * or copied from one of the "canned" jconfig files that we supply.  But if
- * you need to generate a jconfig.h file by hand, this file tells you how.
- *
- * DO NOT EDIT THIS FILE --- IT WON'T ACCOMPLISH ANYTHING.
- * EDIT A COPY NAMED JCONFIG.H.
- */
-
-
-/*
- * These symbols indicate the properties of your machine or compiler.
- * #define the symbol if yes, #undef it if no.
- */
-
-/* Does your compiler support function prototypes?
- * (If not, you also need to use ansi2knr, see install.doc)
- */
-#define HAVE_PROTOTYPES
-
-/* Does your compiler support the declaration "unsigned char" ?
- * How about "unsigned short" ?
- */
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-
-/* Define "void" as "char" if your compiler doesn't know about type void.
- * NOTE: be sure to define void such that "void *" represents the most general
- * pointer type, e.g., that returned by malloc().
- */
-/* #define void char */
-
-/* Define "const" as empty if your compiler doesn't know the "const" keyword.
- */
-/* #define const */
-
-/* Define this if an ordinary "char" type is unsigned.
- * If you're not sure, leaving it undefined will work at some cost in speed.
- * If you defined HAVE_UNSIGNED_CHAR then the speed difference is minimal.
- */
-#undef CHAR_IS_UNSIGNED
-
-/* Define this if your system has an ANSI-conforming <stddef.h> file.
- */
-#define HAVE_STDDEF_H
-
-/* Define this if your system has an ANSI-conforming <stdlib.h> file.
- */
-#define HAVE_STDLIB_H
-
-/* Define this if your system does not have an ANSI/SysV <string.h>,
- * but does have a BSD-style <strings.h>.
- */
-#undef NEED_BSD_STRINGS
-
-/* Define this if your system does not provide typedef size_t in any of the
- * ANSI-standard places (stddef.h, stdlib.h, or stdio.h), but places it in
- * <sys/types.h> instead.
- */
-#undef NEED_SYS_TYPES_H
-
-/* For 80x86 machines, you need to define NEED_FAR_POINTERS,
- * unless you are using a large-data memory model or 80386 flat-memory mode.
- * On less brain-damaged CPUs this symbol must not be defined.
- * (Defining this symbol causes large data structures to be referenced through
- * "far" pointers and to be allocated with a special version of malloc.)
- */
-#undef NEED_FAR_POINTERS
-
-/* Define this if your linker needs global names to be unique in less
- * than the first 15 characters.
- */
-#undef NEED_SHORT_EXTERNAL_NAMES
-
-/* Although a real ANSI C compiler can deal perfectly well with pointers to
- * unspecified structures (see "incomplete types" in the spec), a few pre-ANSI
- * and pseudo-ANSI compilers get confused.  To keep one of these bozos happy,
- * define INCOMPLETE_TYPES_BROKEN.  This is not recommended unless you
- * actually get "missing structure definition" warnings or errors while
- * compiling the JPEG code.
- */
-#undef INCOMPLETE_TYPES_BROKEN
-
-
-/*
- * The following options affect code selection within the JPEG library,
- * but they don't need to be visible to applications using the library.
- * To minimize application namespace pollution, the symbols won't be
- * defined unless JPEG_INTERNALS has been defined.
- */
-
-#ifdef JPEG_INTERNALS
-
-/* Define this if your compiler implements ">>" on signed values as a logical
- * (unsigned) shift; leave it undefined if ">>" is a signed (arithmetic) shift,
- * which is the normal and rational definition.
- */
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-
-#endif /* JPEG_INTERNALS */
-
-
-/*
- * The remaining options do not affect the JPEG library proper,
- * but only the sample applications cjpeg/djpeg (see cjpeg.c, djpeg.c).
- * Other applications can ignore these.
- */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-/* These defines indicate which image (non-JPEG) file formats are allowed. */
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-/* Define this if you want to name both input and output files on the command
- * line, rather than using stdout and optionally stdin.  You MUST do this if
- * your system can't cope with binary I/O to stdin/stdout.  See comments at
- * head of cjpeg.c or djpeg.c.
- */
-#undef TWO_FILE_COMMANDLINE
-
-/* Define this if your system needs explicit cleanup of temporary files.
- * This is crucial under MS-DOS, where the temporary "files" may be areas
- * of extended memory; on most other systems it's not as important.
- */
-#undef NEED_SIGNAL_CATCHER
-
-/* By default, we open image files with fopen(...,"rb") or fopen(...,"wb").
- * This is necessary on systems that distinguish text files from binary files,
- * and is harmless on most systems that don't.  If you have one of the rare
- * systems that complains about the "b" spec, define this symbol.
- */
-#undef DONT_USE_B_MODE
-
-/* Define this if you want percent-done progress reports from cjpeg/djpeg.
- */
-#undef PROGRESS_REPORT
-
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jpeg/jconfig.h b/jpeg/jconfig.h
dissimilarity index 99%
index d181a5ec0cb0..4d3c49d37c93 100644
--- a/jpeg/jconfig.h
+++ b/jpeg/jconfig.h
@@ -1,107 +1,59 @@
-/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* ***** BEGIN LICENSE BLOCK *****
- * Version: MPL 1.1/GPL 2.0/LGPL 2.1
- *
- * The contents of this file are subject to the Mozilla Public License Version
- * 1.1 (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * http://www.mozilla.org/MPL/
- *
- * Software distributed under the License is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
- * for the specific language governing rights and limitations under the
- * License.
- *
- * The Original Code is mozilla.org code.
- *
- * The Initial Developer of the Original Code is
- * Netscape Communications Corporation.
- * Portions created by the Initial Developer are Copyright (C) 1998
- * the Initial Developer. All Rights Reserved.
- *
- * Contributor(s):
- *
- * Alternatively, the contents of this file may be used under the terms of
- * either the GNU General Public License Version 2 or later (the "GPL"), or
- * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
- * in which case the provisions of the GPL or the LGPL are applicable instead
- * of those above. If you wish to allow use of your version of this file only
- * under the terms of either the GPL or the LGPL, and not to allow others to
- * use your version of this file under the terms of the MPL, indicate your
- * decision by deleting the provisions above and replace them with the notice
- * and other provisions required by the GPL or the LGPL. If you do not delete
- * the provisions above, a recipient may use your version of this file under
- * the terms of any one of the MPL, the GPL or the LGPL.
- *
- * ***** END LICENSE BLOCK ***** */
-
-/*
- * jconfig.h to configure the IJG JPEG library for the Mozilla/Netscape
- * environment.  Note that there are also Mozilla mods in jmorecfg.h.
- */
-
-/* We assume an ANSI C or C++ compilation environment */
-#define HAVE_PROTOTYPES 
-#define HAVE_UNSIGNED_CHAR 
-#define HAVE_UNSIGNED_SHORT 
-/* #define void char */
-/* #define const */
-#ifndef HAVE_STDDEF_H 
-#define HAVE_STDDEF_H 
-#endif /* HAVE_STDDEF_H */
-#ifndef HAVE_STDLIB_H
-#define HAVE_STDLIB_H 
-#endif /* HAVE_STDLIB_H */
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-/* Define this if you get warnings about undefined structures. */
-#undef INCOMPLETE_TYPES_BROKEN
-
-/* With this setting, the IJG code will work regardless of whether
- * type "char" is signed or unsigned.
- */
-#undef CHAR_IS_UNSIGNED
-
-
-/* defines that need not be visible to callers of the IJG library */
-
-#ifdef JPEG_INTERNALS
-
-/* If right shift of "long" quantities is unsigned on your machine,
- * you'll have to define this.  Fortunately few people should need it.
- */
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#endif /* JPEG_INTERNALS */
-
-
-/* these defines are not interesting for building just the IJG library,
- * but we leave 'em here anyway.
- */
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#undef TWO_FILE_COMMANDLINE
-#undef NEED_SIGNAL_CATCHER
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT
-
-#endif /* JPEG_CJPEG_DJPEG */
-
-/* SSE* alignment support - only use on platforms that support declspec and __attribute__ */
-
-#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
-#define ALIGN16_const_vector_short(name) __declspec(align(16)) const short name[8]
-#define ALIGN16_const_vector_uchar(name) __declspec(align(16)) const unsigned char name[16]
-#else
-#define ALIGN16_const_vector_short(name) const short name[8] __attribute__ ((aligned (16)))
-#define ALIGN16_const_vector_uchar(name) const unsigned char name[16] __attribute__ ((aligned (16)))
-#endif /* ! XP_WIN32 && _M_IX86 && !__GNUC */
-
+/* jconfig.h.  Generated from jconfig.h.in by configure, then manually edited
+   for Mozilla. */
+
+/* Export libjpeg v6b's ABI (JPEG_LIB_VERSION 62). */
+#define JPEG_LIB_VERSION 62
+
+/* Define if your compiler supports prototypes */
+#define HAVE_PROTOTYPES 1
+
+/* Define to 1 if you have the <stddef.h> header file. */
+#define HAVE_STDDEF_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if the system has the type `unsigned char'. */
+#define HAVE_UNSIGNED_CHAR 1
+
+/* Define to 1 if the system has the type `unsigned short'. */
+#define HAVE_UNSIGNED_SHORT 1
+
+/* Define if your compiler mishandles pointers to incomplete struct types */
+/* #define INCOMPLETE_TYPES_BROKEN 1 */
+
+/* Define if you lack ANSI <string.h> and have only BSD-style <strings.h> */
+/* #undef NEED_BSD_STRINGS */
+
+/* Define if you need short function names */
+/* #undef NEED_SHORT_EXTERNAL_NAMES */
+
+/* Define if size_t is declared only in <sys/types.h> */
+#define NEED_SYS_TYPES_H 1
+
+/* Define if right shift (>>) of signed values is a logical (unsigned) shift */
+/* #undef RIGHT_SHIFT_IS_UNSIGNED */
+
+/* Use accelerated SIMD routines. */
+#define WITH_SIMD 1
+
+/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
+#ifndef __CHAR_UNSIGNED__
+/* # undef __CHAR_UNSIGNED__ */
+#endif
+
+/* Define to empty if `const' does not conform to ANSI C. */
+/* #undef const */
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+/* #undef inline */
+#endif
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+/* #undef size_t */
+
+/* MOZILLA CHANGE: libjpeg-turbo doesn't define INLINE in its config file, so
+ * we define it here. */
+#define INLINE NS_ALWAYS_INLINE
diff --git a/jpeg/jconfig.h.in b/jpeg/jconfig.h.in
new file mode 100644
index 000000000000..670afab53e43
--- /dev/null
+++ b/jpeg/jconfig.h.in
@@ -0,0 +1,60 @@
+/* Version ID for the JPEG library.
+ * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
+ */
+#define JPEG_LIB_VERSION  62	/* Version 6b */
+
+/* Support arithmetic encoding */
+#undef C_ARITH_CODING_SUPPORTED
+
+/* Support arithmetic decoding */
+#undef D_ARITH_CODING_SUPPORTED
+
+/* Define if your compiler supports prototypes */
+#undef HAVE_PROTOTYPES
+
+/* Define to 1 if you have the <stddef.h> header file. */
+#undef HAVE_STDDEF_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#undef HAVE_STDLIB_H
+
+/* Define to 1 if the system has the type `unsigned char'. */
+#undef HAVE_UNSIGNED_CHAR
+
+/* Define to 1 if the system has the type `unsigned short'. */
+#undef HAVE_UNSIGNED_SHORT
+
+/* Define if your compiler mishandles pointers to incomplete struct types */
+#undef INCOMPLETE_TYPES_BROKEN
+
+/* Define if you lack ANSI <string.h> and have only BSD-style <strings.h> */
+#undef NEED_BSD_STRINGS
+
+/* Define if you need short function names */
+#undef NEED_SHORT_EXTERNAL_NAMES
+
+/* Define if size_t is declared only in <sys/types.h> */
+#undef NEED_SYS_TYPES_H
+
+/* Define if right shift (>>) of signed values is a logical (unsigned) shift */
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+/* Use accelerated SIMD routines. */
+#undef WITH_SIMD
+
+/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
+#ifndef __CHAR_UNSIGNED__
+# undef __CHAR_UNSIGNED__
+#endif
+
+/* Define to empty if `const' does not conform to ANSI C. */
+#undef const
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+#undef inline
+#endif
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+#undef size_t
diff --git a/jpeg/jconfig.wat b/jpeg/jconfig.wat
deleted file mode 100644
index 6cc545baeead..000000000000
--- a/jpeg/jconfig.wat
+++ /dev/null
@@ -1,38 +0,0 @@
-/* jconfig.wat --- jconfig.h for Watcom C/C++ on MS-DOS or OS/2. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#define CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS	/* Watcom uses flat 32-bit addressing */
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#undef TWO_FILE_COMMANDLINE	/* optional */
-#define USE_SETMODE		/* Needed to make one-file style work in Watcom */
-#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jpeg/jcparam.c b/jpeg/jcparam.c
index 324c475010dd..27b5a035b308 100644
--- a/jpeg/jcparam.c
+++ b/jpeg/jcparam.c
@@ -2,6 +2,8 @@
  * jcparam.c
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2003-2008 by Guido Vollbeding.
+ * Copyright (C) 2009-2010, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -60,6 +62,49 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
 }
 
 
+/* These are the sample quantization tables given in JPEG spec section K.1.
+ * The spec says that the values given produce "good" quality, and
+ * when divided by 2, "very good" quality.
+ */
+static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
+  16,  11,  10,  16,  24,  40,  51,  61,
+  12,  12,  14,  19,  26,  58,  60,  55,
+  14,  13,  16,  24,  40,  57,  69,  56,
+  14,  17,  22,  29,  51,  87,  80,  62,
+  18,  22,  37,  56,  68, 109, 103,  77,
+  24,  35,  55,  64,  81, 104, 113,  92,
+  49,  64,  78,  87, 103, 121, 120, 101,
+  72,  92,  95,  98, 112, 100, 103,  99
+};
+static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
+  17,  18,  24,  47,  99,  99,  99,  99,
+  18,  21,  26,  66,  99,  99,  99,  99,
+  24,  26,  56,  99,  99,  99,  99,  99,
+  47,  66,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99
+};
+
+
+#if JPEG_LIB_VERSION >= 70
+GLOBAL(void)
+jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+/* Install the default quantization tables: std_luminance_quant_tbl in slot 0
+ * and std_chrominance_quant_tbl in slot 1, scaled by cinfo->q_scale_factor[0]
+ * and cinfo->q_scale_factor[1] respectively (independent luma/chroma scaling).
+ */
+{
+  /* Set up two quantization tables using the per-table scale factors */
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
+		       cinfo->q_scale_factor[0], force_baseline);
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
+		       cinfo->q_scale_factor[1], force_baseline);
+}
+#endif
+
+
 GLOBAL(void)
 jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
 			 boolean force_baseline)
@@ -69,35 +114,10 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
  * applications that insist on a linear percentage scaling.
  */
 {
-  /* These are the sample quantization tables given in JPEG spec section K.1.
-   * The spec says that the values given produce "good" quality, and
-   * when divided by 2, "very good" quality.
-   */
-  static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
-    16,  11,  10,  16,  24,  40,  51,  61,
-    12,  12,  14,  19,  26,  58,  60,  55,
-    14,  13,  16,  24,  40,  57,  69,  56,
-    14,  17,  22,  29,  51,  87,  80,  62,
-    18,  22,  37,  56,  68, 109, 103,  77,
-    24,  35,  55,  64,  81, 104, 113,  92,
-    49,  64,  78,  87, 103, 121, 120, 101,
-    72,  92,  95,  98, 112, 100, 103,  99
-  };
-  static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
-    17,  18,  24,  47,  99,  99,  99,  99,
-    18,  21,  26,  66,  99,  99,  99,  99,
-    24,  26,  56,  99,  99,  99,  99,  99,
-    47,  66,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99
-  };
-
   /* Set up two quantization tables using the specified scaling */
-  jpeg_add_quant_table(cinfo, 0, (const unsigned int *)std_luminance_quant_tbl,
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
 		       scale_factor, force_baseline);
-  jpeg_add_quant_table(cinfo, 1, (const unsigned int *)std_chrominance_quant_tbl,
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
 		       scale_factor, force_baseline);
 }
 
@@ -284,6 +304,10 @@ jpeg_set_defaults (j_compress_ptr cinfo)
 
   /* Initialize everything not dependent on the color space */
 
+#if JPEG_LIB_VERSION >= 70
+  cinfo->scale_num = 1;		/* 1:1 scaling */
+  cinfo->scale_denom = 1;
+#endif
   cinfo->data_precision = BITS_IN_JSAMPLE;
   /* Set up two quantization tables using default quality of 75 */
   jpeg_set_quality(cinfo, 75, TRUE);
@@ -320,6 +344,11 @@ jpeg_set_defaults (j_compress_ptr cinfo)
   /* By default, use the simpler non-cosited sampling alignment */
   cinfo->CCIR601_sampling = FALSE;
 
+#if JPEG_LIB_VERSION >= 70
+  /* By default, apply fancy downsampling */
+  cinfo->do_fancy_downsampling = TRUE;
+#endif
+
   /* No input smoothing */
   cinfo->smoothing_factor = 0;
 
@@ -363,6 +392,12 @@ jpeg_default_colorspace (j_compress_ptr cinfo)
     jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
     break;
   case JCS_RGB:
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
     jpeg_set_colorspace(cinfo, JCS_YCbCr);
     break;
   case JCS_YCbCr:
diff --git a/jpeg/jcphuff.c b/jpeg/jcphuff.c
index 07f9178b01c8..310287175ab4 100644
--- a/jpeg/jcphuff.c
+++ b/jpeg/jcphuff.c
@@ -223,7 +223,6 @@ dump_buffer (phuff_entropy_ptr entropy)
  * between calls, so 24 bits are sufficient.
  */
 
-INLINE
 LOCAL(void)
 emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
 /* Emit some bits, unless we are in gather mode */
@@ -276,7 +275,6 @@ flush_bits (phuff_entropy_ptr entropy)
  * Emit (or just count) a Huffman symbol.
  */
 
-INLINE
 LOCAL(void)
 emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
 {
diff --git a/jpeg/jcsample.c b/jpeg/jcsample.c
index 212ec8757c4c..eea376f90553 100644
--- a/jpeg/jcsample.c
+++ b/jpeg/jcsample.c
@@ -2,6 +2,7 @@
  * jcsample.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -48,6 +49,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"
 
 
 /* Pointer to routine to downsample a single component */
@@ -494,7 +496,10 @@ jinit_downsampler (j_compress_ptr cinfo)
     } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
 	       compptr->v_samp_factor == cinfo->max_v_samp_factor) {
       smoothok = FALSE;
-      downsample->methods[ci] = h2v1_downsample;
+      if (jsimd_can_h2v1_downsample())
+        downsample->methods[ci] = jsimd_h2v1_downsample;
+      else
+        downsample->methods[ci] = h2v1_downsample;
     } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
 	       compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
@@ -503,7 +508,10 @@ jinit_downsampler (j_compress_ptr cinfo)
 	downsample->pub.need_context_rows = TRUE;
       } else
 #endif
-	downsample->methods[ci] = h2v2_downsample;
+	if (jsimd_can_h2v2_downsample())
+	  downsample->methods[ci] = jsimd_h2v2_downsample;
+	else
+	  downsample->methods[ci] = h2v2_downsample;
     } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
 	       (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) {
       smoothok = FALSE;
diff --git a/jpeg/jdapimin.c b/jpeg/jdapimin.c
index 5b85f799d766..cadb59fce3aa 100644
--- a/jpeg/jdapimin.c
+++ b/jpeg/jdapimin.c
@@ -20,62 +20,6 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 
-#ifdef HAVE_MMX_INTEL_MNEMONICS
-#if _MSC_VER >= 1400
-#include "intrin.h"
-#else
-/* no __cpuid intrinsic, use a manually rewritten replacement */
-void __stdcall __cpuid( int CPUInfo[4], int InfoType )
-{
-  int my_eax = 0, my_ebx = 0, my_ecx = 0, my_edx = 0;
-  __asm {
-    /* check eflags bit 21 to see if cpuid is supported */
-    pushfd             /* save eflags to stack */
-    pop eax            /* and put it in eax */
-    mov ecx, eax       /* save a copy in ecx to compare against */
-    xor eax, 0x200000  /* toggle ID bit (bit 21) in eflags */
-    push eax           /* save modified eflags to stack */
-    popfd              /* set eflags register with modified value */
-    pushfd             /* read eflags back out */
-    pop eax
-    xor eax, ecx       /* check for modified eflags */
-    jz NOT_SUPPORTED   /* cpuid not supported */
-
-    /* check to see if the requested cpuid type is supported */
-    xor eax, eax       /* set eax to zero */
-    cpuid
-    cmp eax, InfoType
-    jl NOT_SUPPORTED   /* the requested cpuid type is not supported */
-
-    /* actually make the cpuid call */
-    mov eax, InfoType
-    cpuid
-    mov my_eax, eax
-    mov my_ebx, ebx
-    mov my_ecx, ecx
-    mov my_edx, edx
-NOT_SUPPORTED:
-  }
-  CPUInfo[0] = my_eax;
-  CPUInfo[1] = my_ebx;
-  CPUInfo[2] = my_ecx;
-  CPUInfo[3] = my_edx;
-}
-#endif /* _MSC_VER >= 1400 */
-
-int MMXAvailable;
-static int mmxsupport();
-#endif
-
-#ifdef HAVE_SSE2_INTRINSICS
-int SSE2Available = 0;
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
-static int sse2support();
-#else
-static int sse2supportGCC();
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
-#endif /* HAVE_SSE2_INTRINSICS */
-
 
 /*
  * Initialization of a JPEG decompression object.
@@ -87,38 +31,6 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
 {
   int i;
 
-#ifdef HAVE_MMX_INTEL_MNEMONICS
-  static int cpuidDetected = 0;
-
-  if(!cpuidDetected)
-  {
-	MMXAvailable = mmxsupport();
-
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
-	/* only do the sse2 support check if mmx is supported (so
-	   we know the processor supports cpuid) */
-	if (MMXAvailable)
-	    SSE2Available = sse2support();
-#endif
-
-	cpuidDetected = 1;
-  }
-#else
-#ifdef HAVE_SSE2_INTRINSICS
-  static int cpuidDetected = 0;
-
-  if(!cpuidDetected) {
-    SSE2Available = sse2supportGCC();
-    cpuidDetected = 1;
-  }
-
-#endif /* HAVE_SSE2_INTRINSICS */
-#endif /* HAVE_MMX_INTEL_MNEMONICS */
-
-  /* For debugging purposes, zero the whole master structure.
-   * But error manager pointer is already there, so save and restore it.
-   */
-
   /* Guard against version mismatches between library and caller. */
   cinfo->mem = NULL;		/* so jpeg_destroy knows mem mgr not called */
   if (version != JPEG_LIB_VERSION)
@@ -193,6 +105,7 @@ jpeg_abort_decompress (j_decompress_ptr cinfo)
   jpeg_abort((j_common_ptr) cinfo); /* use common routine */
 }
 
+
 /*
  * Set default decompression parameters.
  */
@@ -480,51 +393,3 @@ jpeg_finish_decompress (j_decompress_ptr cinfo)
   jpeg_abort((j_common_ptr) cinfo);
   return TRUE;
 }
-
-
-#ifdef HAVE_MMX_INTEL_MNEMONICS
-static int mmxsupport()
-{
-  int CPUInfo[4];
-
-  __cpuid(CPUInfo, 1);
-  if (CPUInfo[3] & (0x1 << 23))
-    return 1;
-  else
-    return 0;
-}
-#endif
-
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
-static int sse2support()
-{
-  int CPUInfo[4];
-
-  __cpuid(CPUInfo, 1);
-  if (CPUInfo[3] & (0x1 << 26))
-    return 1;
-  else
-    return 2;
-}
-#else
-#ifdef HAVE_SSE2_INTRINSICS
-static int sse2supportGCC()
-{
-
-  /* Mac Intel started with Core Duo chips which have SSE2 Support */
-
-#if defined(__GNUC__) && defined(__i386__)
-#if defined(XP_MACOSX)
-  return 1;
-#endif /* XP_MACOSX */
-#endif /* GNUC && i386 */
-
-  /* Add checking for SSE2 support for other platforms here */
-
-  /* We don't have SSE2 intrinsics support */
-
-  return 2;
-}
-#endif /* HAVE_SSE2_INTRINSICS */
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
-
diff --git a/jpeg/jdapistd.c b/jpeg/jdapistd.c
index c8e3fa0c35d2..2343da5c30f6 100644
--- a/jpeg/jdapistd.c
+++ b/jpeg/jdapistd.c
@@ -2,6 +2,7 @@
  * jdapistd.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright (C) 2010, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -17,6 +18,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"
 
 
 /* Forward declarations */
@@ -202,7 +204,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
   }
 
   /* Verify that at least one iMCU row can be returned. */
-  lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size;
+  lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size;
   if (max_lines < lines_per_iMCU_row)
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
diff --git a/jpeg/jdarith.c b/jpeg/jdarith.c
new file mode 100644
index 000000000000..d5567339ce6c
--- /dev/null
+++ b/jpeg/jdarith.c
@@ -0,0 +1,761 @@
+/*
+ * jdarith.c
+ *
+ * Developed 1997-2009 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains portable arithmetic entropy decoding routines for JPEG
+ * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ *
+ * Both sequential and progressive modes are supported in this single module.
+ *
+ * Suspension is not currently supported in this module.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Expanded entropy decoder object for arithmetic decoding. */
+
+typedef struct {
+  struct jpeg_entropy_decoder pub; /* public fields */
+
+  INT32 c;       /* C register, base of coding interval + input bit buffer */
+  INT32 a;               /* A register, normalized size of coding interval */
+  int ct;     /* bit shift counter, # of bits left in bit buffer part of C */
+                                                         /* init: ct = -16 */
+                                                         /* run: ct = 0..7 */
+                                                         /* error: ct = -1 */
+  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
+
+  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+
+  /* Pointers to statistics areas (image lifespan; NULL until first used) */
+  unsigned char * dc_stats[NUM_ARITH_TBLS];	/* DC_STAT_BINS bytes each */
+  unsigned char * ac_stats[NUM_ARITH_TBLS];	/* AC_STAT_BINS bytes each */
+
+  /* Statistics bin for coding with fixed probability 0.5; only [0] is used */
+  unsigned char fixed_bin[4];
+} arith_entropy_decoder;
+
+typedef arith_entropy_decoder * arith_entropy_ptr;
+
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
+
+
+LOCAL(int)
+get_byte (j_decompress_ptr cinfo)
+/* Fetch one input byte; a source that wants to suspend is a fatal error. */
+{
+  struct jpeg_source_mgr * datasrc = cinfo->src;
+
+  if (datasrc->bytes_in_buffer == 0 &&
+      ! (*datasrc->fill_input_buffer) (cinfo))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+  datasrc->bytes_in_buffer--;
+  return GETJOCTET(*datasrc->next_input_byte++);
+}
+
+
+/*
+ * The core arithmetic decoding routine (common in JPEG and JBIG).
+ * This needs to go as fast as possible.
+ * Machine-dependent optimization facilities
+ * are not utilized in this portable implementation.
+ * However, this code should be fairly efficient and
+ * may be a good base for further optimizations anyway.
+ *
+ * Return value is 0 or 1 (binary decision).
+ *
+ * Note: I've changed the handling of the code base & bit
+ * buffer register C compared to other implementations
+ * based on the standards layout & procedures.
+ * While it also contains both the actual base of the
+ * coding interval (16 bits) and the next-bits buffer,
+ * the cut-point between these two parts is floating
+ * (instead of fixed) with the bit shift counter CT.
+ * Thus, we also need only one (variable instead of
+ * fixed size) shift for the LPS/MPS decision, and
+ * we can do away with any renormalization update
+ * of C (except for new data insertion, of course).
+ *
+ * I've also introduced a new scheme for accessing
+ * the probability estimation state machine table,
+ * derived from Markus Kuhn's JBIG implementation.
+ */
+
+LOCAL(int)
+arith_decode (j_decompress_ptr cinfo, unsigned char *st)
+{
+  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register unsigned char nl, nm;
+  register INT32 qe, temp;
+  register int sv, data;
+
+  /* Renormalization & data input per section D.2.6 */
+  while (e->a < 0x8000L) {
+    if (--e->ct < 0) {
+      /* Bit buffer part of C is used up: fetch next data byte */
+      if (cinfo->unread_marker)
+	data = 0;		/* marker pending: stuff zero data */
+      else {
+	data = get_byte(cinfo);	/* read next input byte */
+	if (data == 0xFF) {	/* zero stuff or marker code */
+	  do data = get_byte(cinfo);
+	  while (data == 0xFF);	/* swallow extra 0xFF bytes */
+	  if (data == 0)
+	    data = 0xFF;	/* discard stuffed zero byte */
+	  else {
+	    /* Note: Different from the Huffman decoder, hitting
+	     * a marker while processing the compressed data
+	     * segment is legal in arithmetic coding.
+	     * The convention is to supply zero data
+	     * then until decoding is complete.
+	     */
+	    cinfo->unread_marker = data;
+	    data = 0;
+	  }
+	}
+      }
+      e->c = (e->c << 8) | data; /* insert data into C register */
+      if ((e->ct += 8) < 0)	 /* update bit shift counter */
+	/* Need more initial bytes */
+	if (++e->ct == 0)
+	  /* Got 2 initial bytes -> re-init A and exit loop */
+	  e->a = 0x8000L; /* => e->a = 0x10000L after loop exit */
+    }
+    e->a <<= 1;
+  }
+
+  /* Fetch values from our compact representation of Table D.2:
+   * Qe values and probability estimation state machine
+   */
+  sv = *st;
+  qe = jpeg_aritab[sv & 0x7F];	/* packed table entry for this state */
+  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS; qe is now Qe_Value */
+
+  /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
+  temp = e->a - qe;
+  e->a = temp;
+  temp <<= e->ct;
+  if (e->c >= temp) {
+    e->c -= temp;
+    /* Conditional LPS (less probable symbol) exchange */
+    if (e->a < qe) {
+      e->a = qe;
+      *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+    } else {
+      e->a = qe;
+      *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+      sv ^= 0x80;		/* Exchange LPS/MPS */
+    }
+  } else if (e->a < 0x8000L) {
+    /* Conditional MPS (more probable symbol) exchange */
+    if (e->a < qe) {
+      *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+      sv ^= 0x80;		/* Exchange LPS/MPS */
+    } else {
+      *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+    }
+  }
+
+  return sv >> 7;		/* bit 7 of sv holds the decoded decision */
+}
+
+
+/*
+ * Check for a restart marker & resynchronize decoder.
+ */
+
+LOCAL(void)
+process_restart (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr state = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * comp;
+  int n;
+
+  /* Consume the RSTn marker; suspension is not supported here */
+  if (! (*cinfo->marker->read_restart_marker) (cinfo))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+  /* Clear the statistics bins of every table used by this scan */
+  for (n = 0; n < cinfo->comps_in_scan; n++) {
+    comp = cinfo->cur_comp_info[n];
+    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      MEMZERO(state->dc_stats[comp->dc_tbl_no], DC_STAT_BINS);
+      /* DC prediction and conditioning context start over as well */
+      state->last_dc_val[n] = 0;
+      state->dc_context[n] = 0;
+    }
+    if (! cinfo->progressive_mode || cinfo->Ss) {
+      MEMZERO(state->ac_stats[comp->ac_tbl_no], AC_STAT_BINS);
+    }
+  }
+
+  /* Put the decoder registers back into their start-of-scan state */
+  state->a = 0;
+  state->c = 0;
+  state->ct = -16;	/* force reading 2 initial bytes to fill C */
+
+  /* Begin a new restart interval */
+  state->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Arithmetic MCU decoding.
+ * Each of these routines decodes and returns one MCU's worth of
+ * arithmetic-compressed coefficients.
+ * The coefficients are reordered from zigzag order into natural array order,
+ * but are not dequantized.
+ *
+ * The i'th block of the MCU is stored into the block pointed to by
+ * MCU_data[i].  WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
+ */
+
+/*
+ * MCU decoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, sign, v, m;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    tbl = cinfo->cur_comp_info[ci]->dc_tbl_no;
+
+    /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.19: Decode_DC_DIFF */
+    if (arith_decode(cinfo, st) == 0)
+      entropy->dc_context[ci] = 0;
+    else {
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, st + 1);
+      st += 2; st += sign;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+	st = entropy->dc_stats[tbl] + 20;	/* Table F.4: X1 = 20 */
+	while (arith_decode(cinfo, st)) {
+	  if ((m <<= 1) == 0x8000) {
+	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	    entropy->ct = -1;			/* magnitude overflow */
+	    return TRUE;
+	  }
+	  st += 1;
+	}
+      }
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;		   /* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+      else
+	entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      entropy->last_dc_val[ci] += v;
+    }
+
+    /* Scale and output the DC coefficient (assumes jpeg_natural_order[0]=0).
+     * Shift as unsigned: left-shifting a negative int is undefined in C. */
+    (*block)[0] = (JCOEF) ((unsigned) entropy->last_dc_val[ci] << cinfo->Al);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, sign, k, v, m;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  /* There is always only one block per MCU */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
+
+  /* Figure F.20: Decode_AC_coefficients */
+  for (k = cinfo->Ss; k <= cinfo->Se; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    if (arith_decode(cinfo, st)) break;		/* EOB flag */
+    while (arith_decode(cinfo, st + 1) == 0) {
+      st += 3; k++;
+      if (k > cinfo->Se) {
+	WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	entropy->ct = -1;			/* spectral overflow */
+	return TRUE;
+      }
+    }
+    /* Figure F.21: Decoding nonzero value v */
+    /* Figure F.22: Decoding the sign of v */
+    sign = arith_decode(cinfo, entropy->fixed_bin);
+    st += 2;
+    /* Figure F.23: Decoding the magnitude category of v */
+    if ((m = arith_decode(cinfo, st)) != 0) {
+      if (arith_decode(cinfo, st)) {
+	m <<= 1;
+	st = entropy->ac_stats[tbl] +
+	     (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	while (arith_decode(cinfo, st)) {
+	  if ((m <<= 1) == 0x8000) {
+	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	    entropy->ct = -1;			/* magnitude overflow */
+	    return TRUE;
+	  }
+	  st += 1;
+	}
+      }
+    }
+    v = m;
+    /* Figure F.24: Decoding the magnitude bit pattern of v */
+    st += 14;
+    while (m >>= 1)
+      if (arith_decode(cinfo, st)) v |= m;
+    v += 1; if (sign) v = -v;
+    /* Scale and output coefficient in natural (dezigzagged) order.
+     * Shift as unsigned: left-shifting a negative int is undefined in C. */
+    (*block)[jpeg_natural_order[k]] = (JCOEF) ((unsigned) v << cinfo->Al);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for DC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  unsigned char *bin;
+  int bitval, n;
+
+  /* A restart interval boundary must be handled before decoding */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  bitval = 1 << cinfo->Al;	/* weight of the bit added by this scan */
+  bin = entropy->fixed_bin;	/* refinement bits use the fixed 0.5 bin */
+
+  /* One decision per block of the MCU: when it decodes as 1, set the
+   * new bit in that block's two's-complement DC value. */
+  for (n = 0; n < cinfo->blocks_in_MCU; n++) {
+    if (! arith_decode(cinfo, bin))
+      continue;
+    MCU_data[n][0][0] |= bitval;
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  JCOEFPTR thiscoef;
+  unsigned char *st;
+  int tbl, k, kex, p1, m1;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  /* There is always only one block per MCU */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  p1 = 1 << cinfo->Al;		/* 1 in the bit position being coded */
+  /* -1 in that position; (-1) << Al would be undefined, so negate p1 */
+  m1 = -p1;
+
+  /* Establish EOBx (previous stage end-of-block) index */
+  for (kex = cinfo->Se; kex > 0; kex--)
+    if ((*block)[jpeg_natural_order[kex]]) break;
+
+  for (k = cinfo->Ss; k <= cinfo->Se; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    if (k > kex)
+      if (arith_decode(cinfo, st)) break;	/* EOB flag */
+    for (;;) {
+      thiscoef = *block + jpeg_natural_order[k];
+      if (*thiscoef) {				/* previously nonzero coef */
+	if (arith_decode(cinfo, st + 2)) {
+	  if (*thiscoef < 0)
+	    *thiscoef += m1;
+	  else
+	    *thiscoef += p1;
+	}
+	break;
+      }
+      if (arith_decode(cinfo, st + 1)) {	/* newly nonzero coef */
+	if (arith_decode(cinfo, entropy->fixed_bin))
+	  *thiscoef = m1;
+	else
+	  *thiscoef = p1;
+	break;
+      }
+      st += 3; k++;
+      if (k > cinfo->Se) {
+	WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	entropy->ct = -1;			/* spectral overflow */
+	return TRUE;
+      }
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Decode one MCU's worth of arithmetic-compressed coefficients.
+ */
+
+METHODDEF(boolean)
+decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * compptr;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, sign, k;
+  int v, m;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  /* Outer loop handles each block in the MCU: DC first, then its ACs */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */
+
+    tbl = compptr->dc_tbl_no;
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.19: Decode_DC_DIFF */
+    if (arith_decode(cinfo, st) == 0)
+      entropy->dc_context[ci] = 0;
+    else {
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, st + 1);
+      st += 2; st += sign;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+	st = entropy->dc_stats[tbl] + 20;	/* Table F.4: X1 = 20 */
+	while (arith_decode(cinfo, st)) {
+	  if ((m <<= 1) == 0x8000) {
+	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	    entropy->ct = -1;			/* magnitude overflow */
+	    return TRUE;
+	  }
+	  st += 1;
+	}
+      }
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;		   /* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+      else
+	entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      entropy->last_dc_val[ci] += v;
+    }
+
+    (*block)[0] = (JCOEF) entropy->last_dc_val[ci];	/* stored unscaled */
+
+    /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
+
+    tbl = compptr->ac_tbl_no;
+
+    /* Figure F.20: Decode_AC_coefficients */
+    for (k = 1; k <= DCTSIZE2 - 1; k++) {
+      st = entropy->ac_stats[tbl] + 3 * (k - 1);
+      if (arith_decode(cinfo, st)) break;	/* EOB flag */
+      while (arith_decode(cinfo, st + 1) == 0) {
+	st += 3; k++;
+	if (k > DCTSIZE2 - 1) {
+	  WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	  entropy->ct = -1;			/* spectral overflow */
+	  return TRUE;
+	}
+      }
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, entropy->fixed_bin);
+      st += 2;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+	if (arith_decode(cinfo, st)) {
+	  m <<= 1;
+	  st = entropy->ac_stats[tbl] +
+	       (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	  while (arith_decode(cinfo, st)) {
+	    if ((m <<= 1) == 0x8000) {
+	      WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	      entropy->ct = -1;			/* magnitude overflow */
+	      return TRUE;
+	    }
+	    st += 1;
+	  }
+	}
+      }
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      (*block)[jpeg_natural_order[k]] = (JCOEF) v;	/* dezigzagged store */
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Initialize for an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci, tbl;
+  jpeg_component_info * compptr;
+
+  if (cinfo->progressive_mode) {
+    /* Validate progressive scan parameters */
+    if (cinfo->Ss == 0) {
+      if (cinfo->Se != 0)	/* a DC scan must not include AC coefs */
+	goto bad;
+    } else {
+      /* need not check Ss/Se < 0 since they came from unsigned bytes */
+      if (cinfo->Se < cinfo->Ss || cinfo->Se > DCTSIZE2 - 1)
+	goto bad;
+      /* AC scans may have only one component */
+      if (cinfo->comps_in_scan != 1)
+	goto bad;
+    }
+    if (cinfo->Ah != 0) {
+      /* Successive approximation refinement scan: must have Al = Ah-1. */
+      if (cinfo->Ah-1 != cinfo->Al)
+	goto bad;
+    }
+    if (cinfo->Al > 13) {	/* need not check for < 0 */
+      bad:
+      ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+	       cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+    }
+    /* Update progression status, and verify that scan order is legal.
+     * Note that inter-scan inconsistencies are treated as warnings
+     * not fatal errors ... not clear if this is right way to behave.
+     */
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
+      int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+      if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
+	WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+      for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
+	int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
+	if (cinfo->Ah != expected)
+	  WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+	coef_bit_ptr[coefi] = cinfo->Al;
+      }
+    }
+    /* Select MCU decoding routine: Ah picks first/refine, Ss picks DC/AC */
+    if (cinfo->Ah == 0) {
+      if (cinfo->Ss == 0)
+	entropy->pub.decode_mcu = decode_mcu_DC_first;
+      else
+	entropy->pub.decode_mcu = decode_mcu_AC_first;
+    } else {
+      if (cinfo->Ss == 0)
+	entropy->pub.decode_mcu = decode_mcu_DC_refine;
+      else
+	entropy->pub.decode_mcu = decode_mcu_AC_refine;
+    }
+  } else {
+    /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
+     * This ought to be an error condition, but we make it a warning.
+     */
+    if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
+	(cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1))
+      WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
+    /* Select MCU decoding routine */
+    entropy->pub.decode_mcu = decode_mcu;
+  }
+
+  /* Allocate (once per table) & zero (every scan) the statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      tbl = compptr->dc_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->dc_stats[tbl] == NULL)
+	entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+      /* Initialize DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    if (! cinfo->progressive_mode || cinfo->Ss) {
+      tbl = compptr->ac_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->ac_stats[tbl] == NULL)
+	entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+    }
+  }
+
+  /* Initialize arithmetic decoding variables */
+  entropy->c = 0;
+  entropy->a = 0;
+  entropy->ct = -16;	/* force reading 2 initial bytes to fill C */
+
+  /* Initialize restart counter */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Module initialization routine for arithmetic entropy decoding.
+ */
+
+GLOBAL(void)
+jinit_arith_decoder (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy;
+  int i;
+
+  entropy = (arith_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				SIZEOF(arith_entropy_decoder));
+  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+  entropy->pub.start_pass = start_pass;
+
+  /* Mark tables unallocated; start_pass allocates them on first use */
+  for (i = 0; i < NUM_ARITH_TBLS; i++) {
+    entropy->dc_stats[i] = NULL;
+    entropy->ac_stats[i] = NULL;
+  }
+
+  /* Fixed-probability bin: state index 113 in jpeg_aritab, MPS sense 0 */
+  entropy->fixed_bin[0] = 113;
+
+  if (cinfo->progressive_mode) {
+    /* Create progression status table: one int per coef per component */
+    int *coef_bit_ptr, ci;
+    cinfo->coef_bits = (int (*)[DCTSIZE2])
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  cinfo->num_components*DCTSIZE2*SIZEOF(int));
+    coef_bit_ptr = & cinfo->coef_bits[0][0];
+    for (ci = 0; ci < cinfo->num_components; ci++)
+      for (i = 0; i < DCTSIZE2; i++)
+	*coef_bit_ptr++ = -1;	/* -1 = no scan has covered this coef yet */
+  }
+}
diff --git a/jpeg/jdatadst.c b/jpeg/jdatadst.c
index 1b54a25eb110..2f488696c3be 100644
--- a/jpeg/jdatadst.c
+++ b/jpeg/jdatadst.c
@@ -2,13 +2,14 @@
  * jdatadst.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains compression data destination routines for the case of
- * emitting JPEG data to a file (or any stdio stream).  While these routines
- * are sufficient for most applications, some will want to use a different
- * destination manager.
+ * emitting JPEG data to memory or to a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different destination manager.
  * IMPORTANT: we assume that fwrite() will correctly transcribe an array of
  * JOCTETs into 8-bit-wide elements on external storage.  If char is wider
  * than 8 bits on your machine, you may need to do some tweaking.
@@ -19,6 +20,11 @@
 #include "jpeglib.h"
 #include "jerror.h"
 
+#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
+extern void * malloc JPP((size_t size));
+extern void free JPP((void *ptr));
+#endif
+
 
 /* Expanded data destination object for stdio output */
 
@@ -34,6 +40,23 @@ typedef my_destination_mgr * my_dest_ptr;
 #define OUTPUT_BUF_SIZE  4096	/* choose an efficiently fwrite'able size */
 
 
+#if JPEG_LIB_VERSION >= 80
+/* Expanded data destination object for memory output */
+
+typedef struct {
+  struct jpeg_destination_mgr pub; /* public fields */
+
+  unsigned char ** outbuffer;	/* caller's buffer pointer, updated at end */
+  unsigned long * outsize;	/* caller's size: in = capacity, out = used */
+  unsigned char * newbuffer;	/* newly allocated buffer */
+  JOCTET * buffer;		/* start of buffer */
+  size_t bufsize;		/* current capacity of buffer, in bytes */
+} my_mem_destination_mgr;
+
+typedef my_mem_destination_mgr * my_mem_dest_ptr;
+#endif
+
+
 /*
  * Initialize destination --- called by jpeg_start_compress
  * before any data is actually written.
@@ -53,6 +76,14 @@ init_destination (j_compress_ptr cinfo)
   dest->pub.free_in_buffer = OUTPUT_BUF_SIZE;
 }
 
+#if JPEG_LIB_VERSION >= 80
+METHODDEF(void)
+init_mem_destination (j_compress_ptr cinfo)
+{
+  /* no work necessary here: jpeg_mem_dest already set up the buffer */
+}
+#endif
+
 
 /*
  * Empty the output buffer --- called whenever buffer fills up.
@@ -92,6 +123,38 @@ empty_output_buffer (j_compress_ptr cinfo)
   return TRUE;
 }
 
+#if JPEG_LIB_VERSION >= 80
+METHODDEF(boolean)
+empty_mem_output_buffer (j_compress_ptr cinfo)
+{
+  size_t nextsize;
+  JOCTET * nextbuffer;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+
+  /* Buffer is full: try to allocate a new buffer with double size */
+  nextsize = dest->bufsize * 2;
+  nextbuffer = malloc(nextsize);
+
+  if (nextbuffer == NULL)
+    ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+
+  MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
+  /* Release the previous buffer only if this module allocated it */
+  if (dest->newbuffer != NULL)
+    free(dest->newbuffer);
+
+  dest->newbuffer = nextbuffer;
+  /* Output resumes after the copied data; the added half is all free */
+  dest->pub.next_output_byte = nextbuffer + dest->bufsize;
+  dest->pub.free_in_buffer = dest->bufsize;
+
+  dest->buffer = nextbuffer;
+  dest->bufsize = nextsize;
+
+  return TRUE;
+}
+#endif
+
 
 /*
  * Terminate destination --- called by jpeg_finish_compress
@@ -119,6 +182,17 @@ term_destination (j_compress_ptr cinfo)
     ERREXIT(cinfo, JERR_FILE_WRITE);
 }
 
+#if JPEG_LIB_VERSION >= 80
+METHODDEF(void)
+term_mem_destination (j_compress_ptr cinfo)
+{
+  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+  /* Report the final buffer and the number of bytes written to the caller */
+  *dest->outbuffer = dest->buffer;
+  *dest->outsize = dest->bufsize - dest->pub.free_in_buffer;
+}
+#endif
+
 
 /*
  * Prepare for output to a stdio stream.
@@ -150,65 +224,59 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE * outfile)
   dest->outfile = outfile;
 }
 
-/*
- * term_destination_file_close --- called by jpeg_finish_compress
- * after all data has been written.  Usually needs to flush buffer.
- * also will need to close file
- * NB: *not* called by jpeg_abort or jpeg_destroy; surrounding
- * application must deal with any cleanup that should happen even
- * for error exit.
- */
-
-METHODDEF(void)
-term_destination_file_close(j_compress_ptr cinfo)
-{
-  my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
-  size_t datacount = OUTPUT_BUF_SIZE - dest->pub.free_in_buffer;
-
-  /* Write any data remaining in the buffer */
-  if (datacount > 0) {
-    if (JFWRITE(dest->outfile, dest->buffer, datacount) != datacount)
-      ERREXIT(cinfo, JERR_FILE_WRITE);
-  }
-  fflush(dest->outfile);
-  
-  /* Make sure we wrote the output file OK */
-  if (ferror(dest->outfile))
-    ERREXIT(cinfo, JERR_FILE_WRITE);
-  else
-      fclose(dest->outfile);
-}
-
-
-
-
 
+#if JPEG_LIB_VERSION >= 80
 /*
- * Prepare for output to a file from a char *
- * The caller is responsible
- * for closing it after finishing compression.
+ * Prepare for output to a memory buffer.
+ * The caller may supply its own initial buffer with appropriate size.
+ * Otherwise, or when the actual data output exceeds the given size,
+ * the library adapts the buffer size as necessary.
+ * The standard library functions malloc/free are used for allocating
+ * larger memory, so the buffer is available to the application after
+ * finishing compression, and then the application is responsible for
+ * freeing the requested memory.
  */
 
 GLOBAL(void)
-jpeg_file_dest (j_compress_ptr cinfo, char * outfile)
+jpeg_mem_dest (j_compress_ptr cinfo,
+	       unsigned char ** outbuffer, unsigned long * outsize)
 {
-  my_dest_ptr dest;
+  my_mem_dest_ptr dest;
+
+  if (outbuffer == NULL || outsize == NULL)	/* sanity check */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
   /* The destination object is made permanent so that multiple JPEG images
-   * can be written to the same file without re-executing jpeg_stdio_dest.
-   * This makes it dangerous to use this manager and a different destination
-   * manager serially with the same JPEG object, because their private object
-   * sizes may be different.  Caveat programmer.
+   * can be written to the same buffer without re-executing jpeg_mem_dest.
    */
   if (cinfo->dest == NULL) {	/* first time for this JPEG object? */
     cinfo->dest = (struct jpeg_destination_mgr *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				  SIZEOF(my_destination_mgr));
-  }
+				  SIZEOF(my_mem_destination_mgr));
+  } else if (cinfo->dest->init_destination != init_mem_destination) {
+    /* The existing object was set up by a different destination manager,
+     * so it may be smaller than my_mem_destination_mgr; refuse to reuse it.
+     */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+  }
 
-  dest = (my_dest_ptr) cinfo->dest;
-  dest->pub.init_destination = init_destination;
-  dest->pub.empty_output_buffer = empty_output_buffer;
-  dest->pub.term_destination = term_destination_file_close;
-  dest->outfile = fopen(outfile,"wb");
+  dest = (my_mem_dest_ptr) cinfo->dest;
+  dest->pub.init_destination = init_mem_destination;
+  dest->pub.empty_output_buffer = empty_mem_output_buffer;
+  dest->pub.term_destination = term_mem_destination;
+  dest->outbuffer = outbuffer;
+  dest->outsize = outsize;
+  dest->newbuffer = NULL;
+
+  if (*outbuffer == NULL || *outsize == 0) {
+    /* Allocate initial buffer */
+    dest->newbuffer = *outbuffer = malloc(OUTPUT_BUF_SIZE);
+    if (dest->newbuffer == NULL)
+      ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+    *outsize = OUTPUT_BUF_SIZE;
+  }
+
+  dest->pub.next_output_byte = dest->buffer = *outbuffer;
+  dest->pub.free_in_buffer = dest->bufsize = *outsize;
 }
+#endif
diff --git a/jpeg/jdatasrc.c b/jpeg/jdatasrc.c
index edc752bf5d8c..7609f763963e 100644
--- a/jpeg/jdatasrc.c
+++ b/jpeg/jdatasrc.c
@@ -2,13 +2,14 @@
  * jdatasrc.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009-2010 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains decompression data source routines for the case of
- * reading JPEG data from a file (or any stdio stream).  While these routines
- * are sufficient for most applications, some will want to use a different
- * source manager.
+ * reading JPEG data from memory or from a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different source manager.
  * IMPORTANT: we assume that fread() will correctly transcribe an array of
  * JOCTETs from 8-bit-wide elements on external storage.  If char is wider
  * than 8 bits on your machine, you may need to do some tweaking.
@@ -52,6 +53,14 @@ init_source (j_decompress_ptr cinfo)
   src->start_of_file = TRUE;
 }
 
+#if JPEG_LIB_VERSION >= 80
+METHODDEF(void)
+init_mem_source (j_decompress_ptr cinfo)
+{
+  /* no work necessary here: jpeg_mem_src already set up the whole buffer */
+}
+#endif
+
 
 /*
  * Fill the input buffer --- called whenever buffer is emptied.
@@ -111,6 +120,28 @@ fill_input_buffer (j_decompress_ptr cinfo)
   return TRUE;
 }
 
+#if JPEG_LIB_VERSION >= 80
+METHODDEF(boolean)
+fill_mem_input_buffer (j_decompress_ptr cinfo)
+{
+  static JOCTET mybuffer[4];
+
+  /* The whole JPEG data is expected to reside in the supplied memory
+   * buffer, so a request for more data means the input ended too early:
+   * issue a JWRN_JPEG_EOF warning and supply a fake EOI marker instead.
+   */
+  WARNMS(cinfo, JWRN_JPEG_EOF);
+  /* Insert a fake EOI marker */
+  mybuffer[0] = (JOCTET) 0xFF;
+  mybuffer[1] = (JOCTET) JPEG_EOI;
+
+  cinfo->src->next_input_byte = mybuffer;
+  cinfo->src->bytes_in_buffer = 2;
+
+  return TRUE;
+}
+#endif
+
 
 /*
  * Skip data --- used to skip over a potentially large amount of
@@ -127,22 +158,22 @@ fill_input_buffer (j_decompress_ptr cinfo)
 METHODDEF(void)
 skip_input_data (j_decompress_ptr cinfo, long num_bytes)
 {
-  my_src_ptr src = (my_src_ptr) cinfo->src;
+  struct jpeg_source_mgr * src = cinfo->src;
 
   /* Just a dumb implementation for now.  Could use fseek() except
    * it doesn't work on pipes.  Not clear that being smart is worth
    * any trouble anyway --- large skips are infrequent.
    */
   if (num_bytes > 0) {
-    while (num_bytes > (long) src->pub.bytes_in_buffer) {
-      num_bytes -= (long) src->pub.bytes_in_buffer;
-      (void) fill_input_buffer(cinfo);
+    while (num_bytes > (long) src->bytes_in_buffer) {
+      num_bytes -= (long) src->bytes_in_buffer;
+      (void) (*src->fill_input_buffer) (cinfo);
       /* note we assume that fill_input_buffer will never return FALSE,
        * so suspension need not be handled.
        */
     }
-    src->pub.next_input_byte += (size_t) num_bytes;
-    src->pub.bytes_in_buffer -= (size_t) num_bytes;
+    src->next_input_byte += (size_t) num_bytes;
+    src->bytes_in_buffer -= (size_t) num_bytes;
   }
 }
 
@@ -210,3 +241,40 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile)
   src->pub.bytes_in_buffer = 0; /* forces fill_input_buffer on first read */
   src->pub.next_input_byte = NULL; /* until buffer loaded */
 }
+
+
+#if JPEG_LIB_VERSION >= 80
+/*
+ * Prepare for input from a supplied memory buffer of insize bytes.
+ * The buffer must contain the whole JPEG data; NULL or empty input is fatal.
+ */
+
+GLOBAL(void)
+jpeg_mem_src (j_decompress_ptr cinfo,
+	      unsigned char * inbuffer, unsigned long insize)
+{
+  struct jpeg_source_mgr * src;
+
+  if (inbuffer == NULL || insize == 0)	/* Treat empty input as fatal error */
+    ERREXIT(cinfo, JERR_INPUT_EMPTY);
+
+  /* The source object is made permanent so that a series of JPEG images
+   * can be read from the same buffer by calling jpeg_mem_src only before
+   * the first one.
+   */
+  if (cinfo->src == NULL) {	/* first time for this JPEG object? */
+    cinfo->src = (struct jpeg_source_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+				  SIZEOF(struct jpeg_source_mgr));
+  }
+
+  src = cinfo->src;
+  src->init_source = init_mem_source;
+  src->fill_input_buffer = fill_mem_input_buffer;
+  src->skip_input_data = skip_input_data;
+  src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
+  src->term_source = term_source;
+  src->bytes_in_buffer = (size_t) insize; /* whole buffer readable up front */
+  src->next_input_byte = (JOCTET *) inbuffer;
+}
+#endif
diff --git a/jpeg/jdcoefct.c b/jpeg/jdcoefct.c
index 4938d20fcb65..48a9fc6f8332 100644
--- a/jpeg/jdcoefct.c
+++ b/jpeg/jdcoefct.c
@@ -2,6 +2,7 @@
  * jdcoefct.c
  *
  * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Copyright (C) 2010, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -17,6 +18,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"
 
 /* Block smoothing is only applicable for progressive JPEG, so: */
 #ifndef D_PROGRESSIVE_SUPPORTED
@@ -47,6 +49,9 @@ typedef struct {
    */
   JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
 
+  /* Temporary workspace for one MCU */
+  JCOEF * workspace;
+
 #ifdef D_MULTISCAN_FILES_SUPPORTED
   /* In multi-pass modes, we need a virtual block array for each component. */
   jvirt_barray_ptr whole_image[MAX_COMPONENTS];
@@ -187,7 +192,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
 						    : compptr->last_col_width;
 	output_ptr = output_buf[compptr->component_index] +
-	  yoffset * compptr->DCT_scaled_size;
+	  yoffset * compptr->_DCT_scaled_size;
 	start_col = MCU_col_num * compptr->MCU_sample_width;
 	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
 	  if (cinfo->input_iMCU_row < last_iMCU_row ||
@@ -197,11 +202,11 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	      (*inverse_DCT) (cinfo, compptr,
 			      (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
 			      output_ptr, output_col);
-	      output_col += compptr->DCT_scaled_size;
+	      output_col += compptr->_DCT_scaled_size;
 	    }
 	  }
 	  blkn += compptr->MCU_width;
-	  output_ptr += compptr->DCT_scaled_size;
+	  output_ptr += compptr->_DCT_scaled_size;
 	}
       }
     }
@@ -362,9 +367,9 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
 			output_ptr, output_col);
 	buffer_ptr++;
-	output_col += compptr->DCT_scaled_size;
+	output_col += compptr->_DCT_scaled_size;
       }
-      output_ptr += compptr->DCT_scaled_size;
+      output_ptr += compptr->_DCT_scaled_size;
     }
   }
 
@@ -471,13 +476,16 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
   jpeg_component_info *compptr;
   inverse_DCT_method_ptr inverse_DCT;
   boolean first_row, last_row;
-  JBLOCK workspace;
+  JCOEF * workspace;
   int *coef_bits;
   JQUANT_TBL *quanttbl;
   INT32 Q00,Q01,Q02,Q10,Q11,Q20, num;
   int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
   int Al, pred;
 
+  /* Keep a local variable to avoid looking it up more than once */
+  workspace = coef->workspace;
+
   /* Force some input to be done if we are getting ahead of the input. */
   while (cinfo->input_scan_number <= cinfo->output_scan_number &&
 	 ! cinfo->inputctl->eoi_reached) {
@@ -654,9 +662,9 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	DC4 = DC5; DC5 = DC6;
 	DC7 = DC8; DC8 = DC9;
 	buffer_ptr++, prev_block_row++, next_block_row++;
-	output_col += compptr->DCT_scaled_size;
+	output_col += compptr->_DCT_scaled_size;
       }
-      output_ptr += compptr->DCT_scaled_size;
+      output_ptr += compptr->_DCT_scaled_size;
     }
   }
 
@@ -733,4 +741,9 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
     coef->pub.decompress_data = decompress_onepass;
     coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */
   }
+
+  /* Allocate the workspace buffer */
+  coef->workspace = (JCOEF *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                SIZEOF(JCOEF) * DCTSIZE2);
 }
diff --git a/jpeg/jdcolor.c b/jpeg/jdcolor.c
index 30720c420274..bc73b3f4621e 100644
--- a/jpeg/jdcolor.c
+++ b/jpeg/jdcolor.c
@@ -2,6 +2,8 @@
  * jdcolor.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -11,7 +13,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jconfig.h"
+#include "jsimd.h"
 
 
 /* Private subobject */
@@ -19,15 +21,11 @@
 typedef struct {
   struct jpeg_color_deconverter pub; /* public fields */
 
-  /* These fields are not needed anymore as these are now static tables */
-
-#if 0
   /* Private state for YCC->RGB conversion */
   int * Cr_r_tab;		/* => table for Cr to R conversion */
   int * Cb_b_tab;		/* => table for Cb to B conversion */
   INT32 * Cr_g_tab;		/* => table for Cr to G conversion */
   INT32 * Cb_g_tab;		/* => table for Cb to G conversion */
-#endif
 } my_color_deconverter;
 
 typedef my_color_deconverter * my_cconvert_ptr;
@@ -66,191 +64,6 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #define ONE_HALF	((INT32) 1 << (SCALEBITS-1))
 #define FIX(x)		((INT32) ((x) * (1L<<SCALEBITS) + 0.5))
 
-/* Use static tables for color processing. */
-
-const int Cr_r_tab[(MAXJSAMPLE+1) * SIZEOF(int)] ={
-  0xffffff4dUL, 0xffffff4eUL, 0xffffff4fUL, 0xffffff51UL, 0xffffff52UL, 0xffffff54UL, 
-  0xffffff55UL, 0xffffff56UL, 0xffffff58UL, 0xffffff59UL, 0xffffff5bUL, 0xffffff5cUL, 
-  0xffffff5dUL, 0xffffff5fUL, 0xffffff60UL, 0xffffff62UL, 0xffffff63UL, 0xffffff64UL, 
-  0xffffff66UL, 0xffffff67UL, 0xffffff69UL, 0xffffff6aUL, 0xffffff6bUL, 0xffffff6dUL, 
-  0xffffff6eUL, 0xffffff70UL, 0xffffff71UL, 0xffffff72UL, 0xffffff74UL, 0xffffff75UL, 
-  0xffffff77UL, 0xffffff78UL, 0xffffff79UL, 0xffffff7bUL, 0xffffff7cUL, 0xffffff7eUL, 
-  0xffffff7fUL, 0xffffff80UL, 0xffffff82UL, 0xffffff83UL, 0xffffff85UL, 0xffffff86UL, 
-  0xffffff87UL, 0xffffff89UL, 0xffffff8aUL, 0xffffff8cUL, 0xffffff8dUL, 0xffffff8eUL, 
-  0xffffff90UL, 0xffffff91UL, 0xffffff93UL, 0xffffff94UL, 0xffffff95UL, 0xffffff97UL, 
-  0xffffff98UL, 0xffffff9aUL, 0xffffff9bUL, 0xffffff9cUL, 0xffffff9eUL, 0xffffff9fUL, 
-  0xffffffa1UL, 0xffffffa2UL, 0xffffffa3UL, 0xffffffa5UL, 0xffffffa6UL, 0xffffffa8UL, 
-  0xffffffa9UL, 0xffffffaaUL, 0xffffffacUL, 0xffffffadUL, 0xffffffafUL, 0xffffffb0UL, 
-  0xffffffb1UL, 0xffffffb3UL, 0xffffffb4UL, 0xffffffb6UL, 0xffffffb7UL, 0xffffffb8UL, 
-  0xffffffbaUL, 0xffffffbbUL, 0xffffffbdUL, 0xffffffbeUL, 0xffffffc0UL, 0xffffffc1UL, 
-  0xffffffc2UL, 0xffffffc4UL, 0xffffffc5UL, 0xffffffc7UL, 0xffffffc8UL, 0xffffffc9UL, 
-  0xffffffcbUL, 0xffffffccUL, 0xffffffceUL, 0xffffffcfUL, 0xffffffd0UL, 0xffffffd2UL, 
-  0xffffffd3UL, 0xffffffd5UL, 0xffffffd6UL, 0xffffffd7UL, 0xffffffd9UL, 0xffffffdaUL, 
-  0xffffffdcUL, 0xffffffddUL, 0xffffffdeUL, 0xffffffe0UL, 0xffffffe1UL, 0xffffffe3UL, 
-  0xffffffe4UL, 0xffffffe5UL, 0xffffffe7UL, 0xffffffe8UL, 0xffffffeaUL, 0xffffffebUL, 
-  0xffffffecUL, 0xffffffeeUL, 0xffffffefUL, 0xfffffff1UL, 0xfffffff2UL, 0xfffffff3UL, 
-  0xfffffff5UL, 0xfffffff6UL, 0xfffffff8UL, 0xfffffff9UL, 0xfffffffaUL, 0xfffffffcUL, 
-  0xfffffffdUL, 0xffffffffUL,       0x00UL,       0x01UL,       0x03UL,       0x04UL, 
-        0x06UL,       0x07UL,       0x08UL,       0x0aUL,       0x0bUL,       0x0dUL, 
-        0x0eUL,       0x0fUL,       0x11UL,       0x12UL,       0x14UL,       0x15UL, 
-        0x16UL,       0x18UL,       0x19UL,       0x1bUL,       0x1cUL,       0x1dUL, 
-        0x1fUL,       0x20UL,       0x22UL,       0x23UL,       0x24UL,       0x26UL, 
-        0x27UL,       0x29UL,       0x2aUL,       0x2bUL,       0x2dUL,       0x2eUL, 
-        0x30UL,       0x31UL,       0x32UL,       0x34UL,       0x35UL,       0x37UL, 
-        0x38UL,       0x39UL,       0x3bUL,       0x3cUL,       0x3eUL,       0x3fUL, 
-        0x40UL,       0x42UL,       0x43UL,       0x45UL,       0x46UL,       0x48UL, 
-        0x49UL,       0x4aUL,       0x4cUL,       0x4dUL,       0x4fUL,       0x50UL, 
-        0x51UL,       0x53UL,       0x54UL,       0x56UL,       0x57UL,       0x58UL, 
-        0x5aUL,       0x5bUL,       0x5dUL,       0x5eUL,       0x5fUL,       0x61UL, 
-        0x62UL,       0x64UL,       0x65UL,       0x66UL,       0x68UL,       0x69UL, 
-        0x6bUL,       0x6cUL,       0x6dUL,       0x6fUL,       0x70UL,       0x72UL, 
-        0x73UL,       0x74UL,       0x76UL,       0x77UL,       0x79UL,       0x7aUL, 
-        0x7bUL,       0x7dUL,       0x7eUL,       0x80UL,       0x81UL,       0x82UL, 
-        0x84UL,       0x85UL,       0x87UL,       0x88UL,       0x89UL,       0x8bUL, 
-        0x8cUL,       0x8eUL,       0x8fUL,       0x90UL,       0x92UL,       0x93UL, 
-        0x95UL,       0x96UL,       0x97UL,       0x99UL,       0x9aUL,       0x9cUL, 
-        0x9dUL,       0x9eUL,       0xa0UL,       0xa1UL,       0xa3UL,       0xa4UL, 
-        0xa5UL,       0xa7UL,       0xa8UL,       0xaaUL,       0xabUL,       0xacUL, 
-        0xaeUL,       0xafUL,       0xb1UL,       0xb2UL
-  };
-
-const int Cb_b_tab[(MAXJSAMPLE+1) * SIZEOF(int)] ={
-  0xffffff1dUL, 0xffffff1fUL, 0xffffff21UL, 0xffffff22UL, 0xffffff24UL, 0xffffff26UL, 
-  0xffffff28UL, 0xffffff2aUL, 0xffffff2bUL, 0xffffff2dUL, 0xffffff2fUL, 0xffffff31UL, 
-  0xffffff32UL, 0xffffff34UL, 0xffffff36UL, 0xffffff38UL, 0xffffff3aUL, 0xffffff3bUL, 
-  0xffffff3dUL, 0xffffff3fUL, 0xffffff41UL, 0xffffff42UL, 0xffffff44UL, 0xffffff46UL, 
-  0xffffff48UL, 0xffffff49UL, 0xffffff4bUL, 0xffffff4dUL, 0xffffff4fUL, 0xffffff51UL, 
-  0xffffff52UL, 0xffffff54UL, 0xffffff56UL, 0xffffff58UL, 0xffffff59UL, 0xffffff5bUL, 
-  0xffffff5dUL, 0xffffff5fUL, 0xffffff61UL, 0xffffff62UL, 0xffffff64UL, 0xffffff66UL, 
-  0xffffff68UL, 0xffffff69UL, 0xffffff6bUL, 0xffffff6dUL, 0xffffff6fUL, 0xffffff70UL, 
-  0xffffff72UL, 0xffffff74UL, 0xffffff76UL, 0xffffff78UL, 0xffffff79UL, 0xffffff7bUL, 
-  0xffffff7dUL, 0xffffff7fUL, 0xffffff80UL, 0xffffff82UL, 0xffffff84UL, 0xffffff86UL, 
-  0xffffff88UL, 0xffffff89UL, 0xffffff8bUL, 0xffffff8dUL, 0xffffff8fUL, 0xffffff90UL, 
-  0xffffff92UL, 0xffffff94UL, 0xffffff96UL, 0xffffff97UL, 0xffffff99UL, 0xffffff9bUL, 
-  0xffffff9dUL, 0xffffff9fUL, 0xffffffa0UL, 0xffffffa2UL, 0xffffffa4UL, 0xffffffa6UL, 
-  0xffffffa7UL, 0xffffffa9UL, 0xffffffabUL, 0xffffffadUL, 0xffffffaeUL, 0xffffffb0UL, 
-  0xffffffb2UL, 0xffffffb4UL, 0xffffffb6UL, 0xffffffb7UL, 0xffffffb9UL, 0xffffffbbUL, 
-  0xffffffbdUL, 0xffffffbeUL, 0xffffffc0UL, 0xffffffc2UL, 0xffffffc4UL, 0xffffffc6UL, 
-  0xffffffc7UL, 0xffffffc9UL, 0xffffffcbUL, 0xffffffcdUL, 0xffffffceUL, 0xffffffd0UL, 
-  0xffffffd2UL, 0xffffffd4UL, 0xffffffd5UL, 0xffffffd7UL, 0xffffffd9UL, 0xffffffdbUL, 
-  0xffffffddUL, 0xffffffdeUL, 0xffffffe0UL, 0xffffffe2UL, 0xffffffe4UL, 0xffffffe5UL, 
-  0xffffffe7UL, 0xffffffe9UL, 0xffffffebUL, 0xffffffedUL, 0xffffffeeUL, 0xfffffff0UL, 
-  0xfffffff2UL, 0xfffffff4UL, 0xfffffff5UL, 0xfffffff7UL, 0xfffffff9UL, 0xfffffffbUL, 
-  0xfffffffcUL, 0xfffffffeUL,       0x00UL,       0x02UL,       0x04UL,       0x05UL, 
-        0x07UL,       0x09UL,       0x0bUL,       0x0cUL,       0x0eUL,       0x10UL, 
-        0x12UL,       0x13UL,       0x15UL,       0x17UL,       0x19UL,       0x1bUL, 
-        0x1cUL,       0x1eUL,       0x20UL,       0x22UL,       0x23UL,       0x25UL, 
-        0x27UL,       0x29UL,       0x2bUL,       0x2cUL,       0x2eUL,       0x30UL, 
-        0x32UL,       0x33UL,       0x35UL,       0x37UL,       0x39UL,       0x3aUL, 
-        0x3cUL,       0x3eUL,       0x40UL,       0x42UL,       0x43UL,       0x45UL, 
-        0x47UL,       0x49UL,       0x4aUL,       0x4cUL,       0x4eUL,       0x50UL, 
-        0x52UL,       0x53UL,       0x55UL,       0x57UL,       0x59UL,       0x5aUL, 
-        0x5cUL,       0x5eUL,       0x60UL,       0x61UL,       0x63UL,       0x65UL, 
-        0x67UL,       0x69UL,       0x6aUL,       0x6cUL,       0x6eUL,       0x70UL, 
-        0x71UL,       0x73UL,       0x75UL,       0x77UL,       0x78UL,       0x7aUL, 
-        0x7cUL,       0x7eUL,       0x80UL,       0x81UL,       0x83UL,       0x85UL, 
-        0x87UL,       0x88UL,       0x8aUL,       0x8cUL,       0x8eUL,       0x90UL, 
-        0x91UL,       0x93UL,       0x95UL,       0x97UL,       0x98UL,       0x9aUL, 
-        0x9cUL,       0x9eUL,       0x9fUL,       0xa1UL,       0xa3UL,       0xa5UL, 
-        0xa7UL,       0xa8UL,       0xaaUL,       0xacUL,       0xaeUL,       0xafUL, 
-        0xb1UL,       0xb3UL,       0xb5UL,       0xb7UL,       0xb8UL,       0xbaUL, 
-        0xbcUL,       0xbeUL,       0xbfUL,       0xc1UL,       0xc3UL,       0xc5UL, 
-        0xc6UL,       0xc8UL,       0xcaUL,       0xccUL,       0xceUL,       0xcfUL, 
-        0xd1UL,       0xd3UL,       0xd5UL,       0xd6UL,       0xd8UL,       0xdaUL, 
-        0xdcUL,       0xdeUL,       0xdfUL,       0xe1UL
-  };
-
-const int Cr_g_tab[(MAXJSAMPLE+1) * SIZEOF(int)] ={
-    0x5b6900UL,   0x5ab22eUL,   0x59fb5cUL,   0x59448aUL,   0x588db8UL,   0x57d6e6UL, 
-    0x572014UL,   0x566942UL,   0x55b270UL,   0x54fb9eUL,   0x5444ccUL,   0x538dfaUL, 
-    0x52d728UL,   0x522056UL,   0x516984UL,   0x50b2b2UL,   0x4ffbe0UL,   0x4f450eUL, 
-    0x4e8e3cUL,   0x4dd76aUL,   0x4d2098UL,   0x4c69c6UL,   0x4bb2f4UL,   0x4afc22UL, 
-    0x4a4550UL,   0x498e7eUL,   0x48d7acUL,   0x4820daUL,   0x476a08UL,   0x46b336UL, 
-    0x45fc64UL,   0x454592UL,   0x448ec0UL,   0x43d7eeUL,   0x43211cUL,   0x426a4aUL, 
-    0x41b378UL,   0x40fca6UL,   0x4045d4UL,   0x3f8f02UL,   0x3ed830UL,   0x3e215eUL, 
-    0x3d6a8cUL,   0x3cb3baUL,   0x3bfce8UL,   0x3b4616UL,   0x3a8f44UL,   0x39d872UL, 
-    0x3921a0UL,   0x386aceUL,   0x37b3fcUL,   0x36fd2aUL,   0x364658UL,   0x358f86UL, 
-    0x34d8b4UL,   0x3421e2UL,   0x336b10UL,   0x32b43eUL,   0x31fd6cUL,   0x31469aUL, 
-    0x308fc8UL,   0x2fd8f6UL,   0x2f2224UL,   0x2e6b52UL,   0x2db480UL,   0x2cfdaeUL, 
-    0x2c46dcUL,   0x2b900aUL,   0x2ad938UL,   0x2a2266UL,   0x296b94UL,   0x28b4c2UL, 
-    0x27fdf0UL,   0x27471eUL,   0x26904cUL,   0x25d97aUL,   0x2522a8UL,   0x246bd6UL, 
-    0x23b504UL,   0x22fe32UL,   0x224760UL,   0x21908eUL,   0x20d9bcUL,   0x2022eaUL, 
-    0x1f6c18UL,   0x1eb546UL,   0x1dfe74UL,   0x1d47a2UL,   0x1c90d0UL,   0x1bd9feUL, 
-    0x1b232cUL,   0x1a6c5aUL,   0x19b588UL,   0x18feb6UL,   0x1847e4UL,   0x179112UL, 
-    0x16da40UL,   0x16236eUL,   0x156c9cUL,   0x14b5caUL,   0x13fef8UL,   0x134826UL, 
-    0x129154UL,   0x11da82UL,   0x1123b0UL,   0x106cdeUL,    0xfb60cUL,    0xeff3aUL, 
-     0xe4868UL,    0xd9196UL,    0xcdac4UL,    0xc23f2UL,    0xb6d20UL,    0xab64eUL, 
-     0x9ff7cUL,    0x948aaUL,    0x891d8UL,    0x7db06UL,    0x72434UL,    0x66d62UL, 
-     0x5b690UL,    0x4ffbeUL,    0x448ecUL,    0x3921aUL,    0x2db48UL,    0x22476UL, 
-     0x16da4UL,     0xb6d2UL,        0x0UL, 0xffff492eUL, 0xfffe925cUL, 0xfffddb8aUL, 
-  0xfffd24b8UL, 0xfffc6de6UL, 0xfffbb714UL, 0xfffb0042UL, 0xfffa4970UL, 0xfff9929eUL, 
-  0xfff8dbccUL, 0xfff824faUL, 0xfff76e28UL, 0xfff6b756UL, 0xfff60084UL, 0xfff549b2UL, 
-  0xfff492e0UL, 0xfff3dc0eUL, 0xfff3253cUL, 0xfff26e6aUL, 0xfff1b798UL, 0xfff100c6UL, 
-  0xfff049f4UL, 0xffef9322UL, 0xffeedc50UL, 0xffee257eUL, 0xffed6eacUL, 0xffecb7daUL, 
-  0xffec0108UL, 0xffeb4a36UL, 0xffea9364UL, 0xffe9dc92UL, 0xffe925c0UL, 0xffe86eeeUL, 
-  0xffe7b81cUL, 0xffe7014aUL, 0xffe64a78UL, 0xffe593a6UL, 0xffe4dcd4UL, 0xffe42602UL, 
-  0xffe36f30UL, 0xffe2b85eUL, 0xffe2018cUL, 0xffe14abaUL, 0xffe093e8UL, 0xffdfdd16UL, 
-  0xffdf2644UL, 0xffde6f72UL, 0xffddb8a0UL, 0xffdd01ceUL, 0xffdc4afcUL, 0xffdb942aUL, 
-  0xffdadd58UL, 0xffda2686UL, 0xffd96fb4UL, 0xffd8b8e2UL, 0xffd80210UL, 0xffd74b3eUL, 
-  0xffd6946cUL, 0xffd5dd9aUL, 0xffd526c8UL, 0xffd46ff6UL, 0xffd3b924UL, 0xffd30252UL, 
-  0xffd24b80UL, 0xffd194aeUL, 0xffd0dddcUL, 0xffd0270aUL, 0xffcf7038UL, 0xffceb966UL, 
-  0xffce0294UL, 0xffcd4bc2UL, 0xffcc94f0UL, 0xffcbde1eUL, 0xffcb274cUL, 0xffca707aUL, 
-  0xffc9b9a8UL, 0xffc902d6UL, 0xffc84c04UL, 0xffc79532UL, 0xffc6de60UL, 0xffc6278eUL, 
-  0xffc570bcUL, 0xffc4b9eaUL, 0xffc40318UL, 0xffc34c46UL, 0xffc29574UL, 0xffc1dea2UL, 
-  0xffc127d0UL, 0xffc070feUL, 0xffbfba2cUL, 0xffbf035aUL, 0xffbe4c88UL, 0xffbd95b6UL, 
-  0xffbcdee4UL, 0xffbc2812UL, 0xffbb7140UL, 0xffbaba6eUL, 0xffba039cUL, 0xffb94ccaUL, 
-  0xffb895f8UL, 0xffb7df26UL, 0xffb72854UL, 0xffb67182UL, 0xffb5bab0UL, 0xffb503deUL, 
-  0xffb44d0cUL, 0xffb3963aUL, 0xffb2df68UL, 0xffb22896UL, 0xffb171c4UL, 0xffb0baf2UL, 
-  0xffb00420UL, 0xffaf4d4eUL, 0xffae967cUL, 0xffaddfaaUL, 0xffad28d8UL, 0xffac7206UL, 
-  0xffabbb34UL, 0xffab0462UL, 0xffaa4d90UL, 0xffa996beUL, 0xffa8dfecUL, 0xffa8291aUL, 
-  0xffa77248UL, 0xffa6bb76UL, 0xffa604a4UL, 0xffa54dd2UL
- };
-
-const int Cb_g_tab[(MAXJSAMPLE+1) * SIZEOF(int)] ={
-    0x2c8d00UL,   0x2c34e6UL,   0x2bdcccUL,   0x2b84b2UL,   0x2b2c98UL,   0x2ad47eUL, 
-    0x2a7c64UL,   0x2a244aUL,   0x29cc30UL,   0x297416UL,   0x291bfcUL,   0x28c3e2UL, 
-    0x286bc8UL,   0x2813aeUL,   0x27bb94UL,   0x27637aUL,   0x270b60UL,   0x26b346UL, 
-    0x265b2cUL,   0x260312UL,   0x25aaf8UL,   0x2552deUL,   0x24fac4UL,   0x24a2aaUL, 
-    0x244a90UL,   0x23f276UL,   0x239a5cUL,   0x234242UL,   0x22ea28UL,   0x22920eUL, 
-    0x2239f4UL,   0x21e1daUL,   0x2189c0UL,   0x2131a6UL,   0x20d98cUL,   0x208172UL, 
-    0x202958UL,   0x1fd13eUL,   0x1f7924UL,   0x1f210aUL,   0x1ec8f0UL,   0x1e70d6UL, 
-    0x1e18bcUL,   0x1dc0a2UL,   0x1d6888UL,   0x1d106eUL,   0x1cb854UL,   0x1c603aUL, 
-    0x1c0820UL,   0x1bb006UL,   0x1b57ecUL,   0x1affd2UL,   0x1aa7b8UL,   0x1a4f9eUL, 
-    0x19f784UL,   0x199f6aUL,   0x194750UL,   0x18ef36UL,   0x18971cUL,   0x183f02UL, 
-    0x17e6e8UL,   0x178eceUL,   0x1736b4UL,   0x16de9aUL,   0x168680UL,   0x162e66UL, 
-    0x15d64cUL,   0x157e32UL,   0x152618UL,   0x14cdfeUL,   0x1475e4UL,   0x141dcaUL, 
-    0x13c5b0UL,   0x136d96UL,   0x13157cUL,   0x12bd62UL,   0x126548UL,   0x120d2eUL, 
-    0x11b514UL,   0x115cfaUL,   0x1104e0UL,   0x10acc6UL,   0x1054acUL,    0xffc92UL, 
-     0xfa478UL,    0xf4c5eUL,    0xef444UL,    0xe9c2aUL,    0xe4410UL,    0xdebf6UL, 
-     0xd93dcUL,    0xd3bc2UL,    0xce3a8UL,    0xc8b8eUL,    0xc3374UL,    0xbdb5aUL, 
-     0xb8340UL,    0xb2b26UL,    0xad30cUL,    0xa7af2UL,    0xa22d8UL,    0x9cabeUL, 
-     0x972a4UL,    0x91a8aUL,    0x8c270UL,    0x86a56UL,    0x8123cUL,    0x7ba22UL, 
-     0x76208UL,    0x709eeUL,    0x6b1d4UL,    0x659baUL,    0x601a0UL,    0x5a986UL, 
-     0x5516cUL,    0x4f952UL,    0x4a138UL,    0x4491eUL,    0x3f104UL,    0x398eaUL, 
-     0x340d0UL,    0x2e8b6UL,    0x2909cUL,    0x23882UL,    0x1e068UL,    0x1884eUL, 
-     0x13034UL,     0xd81aUL,     0x8000UL,     0x27e6UL, 0xffffcfccUL, 0xffff77b2UL,
-  0xffff1f98UL, 0xfffec77eUL, 0xfffe6f64UL, 0xfffe174aUL, 0xfffdbf30UL, 0xfffd6716UL,
-  0xfffd0efcUL, 0xfffcb6e2UL, 0xfffc5ec8UL, 0xfffc06aeUL, 0xfffbae94UL, 0xfffb567aUL,
-  0xfffafe60UL, 0xfffaa646UL, 0xfffa4e2cUL, 0xfff9f612UL, 0xfff99df8UL, 0xfff945deUL,
-  0xfff8edc4UL, 0xfff895aaUL, 0xfff83d90UL, 0xfff7e576UL, 0xfff78d5cUL, 0xfff73542UL,
-  0xfff6dd28UL, 0xfff6850eUL, 0xfff62cf4UL, 0xfff5d4daUL, 0xfff57cc0UL, 0xfff524a6UL,
-  0xfff4cc8cUL, 0xfff47472UL, 0xfff41c58UL, 0xfff3c43eUL, 0xfff36c24UL, 0xfff3140aUL,
-  0xfff2bbf0UL, 0xfff263d6UL, 0xfff20bbcUL, 0xfff1b3a2UL, 0xfff15b88UL, 0xfff1036eUL,
-  0xfff0ab54UL, 0xfff0533aUL, 0xffeffb20UL, 0xffefa306UL, 0xffef4aecUL, 0xffeef2d2UL,
-  0xffee9ab8UL, 0xffee429eUL, 0xffedea84UL, 0xffed926aUL, 0xffed3a50UL, 0xffece236UL,
-  0xffec8a1cUL, 0xffec3202UL, 0xffebd9e8UL, 0xffeb81ceUL, 0xffeb29b4UL, 0xffead19aUL,
-  0xffea7980UL, 0xffea2166UL, 0xffe9c94cUL, 0xffe97132UL, 0xffe91918UL, 0xffe8c0feUL,
-  0xffe868e4UL, 0xffe810caUL, 0xffe7b8b0UL, 0xffe76096UL, 0xffe7087cUL, 0xffe6b062UL,
-  0xffe65848UL, 0xffe6002eUL, 0xffe5a814UL, 0xffe54ffaUL, 0xffe4f7e0UL, 0xffe49fc6UL,
-  0xffe447acUL, 0xffe3ef92UL, 0xffe39778UL, 0xffe33f5eUL, 0xffe2e744UL, 0xffe28f2aUL,
-  0xffe23710UL, 0xffe1def6UL, 0xffe186dcUL, 0xffe12ec2UL, 0xffe0d6a8UL, 0xffe07e8eUL,
-  0xffe02674UL, 0xffdfce5aUL, 0xffdf7640UL, 0xffdf1e26UL, 0xffdec60cUL, 0xffde6df2UL,
-  0xffde15d8UL, 0xffddbdbeUL, 0xffdd65a4UL, 0xffdd0d8aUL, 0xffdcb570UL, 0xffdc5d56UL,
-  0xffdc053cUL, 0xffdbad22UL, 0xffdb5508UL, 0xffdafceeUL, 0xffdaa4d4UL, 0xffda4cbaUL,
-  0xffd9f4a0UL, 0xffd99c86UL, 0xffd9446cUL, 0xffd8ec52UL, 0xffd89438UL, 0xffd83c1eUL,
-  0xffd7e404UL, 0xffd78beaUL, 0xffd733d0UL, 0xffd6dbb6UL, 0xffd6839cUL, 0xffd62b82UL,
-  0xffd5d368UL, 0xffd57b4eUL, 0xffd52334UL, 0xffd4cb1aUL
- };
 
 /*
  * Initialize tables for YCC->RGB colorspace conversion.
@@ -259,10 +72,6 @@ const int Cb_g_tab[(MAXJSAMPLE+1) * SIZEOF(int)] ={
 LOCAL(void)
 build_ycc_rgb_table (j_decompress_ptr cinfo)
 {
-
-  /* The code below was used to generate the static tables above */
-
-#if 0
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   int i;
   INT32 x;
@@ -296,7 +105,6 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
     /* We also add in ONE_HALF so that need not do it in inner loop */
     cconvert->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
   }
-#endif /* 0 */
 }
 
 
@@ -318,12 +126,16 @@ ycc_rgb_convert (j_decompress_ptr cinfo,
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int y, cb, cr;
-  JSAMPLE * range_limit_y;
-  JSAMPROW outptr;
-  JSAMPROW inptr0, inptr1, inptr2;
-  JDIMENSION col;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
-  JSAMPLE * range_limit = cinfo->sample_range_limit;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  register int * Crrtab = cconvert->Cr_r_tab;
+  register int * Cbbtab = cconvert->Cb_b_tab;
+  register INT32 * Crgtab = cconvert->Cr_g_tab;
+  register INT32 * Cbgtab = cconvert->Cb_g_tab;
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
@@ -336,14 +148,13 @@ ycc_rgb_convert (j_decompress_ptr cinfo,
       y  = GETJSAMPLE(inptr0[col]);
       cb = GETJSAMPLE(inptr1[col]);
       cr = GETJSAMPLE(inptr2[col]);
-      range_limit_y = range_limit + y;
       /* Range-limiting is essential due to noise introduced by DCT losses. */
-      outptr[RGB_RED] =   range_limit_y[Cr_r_tab[cr]];
-      outptr[RGB_GREEN] = range_limit_y[
-			      ((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr],
+      outptr[rgb_red[cinfo->out_color_space]] =   range_limit[y + Crrtab[cr]];
+      outptr[rgb_green[cinfo->out_color_space]] = range_limit[y +
+			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
 						 SCALEBITS))];
-      outptr[RGB_BLUE] =  range_limit_y[Cb_b_tab[cb]];
-      outptr += RGB_PIXELSIZE;
+      outptr[rgb_blue[cinfo->out_color_space]] =  range_limit[y + Cbbtab[cb]];
+      outptr += rgb_pixelsize[cinfo->out_color_space];
     }
   }
 }
@@ -411,16 +222,20 @@ gray_rgb_convert (j_decompress_ptr cinfo,
 		  JSAMPARRAY output_buf, int num_rows)
 {
   register JSAMPROW inptr, outptr;
-  register JDIMENSION col;
+  JSAMPLE *maxinptr;
   JDIMENSION num_cols = cinfo->output_width;
+  int rindex = rgb_red[cinfo->out_color_space];
+  int gindex = rgb_green[cinfo->out_color_space];
+  int bindex = rgb_blue[cinfo->out_color_space];
+  int rgbstride = rgb_pixelsize[cinfo->out_color_space];
 
   while (--num_rows >= 0) {
     inptr = input_buf[0][input_row++];
+    maxinptr = &inptr[num_cols];
     outptr = *output_buf++;
-    for (col = 0; col < num_cols; col++) {
+    for (; inptr < maxinptr; inptr++, outptr += rgbstride) {
       /* We can dispense with GETJSAMPLE() here */
-      outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
-      outptr += RGB_PIXELSIZE;
+      outptr[rindex] = outptr[gindex] = outptr[bindex] = *inptr;
     }
   }
 }
@@ -430,6 +245,7 @@ gray_rgb_convert (j_decompress_ptr cinfo,
  * Adobe-style YCCK->CMYK conversion.
  * We convert YCbCr to R=1-C, G=1-M, and B=1-Y using the same
  * conversion as above, while passing K (black) unchanged.
+ * We assume build_ycc_rgb_table has been called.
  */
 
 METHODDEF(void)
@@ -445,6 +261,10 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  register int * Crrtab = cconvert->Cr_r_tab;
+  register int * Cbbtab = cconvert->Cb_b_tab;
+  register INT32 * Crgtab = cconvert->Cr_g_tab;
+  register INT32 * Cbgtab = cconvert->Cb_g_tab;
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
@@ -459,11 +279,11 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
       cb = GETJSAMPLE(inptr1[col]);
       cr = GETJSAMPLE(inptr2[col]);
       /* Range-limiting is essential due to noise introduced by DCT losses. */
-      outptr[0] = range_limit[MAXJSAMPLE - (y + Cr_r_tab[cr])];   /* red */
-      outptr[1] = range_limit[MAXJSAMPLE - (y +                   /* green */
-				  ((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr],
-                         SCALEBITS)))];
-      outptr[2] = range_limit[MAXJSAMPLE - (y + Cb_b_tab[cb])];   /* blue */
+      outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];	/* red */
+      outptr[1] = range_limit[MAXJSAMPLE - (y +			/* green */
+			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+						 SCALEBITS)))];
+      outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];	/* blue */
       /* K passes through unchanged */
       outptr[3] = inptr3[col];	/* don't need GETJSAMPLE here */
       outptr += 4;
@@ -543,13 +363,24 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
     break;
 
   case JCS_RGB:
-    cinfo->out_color_components = RGB_PIXELSIZE;
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+    cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space];
     if (cinfo->jpeg_color_space == JCS_YCbCr) {
-      cconvert->pub.color_convert = ycc_rgb_convert;
-      build_ycc_rgb_table(cinfo);
+      if (jsimd_can_ycc_rgb())
+        cconvert->pub.color_convert = jsimd_ycc_rgb_convert;
+      else {
+        cconvert->pub.color_convert = ycc_rgb_convert;
+        build_ycc_rgb_table(cinfo);
+      }
     } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
       cconvert->pub.color_convert = gray_rgb_convert;
-    } else if (cinfo->jpeg_color_space == JCS_RGB && RGB_PIXELSIZE == 3) {
+    } else if (cinfo->jpeg_color_space == cinfo->out_color_space &&
+      rgb_pixelsize[cinfo->out_color_space] == 3) {
       cconvert->pub.color_convert = null_convert;
     } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
diff --git a/jpeg/jdct.h b/jpeg/jdct.h
index 04192a266ae1..7b49a97526d0 100644
--- a/jpeg/jdct.h
+++ b/jpeg/jdct.h
@@ -23,18 +23,26 @@
  * have a range of +-8K for 8-bit data, +-128K for 12-bit data.  This
  * convention improves accuracy in integer implementations and saves some
  * work in floating-point ones.
- * Quantization of the output coefficients is done by jcdctmgr.c.
+ * Quantization of the output coefficients is done by jcdctmgr.c. This
+ * step requires an unsigned type and also one with twice the bits.
  */
 
 #if BITS_IN_JSAMPLE == 8
+#ifndef WITH_SIMD
 typedef int DCTELEM;		/* 16 or 32 bits is fine */
+typedef unsigned int UDCTELEM;
+typedef unsigned long long UDCTELEM2;
+#else
+typedef short DCTELEM;  /* prefer 16 bit with SIMD for parallelism */
+typedef unsigned short UDCTELEM;
+typedef unsigned int UDCTELEM2;
+#endif
 #else
 typedef INT32 DCTELEM;		/* must have 32 bits */
+typedef UINT32 UDCTELEM;
+typedef unsigned long long UDCTELEM2;
 #endif
 
-typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
-typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
-
 
 /*
  * An inverse DCT routine is given a pointer to the input JBLOCK and a pointer
diff --git a/jpeg/jddctmgr.c b/jpeg/jddctmgr.c
index 3a0e8fd399a1..044e46947749 100644
--- a/jpeg/jddctmgr.c
+++ b/jpeg/jddctmgr.c
@@ -2,6 +2,8 @@
  * jddctmgr.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2010, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -19,9 +21,9 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
-#ifdef HAVE_SSE2_INTRINSICS
-extern int SSE2Available;
-#endif
+#include "jsimddct.h"
+#include "jpegcomp.h"
+
 
 /*
  * The decompressor input side (jdinput.c) saves away the appropriate
@@ -80,14 +82,6 @@ typedef union {
 #endif
 #endif
 
-GLOBAL(void)
-jpeg_idct_islow_sse2 (
-	j_decompress_ptr cinfo, 
-	jpeg_component_info * compptr,
-	JCOEFPTR coef_block,
-	JSAMPARRAY output_buf, 
-	JDIMENSION output_col);
-
 
 /*
  * Prepare for an output pass.
@@ -108,18 +102,24 @@ start_pass (j_decompress_ptr cinfo)
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Select the proper IDCT routine for this component's scaling */
-    switch (compptr->DCT_scaled_size) {
+    switch (compptr->_DCT_scaled_size) {
 #ifdef IDCT_SCALING_SUPPORTED
     case 1:
       method_ptr = jpeg_idct_1x1;
       method = JDCT_ISLOW;	/* jidctred uses islow-style table */
       break;
     case 2:
-      method_ptr = jpeg_idct_2x2;
+      if (jsimd_can_idct_2x2())
+        method_ptr = jsimd_idct_2x2;
+      else
+        method_ptr = jpeg_idct_2x2;
       method = JDCT_ISLOW;	/* jidctred uses islow-style table */
       break;
     case 4:
-      method_ptr = jpeg_idct_4x4;
+      if (jsimd_can_idct_4x4())
+        method_ptr = jsimd_idct_4x4;
+      else
+        method_ptr = jpeg_idct_4x4;
       method = JDCT_ISLOW;	/* jidctred uses islow-style table */
       break;
 #endif
@@ -127,47 +127,28 @@ start_pass (j_decompress_ptr cinfo)
       switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
       case JDCT_ISLOW:
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
-		if(SSE2Available == 1)
-		{
-			method_ptr = jpeg_idct_islow_sse2;
-			method = JDCT_ISLOW;
-		}
-		else
-		{
-			method_ptr = jpeg_idct_islow;
-			method = JDCT_ISLOW;
-		}
-#else
-		method_ptr = jpeg_idct_islow;
-		method = JDCT_ISLOW;
-		  
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
+	if (jsimd_can_idct_islow())
+	  method_ptr = jsimd_idct_islow;
+	else
+	  method_ptr = jpeg_idct_islow;
+	method = JDCT_ISLOW;
 	break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
       case JDCT_IFAST:
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
-		if (SSE2Available==1) 
-		{
-			method_ptr = jpeg_idct_islow_sse2;
-			method = JDCT_ISLOW;
-		}
-		else
-		{
-			method_ptr = jpeg_idct_ifast;
-			method = JDCT_IFAST;
-		}
-#else
-		method_ptr = jpeg_idct_ifast;
-		method = JDCT_IFAST;
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
+	if (jsimd_can_idct_ifast())
+	  method_ptr = jsimd_idct_ifast;
+	else
+	  method_ptr = jpeg_idct_ifast;
+	method = JDCT_IFAST;
 	break;
-
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
       case JDCT_FLOAT:
-	method_ptr = jpeg_idct_float;
+	if (jsimd_can_idct_float())
+	  method_ptr = jsimd_idct_float;
+	else
+	  method_ptr = jpeg_idct_float;
 	method = JDCT_FLOAT;
 	break;
 #endif
@@ -177,7 +158,7 @@ start_pass (j_decompress_ptr cinfo)
       }
       break;
     default:
-      ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->DCT_scaled_size);
+      ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->_DCT_scaled_size);
       break;
     }
     idct->pub.inverse_DCT[ci] = method_ptr;
diff --git a/jpeg/jdinput.c b/jpeg/jdinput.c
index 0c2ac8f120bc..9fcd089d3cf6 100644
--- a/jpeg/jdinput.c
+++ b/jpeg/jdinput.c
@@ -2,6 +2,8 @@
  * jdinput.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
+ * Copyright (C) 2010, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -14,6 +16,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"
 
 
 /* Private state */
@@ -35,6 +38,79 @@ METHODDEF(int) consume_markers JPP((j_decompress_ptr cinfo));
  * Routines to calculate various quantities related to the size of the image.
  */
 
+
+#if JPEG_LIB_VERSION >= 80
+/*
+ * Compute output image dimensions and related values.
+ * NOTE: this is exported for possible use by application.
+ * Hence it mustn't do anything that can't be done twice.
+ */
+
+GLOBAL(void)
+jpeg_core_output_dimensions (j_decompress_ptr cinfo)
+/* Derive output_width/output_height and the DCT_{h,v}_scaled_size fields from
+ * scale_num, scale_denom and block_size, ahead of the master selection phase.
+ * This function is used for transcoding and full decompression. */
+{
+#ifdef IDCT_SCALING_SUPPORTED
+  int ci;
+  jpeg_component_info *compptr;
+
+  /* Take the first N in {1,2,4,8} with scale_num*block_size <= scale_denom*N. */
+  if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom) {
+    /* Provide 1/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 1;
+    cinfo->min_DCT_v_scaled_size = 1;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 2) {
+    /* Provide 2/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 2L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 2L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 2;
+    cinfo->min_DCT_v_scaled_size = 2;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 4) {
+    /* Provide 4/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 4L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 4L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 4;
+    cinfo->min_DCT_v_scaled_size = 4;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 8) {
+    /* Provide 8/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 8L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 8L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 8;
+    cinfo->min_DCT_v_scaled_size = 8;
+  } /* NOTE(review): no final else -- a ratio above 8/block_size assigns nothing here; confirm intended */
+  /* Copy the image-wide scaled sizes chosen above into every component */
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size;
+    compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size;
+  }
+
+#else /* !IDCT_SCALING_SUPPORTED */
+
+  /* Hardwire it to "no scaling" */
+  cinfo->output_width = cinfo->image_width;
+  cinfo->output_height = cinfo->image_height;
+  /* jdinput.c has already initialized DCT_scaled_size,
+   * and has computed unscaled downsampled_width and downsampled_height.
+   */
+
+#endif /* IDCT_SCALING_SUPPORTED */
+}
+#endif
+
+
 LOCAL(void)
 initial_setup (j_decompress_ptr cinfo)
 /* Called once, when first SOS marker is reached */
@@ -70,16 +146,30 @@ initial_setup (j_decompress_ptr cinfo)
 				   compptr->v_samp_factor);
   }
 
+#if JPEG_LIB_VERSION >=80
+    cinfo->block_size = DCTSIZE;
+    cinfo->natural_order = jpeg_natural_order;
+    cinfo->lim_Se = DCTSIZE2-1;
+#endif
+
   /* We initialize DCT_scaled_size and min_DCT_scaled_size to DCTSIZE.
    * In the full decompressor, this will be overridden by jdmaster.c;
    * but in the transcoder, jdmaster.c is not used, so we must do it here.
    */
+#if JPEG_LIB_VERSION >= 70
+  cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = DCTSIZE;
+#else
   cinfo->min_DCT_scaled_size = DCTSIZE;
+#endif
 
   /* Compute dimensions of components */
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
+#if JPEG_LIB_VERSION >= 70
+    compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = DCTSIZE;
+#else
     compptr->DCT_scaled_size = DCTSIZE;
+#endif
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
@@ -138,7 +228,7 @@ per_scan_setup (j_decompress_ptr cinfo)
     compptr->MCU_width = 1;
     compptr->MCU_height = 1;
     compptr->MCU_blocks = 1;
-    compptr->MCU_sample_width = compptr->DCT_scaled_size;
+    compptr->MCU_sample_width = compptr->_DCT_scaled_size;
     compptr->last_col_width = 1;
     /* For noninterleaved scans, it is convenient to define last_row_height
      * as the number of block rows present in the last iMCU row.
@@ -174,7 +264,7 @@ per_scan_setup (j_decompress_ptr cinfo)
       compptr->MCU_width = compptr->h_samp_factor;
       compptr->MCU_height = compptr->v_samp_factor;
       compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
-      compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_scaled_size;
+      compptr->MCU_sample_width = compptr->MCU_width * compptr->_DCT_scaled_size;
       /* Figure number of non-dummy blocks in last MCU column & row */
       tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
       if (tmp == 0) tmp = compptr->MCU_width;
diff --git a/jpeg/jdmainct.c b/jpeg/jdmainct.c
index 13c956f5deb7..67f62153e672 100644
--- a/jpeg/jdmainct.c
+++ b/jpeg/jdmainct.c
@@ -2,6 +2,7 @@
  * jdmainct.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright (C) 2010, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -16,6 +17,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"
 
 
 /*
@@ -161,7 +163,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
 {
   my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->_min_DCT_scaled_size;
   jpeg_component_info *compptr;
   JSAMPARRAY xbuf;
 
@@ -175,8 +177,8 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
+    rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+      cinfo->_min_DCT_scaled_size; /* height of a row group of component */
     /* Get space for pointer lists --- M+4 row groups in each list.
      * We alloc both pointer lists with one call to save a few cycles.
      */
@@ -202,14 +204,14 @@ make_funny_pointers (j_decompress_ptr cinfo)
 {
   my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci, i, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->_min_DCT_scaled_size;
   jpeg_component_info *compptr;
   JSAMPARRAY buf, xbuf0, xbuf1;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
+    rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+      cinfo->_min_DCT_scaled_size; /* height of a row group of component */
     xbuf0 = main->xbuffer[0][ci];
     xbuf1 = main->xbuffer[1][ci];
     /* First copy the workspace pointers as-is */
@@ -242,14 +244,14 @@ set_wraparound_pointers (j_decompress_ptr cinfo)
 {
   my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci, i, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->_min_DCT_scaled_size;
   jpeg_component_info *compptr;
   JSAMPARRAY xbuf0, xbuf1;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
+    rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+      cinfo->_min_DCT_scaled_size; /* height of a row group of component */
     xbuf0 = main->xbuffer[0][ci];
     xbuf1 = main->xbuffer[1][ci];
     for (i = 0; i < rgroup; i++) {
@@ -277,8 +279,8 @@ set_bottom_pointers (j_decompress_ptr cinfo)
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Count sample rows in one iMCU row and in one row group */
-    iMCUheight = compptr->v_samp_factor * compptr->DCT_scaled_size;
-    rgroup = iMCUheight / cinfo->min_DCT_scaled_size;
+    iMCUheight = compptr->v_samp_factor * compptr->_DCT_scaled_size;
+    rgroup = iMCUheight / cinfo->_min_DCT_scaled_size;
     /* Count nondummy sample rows remaining for this component */
     rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight);
     if (rows_left == 0) rows_left = iMCUheight;
@@ -357,7 +359,7 @@ process_data_simple_main (j_decompress_ptr cinfo,
   }
 
   /* There are always min_DCT_scaled_size row groups in an iMCU row. */
-  rowgroups_avail = (JDIMENSION) cinfo->min_DCT_scaled_size;
+  rowgroups_avail = (JDIMENSION) cinfo->_min_DCT_scaled_size;
   /* Note: at the bottom of the image, we may pass extra garbage row groups
    * to the postprocessor.  The postprocessor has to check for bottom
    * of image anyway (at row resolution), so no point in us doing it too.
@@ -417,7 +419,7 @@ process_data_context_main (j_decompress_ptr cinfo,
   case CTX_PREPARE_FOR_IMCU:
     /* Prepare to process first M-1 row groups of this iMCU row */
     main->rowgroup_ctr = 0;
-    main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size - 1);
+    main->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size - 1);
     /* Check for bottom of image: if so, tweak pointers to "duplicate"
      * the last sample row, and adjust rowgroups_avail to ignore padding rows.
      */
@@ -440,8 +442,8 @@ process_data_context_main (j_decompress_ptr cinfo,
     main->buffer_full = FALSE;
     /* Still need to process last row group of this iMCU row, */
     /* which is saved at index M+1 of the other xbuffer */
-    main->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_scaled_size + 1);
-    main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size + 2);
+    main->rowgroup_ctr = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 1);
+    main->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 2);
     main->context_state = CTX_POSTPONED_ROW;
   }
 }
@@ -492,21 +494,21 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
    * ngroups is the number of row groups we need.
    */
   if (cinfo->upsample->need_context_rows) {
-    if (cinfo->min_DCT_scaled_size < 2) /* unsupported, see comments above */
+    if (cinfo->_min_DCT_scaled_size < 2) /* unsupported, see comments above */
       ERREXIT(cinfo, JERR_NOTIMPL);
     alloc_funny_pointers(cinfo); /* Alloc space for xbuffer[] lists */
-    ngroups = cinfo->min_DCT_scaled_size + 2;
+    ngroups = cinfo->_min_DCT_scaled_size + 2;
   } else {
-    ngroups = cinfo->min_DCT_scaled_size;
+    ngroups = cinfo->_min_DCT_scaled_size;
   }
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
+    rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+      cinfo->_min_DCT_scaled_size; /* height of a row group of component */
     main->buffer[ci] = (*cinfo->mem->alloc_sarray)
 			((j_common_ptr) cinfo, JPOOL_IMAGE,
-			 compptr->width_in_blocks * compptr->DCT_scaled_size,
+			 compptr->width_in_blocks * compptr->_DCT_scaled_size,
 			 (JDIMENSION) (rgroup * ngroups));
   }
 }
diff --git a/jpeg/jdmarker.c b/jpeg/jdmarker.c
index c0fd8c2a8c0d..f4cca8cc835c 100644
--- a/jpeg/jdmarker.c
+++ b/jpeg/jdmarker.c
@@ -79,7 +79,9 @@ typedef enum {			/* JPEG marker codes */
   M_JPG13 = 0xfd,
   M_COM   = 0xfe,
   
-  M_TEM   = 0x01
+  M_TEM   = 0x01,
+  
+  M_ERROR = 0x100
 } JPEG_MARKER;
 
 
diff --git a/jpeg/jdmaster.c b/jpeg/jdmaster.c
index 8d692350dfcb..14520da884c7 100644
--- a/jpeg/jdmaster.c
+++ b/jpeg/jdmaster.c
@@ -2,6 +2,7 @@
  * jdmaster.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright (C) 2009-2010, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -14,105 +15,8 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"
 
-/* Use static array */
-
-const JSAMPLE static_range_table[ (5 * (MAXJSAMPLE+1) + CENTERJSAMPLE) * SIZEOF(JSAMPLE) ]={
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 
- 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 
- 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 
- 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 
- 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 
- 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 
- 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 
- 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 
- 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 
- 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 
- 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 
- 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 
- 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 
- 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 
- 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 
- 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 
- 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 
- 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 
- 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 
- 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 
- 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 
- 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 
- 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 
- 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 
- 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 
- 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f
-};
 
 /* Private state */
 
@@ -147,8 +51,14 @@ use_merged_upsample (j_decompress_ptr cinfo)
     return FALSE;
   /* jdmerge.c only supports YCC=>RGB color conversion */
   if (cinfo->jpeg_color_space != JCS_YCbCr || cinfo->num_components != 3 ||
-      cinfo->out_color_space != JCS_RGB ||
-      cinfo->out_color_components != RGB_PIXELSIZE)
+      (cinfo->out_color_space != JCS_RGB &&
+      cinfo->out_color_space != JCS_EXT_RGB &&
+      cinfo->out_color_space != JCS_EXT_RGBX &&
+      cinfo->out_color_space != JCS_EXT_BGR &&
+      cinfo->out_color_space != JCS_EXT_BGRX &&
+      cinfo->out_color_space != JCS_EXT_XBGR &&
+      cinfo->out_color_space != JCS_EXT_XRGB) ||
+      cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space])
     return FALSE;
   /* and it only handles 2h1v or 2h2v sampling ratios */
   if (cinfo->comp_info[0].h_samp_factor != 2 ||
@@ -159,9 +69,9 @@ use_merged_upsample (j_decompress_ptr cinfo)
       cinfo->comp_info[2].v_samp_factor != 1)
     return FALSE;
   /* furthermore, it doesn't work if we've scaled the IDCTs differently */
-  if (cinfo->comp_info[0].DCT_scaled_size != cinfo->min_DCT_scaled_size ||
-      cinfo->comp_info[1].DCT_scaled_size != cinfo->min_DCT_scaled_size ||
-      cinfo->comp_info[2].DCT_scaled_size != cinfo->min_DCT_scaled_size)
+  if (cinfo->comp_info[0]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
+      cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
+      cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
     return FALSE;
   /* ??? also need to test for upsample-time rescaling, when & if supported */
   return TRUE;			/* by golly, it'll work... */
@@ -200,26 +110,42 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
       jdiv_round_up((long) cinfo->image_width, 8L);
     cinfo->output_height = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height, 8L);
+#if JPEG_LIB_VERSION >= 70
+    cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = 1;
+#else
     cinfo->min_DCT_scaled_size = 1;
+#endif
   } else if (cinfo->scale_num * 4 <= cinfo->scale_denom) {
     /* Provide 1/4 scaling */
     cinfo->output_width = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width, 4L);
     cinfo->output_height = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height, 4L);
+#if JPEG_LIB_VERSION >= 70
+    cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = 2;
+#else
     cinfo->min_DCT_scaled_size = 2;
+#endif
   } else if (cinfo->scale_num * 2 <= cinfo->scale_denom) {
     /* Provide 1/2 scaling */
     cinfo->output_width = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width, 2L);
     cinfo->output_height = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height, 2L);
+#if JPEG_LIB_VERSION >= 70
+    cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = 4;
+#else
     cinfo->min_DCT_scaled_size = 4;
+#endif
   } else {
     /* Provide 1/1 scaling */
     cinfo->output_width = cinfo->image_width;
     cinfo->output_height = cinfo->image_height;
+#if JPEG_LIB_VERSION >= 70
+    cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = DCTSIZE;
+#else
     cinfo->min_DCT_scaled_size = DCTSIZE;
+#endif
   }
   /* In selecting the actual DCT scaling for each component, we try to
    * scale up the chroma components via IDCT scaling rather than upsampling.
@@ -228,15 +154,19 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
    */
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    int ssize = cinfo->min_DCT_scaled_size;
+    int ssize = cinfo->_min_DCT_scaled_size;
     while (ssize < DCTSIZE &&
 	   (compptr->h_samp_factor * ssize * 2 <=
-	    cinfo->max_h_samp_factor * cinfo->min_DCT_scaled_size) &&
+	    cinfo->max_h_samp_factor * cinfo->_min_DCT_scaled_size) &&
 	   (compptr->v_samp_factor * ssize * 2 <=
-	    cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size)) {
+	    cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size)) {
       ssize = ssize * 2;
     }
+#if JPEG_LIB_VERSION >= 70
+    compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = ssize;
+#else
     compptr->DCT_scaled_size = ssize;
+#endif
   }
 
   /* Recompute downsampled dimensions of components;
@@ -247,11 +177,11 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
     /* Size in samples, after IDCT scaling */
     compptr->downsampled_width = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width *
-		    (long) (compptr->h_samp_factor * compptr->DCT_scaled_size),
+		    (long) (compptr->h_samp_factor * compptr->_DCT_scaled_size),
 		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
     compptr->downsampled_height = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height *
-		    (long) (compptr->v_samp_factor * compptr->DCT_scaled_size),
+		    (long) (compptr->v_samp_factor * compptr->_DCT_scaled_size),
 		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
   }
 
@@ -273,10 +203,14 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
     cinfo->out_color_components = 1;
     break;
   case JCS_RGB:
-#if RGB_PIXELSIZE != 3
-    cinfo->out_color_components = RGB_PIXELSIZE;
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+    cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space];
     break;
-#endif /* else share code with YCbCr */
   case JCS_YCbCr:
     cinfo->out_color_components = 3;
     break;
@@ -346,14 +280,6 @@ LOCAL(void)
 prepare_range_limit_table (j_decompress_ptr cinfo)
 /* Allocate and fill in the sample_range_limit table */
 {
-  /* Use a static table and allow negative subscripts of simple table */
-
-  cinfo->sample_range_limit = (JSAMPLE *) static_range_table + (MAXJSAMPLE+1);
-
-  /* This code is used to create the values for the static table used above */
-
-#if 0
-
   JSAMPLE * table;
   int i;
 
@@ -376,8 +302,6 @@ prepare_range_limit_table (j_decompress_ptr cinfo)
 	  (2 * (MAXJSAMPLE+1) - CENTERJSAMPLE) * SIZEOF(JSAMPLE));
   MEMCOPY(table + (4 * (MAXJSAMPLE+1) - CENTERJSAMPLE),
 	  cinfo->sample_range_limit, CENTERJSAMPLE * SIZEOF(JSAMPLE));
-
-#endif /* 0 */
 }
 
 
@@ -481,7 +405,11 @@ master_selection (j_decompress_ptr cinfo)
   jinit_inverse_dct(cinfo);
   /* Entropy decoding: either Huffman or arithmetic coding. */
   if (cinfo->arith_code) {
+#ifdef D_ARITH_CODING_SUPPORTED
+    jinit_arith_decoder(cinfo);
+#else
     ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+#endif
   } else {
     if (cinfo->progressive_mode) {
 #ifdef D_PROGRESSIVE_SUPPORTED
diff --git a/jpeg/jdmerge.c b/jpeg/jdmerge.c
index 3239ddbde82d..edf061a737b1 100644
--- a/jpeg/jdmerge.c
+++ b/jpeg/jdmerge.c
@@ -2,6 +2,8 @@
  * jdmerge.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -35,22 +37,10 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"
 
 #ifdef UPSAMPLE_MERGING_SUPPORTED
 
-#ifdef HAVE_MMX_INTEL_MNEMONICS
-  __int64 const1 = 0x59BA0000D24B59BA;       // Cr_r Cr_b Cr_g Cr_r
-  __int64 const2 = 0x00007168E9FA0000;		 // Cb-r Cb_b Cb_g Cb_r
-  __int64 const5 = 0x0000D24B59BA0000;		 // Cr_b Cr_g Cr_r Cr_b
-  __int64 const6 = 0x7168E9FA00007168;		 // Cb_b Cb_g Cb_r Cb_b
-
-  // constants for factors (One_Half/fix(x)) << 2
-
-  __int64 const05 = 0x0001000000000001;	// Cr_r Cr_b Cr_g Cr_r
-  __int64 const15 = 0x00000001FFFA0000;	// Cb-r Cb_b Cb_g Cb_r
-  __int64 const45 = 0x0000000000010000;	// Cr_b Cr_g Cr_r Cr_b
-  __int64 const55 = 0x0001FFFA00000001;	// Cb_b Cb_g Cb_r Cb_b
-#endif
 
 /* Private subobject */
 
@@ -240,9 +230,7 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
 		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
 		      JSAMPARRAY output_buf)
 {
- 
-
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr;
@@ -270,15 +258,15 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
     cblue = Cbbtab[cb];
     /* Fetch 2 Y values and emit 2 pixels */
     y  = GETJSAMPLE(*inptr0++);
-    outptr[RGB_RED] =   range_limit[y + cred];
-    outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE] =  range_limit[y + cblue];
-    outptr += RGB_PIXELSIZE;
+    outptr[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
+    outptr += rgb_pixelsize[cinfo->out_color_space];
     y  = GETJSAMPLE(*inptr0++);
-    outptr[RGB_RED] =   range_limit[y + cred];
-    outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE] =  range_limit[y + cblue];
-    outptr += RGB_PIXELSIZE;
+    outptr[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
+    outptr += rgb_pixelsize[cinfo->out_color_space];
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
@@ -288,9 +276,9 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
     cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     y  = GETJSAMPLE(*inptr0);
-    outptr[RGB_RED] =   range_limit[y + cred];
-    outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE] =  range_limit[y + cblue];
+    outptr[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
   }
 }
 
@@ -299,614 +287,6 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
  * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
  */
 
-#ifdef HAVE_MMX_INTEL_MNEMONICS
-__inline METHODDEF(void)
-h2v2_merged_upsample_orig (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf);
-__inline METHODDEF(void)
-h2v2_merged_upsample_mmx (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf);
-#endif
- 
-METHODDEF(void)
-h2v2_merged_upsample (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf);
-
-#ifdef HAVE_MMX_INTEL_MNEMONICS
-METHODDEF(void)
-h2v2_merged_upsample (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf)
-{
-if (MMXAvailable && (cinfo->image_width >= 8))
-	h2v2_merged_upsample_mmx (cinfo, input_buf, in_row_group_ctr, output_buf);
-else
-	h2v2_merged_upsample_orig (cinfo, input_buf, in_row_group_ctr, output_buf);
-
-}
-
-__inline METHODDEF(void)
-h2v2_merged_upsample_orig (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf)
-{
-
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-  register int y, cred, cgreen, cblue;
-  int cb, cr;
-  register JSAMPROW outptr0, outptr1;
-  JSAMPROW inptr00, inptr01, inptr1, inptr2;
-  JDIMENSION col;
-  /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
-  SHIFT_TEMPS
-
-  inptr00 = input_buf[0][in_row_group_ctr*2];
-  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
-  inptr1 = input_buf[1][in_row_group_ctr];
-  inptr2 = input_buf[2][in_row_group_ctr];
-  outptr0 = output_buf[0];
-  outptr1 = output_buf[1];
-  /* Loop for each group of output pixels */
-  for (col = cinfo->output_width >> 1; col > 0; col--) {
-    /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-    /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
-  }
-  /* If image width is odd, do the last output column separately */
-  if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr00);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    y  = GETJSAMPLE(*inptr01);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-  }
-}
-
-/*
- * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
- */
-__inline METHODDEF(void)
-h2v2_merged_upsample_mmx (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf)
-{
-	// added for MMX
-  __int64 const128 = 0x0080008000800080;
-  __int64 empty = 0x0000000000000000;
-  __int64 davemask = 0x0000FFFFFFFF0000;
-  ////////////////////////////////
-
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-  register int y, cred, cgreen, cblue;
-  int cb, cr;
-  register JSAMPROW outptr0, outptr1;
-  JSAMPROW inptr00, inptr01, inptr1, inptr2;
-  JDIMENSION col;
-  /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
-  SHIFT_TEMPS
-  
-
-  // Added for MMX	  
-  register int width = cinfo->image_width;
-  int cols = cinfo->output_width;
-  int cols_asm = (cols >> 3);
-  int diff = cols - (cols_asm<<3);
-  int cols_asm_copy = cols_asm;
-
- ///////////////////////////////////////
-
-  inptr00 = input_buf[0][in_row_group_ctr*2];
-  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
-  inptr1 = input_buf[1][in_row_group_ctr];
-  inptr2 = input_buf[2][in_row_group_ctr];
-  outptr0 = output_buf[0];
-  outptr1 = output_buf[1];
-  /* Loop for each group of output pixels */
-
-	   
-  _asm
-  {
-	  mov esi, inptr00
-
-	  mov eax, inptr01
-	  
-	  mov ebx, inptr2
-
-	  mov ecx, inptr1
-
-	  mov edi, outptr0
-
-	  mov edx, outptr1
-
-do_next16:
-	  
-	  movd mm0, [ebx]			; Cr7 Cr6.....Cr1 Cr0
-
-	  pxor mm6, mm6
-
-	  punpcklbw mm0, mm0		; Cr3 Cr3 Cr2 Cr2 Cr1 Cr1 Cr0 Cr0
-
-	  movq mm7, const128
-
-	  punpcklwd mm0, mm0		; Cr1 Cr1 Cr1 Cr1 Cr0 Cr0 Cr0 Cr0
-
-	  movq mm4, mm0
-
-	  punpcklbw mm0, mm6		; Cr0 Cr0 Cr0 Cr0
-
-	  psubsw mm0, mm7			; Cr0 - 128:Cr0-128:Cr0-128:Cr0 -128
-	  
-	  movd mm1, [ecx]			; Cb7 Cb6...... Cb1 Cb0
-	  	   
-	  psllw mm0, 2				; left shift by 2 bits
-
-	  punpcklbw mm1, mm1		; Cb3 Cb3 Cb2 Cb2 Cb1 Cb1 Cb0 Cb0
-	  
-	  paddsw mm0, const05		; add (one_half/fix(x)) << 2
-
-	  punpcklwd mm1, mm1		; Cb1 Cb1 Cb1 Cb1 Cb0 Cb0 Cb0 Cb0
-
-	  movq mm5, mm1
-
-	  pmulhw mm0, const1		; multiply by (fix(x) >> 1) 
-
-	  punpcklbw mm1, mm6		; Cb0 Cb0 Cb0 Cb0
-
-	  punpckhbw mm4, mm6		; Cr1 Cr1 Cr1 Cr1
-
-	  psubsw mm1, mm7			; Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128
-
-	  punpckhbw mm5, mm6		; Cb1 Cb1 Cb1 Cb1
-
-	  psllw mm1, 2				; left shift by 2 bits
- 
-	  paddsw mm1, const15		; add (one_half/fix(x)) << 2
-
-	  psubsw mm4, mm7			; Cr1 - 128:Cr1-128:Cr1-128:Cr1 -128
-						
-	  psubsw mm5, mm7			; Cb1 - 128:Cb1-128:Cb1-128:Cb1 -128
-
-	  pmulhw mm1, const2		; multiply by (fix(x) >> 1) 
-
-	  psllw mm4, 2				; left shift by 2 bits
-
-	  psllw mm5, 2				; left shift by 2 bits
-
-	  paddsw mm4, const45		; add (one_half/fix(x)) << 2
-
-	  movd mm7, [esi]			;  Y13 Y12 Y9 Y8 Y5 Y4 Y1 Y0
-
-	  pmulhw mm4, const5		; multiply by (fix(x) >> 1) 
-
-	  movq mm6, mm7
-
-	  punpcklbw mm7, mm7		; Y5 Y5 Y4 Y4 Y1 Y1 Y0 Y0
-
-	  paddsw mm5, const55		; add (one_half/fix(x)) << 2
-
-	  paddsw  mm0, mm1			; cred0 cbl0 cgr0 cred0
-
-	  movq mm1, mm7
-
-	  pmulhw mm5, const6		; multiply by (fix(x) >> 1) 
-
-	  movq	mm2, mm0			; cred0 cbl0 cgr0 cred0
-
-	  punpcklwd mm7, mm6		; Y5 Y4 Y1 Y1 Y1 Y0 Y0 Y0
-
-	  pand mm2, davemask		; 0 cbl0 cgr0 0
-
-	  psrlq mm1, 16				; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
-
-	  psrlq	mm2, 16				; 0 0 cbl0 cgr0
-
-	  punpcklbw mm7, empty		; Y1 Y0 Y0 Y0
-
-	  paddsw mm4, mm5			; cbl1 cgr1 cred1 cbl1
-
-	  movq	mm3, mm4			; cbl1 cgr1 cred1 cbl1
-
-	  pand	mm3, davemask		; 0 cgr1 cred1 0
-
-	  paddsw mm7, mm0			; r1 b0 g0 r0
-
-	  psllq	mm3, 16				; cgr1 cred1 0 0
-
-	  movq mm6, mm1				; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
-	
-	  por	mm2, mm3			; cgr1 cred1 cbl0 cgr0
-
-	  punpcklbw mm6, empty		; Y4 Y4 Y1 Y1
-
-	  movd mm3, [eax]			; Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2
-	  
-	  paddsw mm6, mm2			; g4 r4 b1 g1
-
-	  packuswb mm7, mm6			; g4 r4 b1 g1 r1 b0 g0 r0
-
-	  movq mm6, mm3				; Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2
-
-	  punpcklbw mm3, mm3		; Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2
-
-	  movq [edi], mm7			; move to memory g4 r4 b1 g1 r1 b0 g0 r0
-
-	  movq mm5, mm3				; Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2
-
-	  punpcklwd mm3, mm6		; X X X X Y3 Y2 Y2 Y2
-
-	  punpcklbw mm3, empty		; Y3 Y2 Y2 Y2
-
-	  psrlq mm5, 16				; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
-
-	  paddsw mm3, mm0			; r3 b2 g2 r2
-
-	  movq mm6, mm5				; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
-
-	  movq mm0, mm1				; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
-
-	  punpckldq mm6, mm6		; X X X X Y6 Y6 Y3 Y3
-
-	  punpcklbw mm6, empty		; Y6 Y6 Y3 Y3
-
-	  psrlq mm1, 24				; 0 0 0 0 0 Y5 Y5 Y4
-	  
-	  paddsw mm6, mm2			; g6 r6 b3 g3
-
-	  packuswb mm3, mm6			; g6 r6 b3 g3 r3 b2 g2 r2
-
-	  movq mm2, mm5				; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
-
-	  psrlq mm0, 32				; 0 0 0 0 0 0 Y5 Y5
-
-	  movq [edx], mm3			; move to memory g6 r6 b3 g3 r3 b2 g2 r2
-	  
-	  punpcklwd mm1, mm0		; X X X X Y5 Y5 Y5 Y4
-
-	  psrlq mm5, 24				; 0 0 0 0 0 Y7 Y7 Y6 
-
-	  movd mm0, [ebx]			; Cr9 Cr8.....Cr3 Cr2
-
-	  psrlq mm2, 32	   			; 0 0 0 0 0 0 Y7 Y7	 
-	  
-	  psrlq	mm0, 16		
-
-	  punpcklbw mm1, empty		; Y5 Y5 Y5 Y4
-
-	  punpcklwd mm5, mm2		; X X X X Y7 Y7 Y7 Y6
-
-	  paddsw mm1, mm4			; b5 g5 r5 b4
-	 
-	  punpcklbw mm5, empty		; Y7 Y7 Y7 Y6	    
-
-	  pxor mm6, mm6				; clear mm6 registr
-	  
-	  punpcklbw mm0, mm0		; X X X X Cr3 Cr3 Cr2 Cr2
-  
-	  paddsw mm5, mm4			; b7 g7 r7 b6
-	  
-	  punpcklwd mm0, mm0		; Cr3 Cr3 Cr3 Cr3 Cr2 Cr2 Cr2 Cr2
-
-	  movq mm4, mm0
-
-	  movd mm3, [ecx]			; Cb9 Cb8...... Cb3 Cb2
-	  
-	  punpcklbw mm0, mm6		; Cr2 Cr2 Cr2 Cr2
-
-	  psrlq	mm3, 16
-
-	  psubsw mm0, const128		; Cr2 - 128:Cr2-128:Cr2-128:Cr2 -128
-
-	  punpcklbw mm3, mm3		; X X X X Cb3 Cb3 Cb2 Cb2
-
-	  psllw mm0, 2				; left shift by 2 bits
-
-	  paddsw mm0, const05		; add (one_half/fix(x)) << 2
-
-	  punpcklwd mm3, mm3		; Cb3 Cb3 Cb3 Cb3 Cb2 Cb2 Cb2 Cb2
-
-	  movq mm7, mm3
-	  
-	  pmulhw mm0, const1		; multiply by (fix(x) >> 1) 	  	  
-
-	  punpcklbw mm3, mm6		; Cb2 Cb2 Cb2 Cb2
-
-	  psubsw mm3, const128		; Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128
-
-	  punpckhbw mm4, mm6		; Cr3 Cr3 Cr3 Cr3
-	  
-	  psllw mm3, 2				; left shift by 2 bits
-
-	  paddsw mm3, const15		; add (one_half/fix(x)) << 2
-
-	  punpckhbw mm7, mm6		; Cb3 Cb3 Cb3 Cb3
-
-	  pmulhw mm3, const2		; multiply by (fix(x) >> 1) 
-	  
-	  psubsw mm7, const128		; Cb3 - 128:Cb3-128:Cb3-128:Cb3 -128
-
-	  paddsw  mm0, mm3			; cred2 cbl2 cgr2 cred2
-	    
-	  psllw mm7, 2				; left shift by 2 bits
-
-	  psubsw mm4, const128		; Cr3 - 128:Cr3-128:Cr3-128:Cr3 -128
-	  
-	  movd mm3, [esi+4]			;  Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8
-	  
-	  psllw mm4, 2				; left shift by 2 bits
-
-	  paddsw mm7, const55		; add (one_half/fix(x)) << 2
-	  	  
-	  movq mm6, mm3				;  Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8
-
-	  movq	mm2, mm0
-	  	  
-	  pand mm2, davemask
-
-	  punpcklbw mm3, mm3		; Y13 Y13 Y12 Y12 Y9 Y9 Y8 Y8
-
-	  psrlq	mm2, 16
-	    	  
-	  paddsw mm4, const45		; add (one_half/fix(x)) << 2
-
-	  punpcklwd mm3, mm6		; X X X X Y9 Y8 Y8 Y8
-	  
-	  pmulhw mm4, const5		; multiply by (fix(x) >> 1) 
-
-	  pmulhw mm7, const6		; multiply by (fix(x) >> 1) 
-
-	  punpcklbw mm3, empty		; Y9 Y8 Y8 Y8
-	  
-	  paddsw mm4, mm7			; cbl3 cgr3 cred3 cbl3
-
-	  paddsw mm3, mm0			; r9 b8 g8 r8
-
-	  movq	mm7, mm4
-
-	  packuswb mm1, mm3			; r9 b8 g8 r8 b5 g5 r5 b4
-
-	  movd mm3, [eax+4]			; Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10
- 	  
-	  pand	mm7, davemask
-
-	  psrlq mm6, 8				; 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9
-
-	  psllq	mm7, 16
-						   
-	  movq [edi+8], mm1			; move to memory r9 b8 g8 r8 b5 g5 r5 b4
-
-	  por	mm2, mm7
-
-	  movq mm7, mm3				; Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10
-
-	  punpcklbw mm3, mm3		; X X X X Y11 Y11 Y10 Y10
-
-	  pxor mm1, mm1
-
-	  punpcklwd mm3, mm7		; X X X X Y11 Y10 Y10 Y10
-
-	  punpcklbw mm3, mm1		; Y11 Y10 Y10 Y10
-
-	  psrlq mm7, 8				; 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11
-	  
-	  paddsw mm3, mm0			; r11 b10 g10 r10
-
-	  movq mm0, mm7				; 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11
-
-	  packuswb mm5, mm3			; r11 b10 g10 r10 b7 g7 r7 b6
-
-	  punpcklbw mm7, mm7		; X X X X Y14 Y14 Y11 Y11
-
-	  movq [edx+8], mm5			; move to memory r11 b10 g10 r10 b7 g7 r7 b6
-
-	  movq mm3, mm6				; 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9
-
-	  punpcklbw mm6, mm6		; X X X X Y12 Y12 Y9 Y9
-
-	  punpcklbw mm7, mm1		; Y14 Y14 Y11 Y11
-
-	  punpcklbw mm6, mm1		; Y12 Y12 Y9 Y9
-
-	  paddsw mm7, mm2			; g14 r14 b11 g11
-
-	  paddsw mm6, mm2			; g12 r12 b9 g9
-
-	  psrlq mm3, 8				; 0 0 Y21 Y20 Y17 Y16 Y13 Y12
-
-	  movq mm1, mm3				; 0 0 Y21 Y20 Y17 Y16 Y13 Y12
-
-	  punpcklbw mm3, mm3		; X X X X Y13 Y13 Y12 Y12
-
-	  add esi, 8
-
-	  psrlq mm3, 16				; X X X X X X Y13 Y13 modified on 09/24
-
-	  punpcklwd mm1, mm3		; X X X X Y13 Y13 Y13 Y12
-
-	  add eax, 8
-
-	  psrlq mm0, 8				; 0 0 Y23 Y22 Y19 Y18 Y15 Y14	
-
-	  punpcklbw mm1, empty		; Y13 Y13 Y13 Y12
-
-	  movq mm5, mm0				; 0 0 Y23 Y22 Y19 Y18 Y15 Y14	
-
-	  punpcklbw mm0, mm0		; X X X X Y15 Y15 Y14 Y14
-
-	  paddsw mm1, mm4			; b13 g13 r13 b12
-
-	  psrlq mm0, 16				; X X X X X X Y15 Y15
-
-	  add edi, 24
-	  
-	  punpcklwd mm5, mm0		; X X X X Y15 Y15 Y15 Y14
-
-	  packuswb mm6, mm1			; b13 g13 r13 b12 g12 r12 b9 g9
-
-	  add edx, 24
-	  
-	  punpcklbw mm5, empty		; Y15 Y15 Y15 Y14
-
-	  add ebx, 4
-	  	  
-	  paddsw mm5, mm4			; b15 g15 r15 b14
-
-	  movq [edi-8], mm6		; move to memory b13 g13 r13 b12 g12 r12 b9 g9
-
-	  packuswb mm7, mm5			; b15 g15 r15 b14 g14 r14 b11 g11
-
-	  add ecx, 4
-  
-	  movq [edx-8], mm7		; move to memory b15 g15 r15 b14 g14 r14 b11 g11
-
-	  dec cols_asm
-	  
-	  jnz do_next16
-
-	  EMMS
-	  	  
-	  }
-
-	  
-  inptr1 += (cols_asm_copy<<2);
-
-  inptr2 += (cols_asm_copy<<2);
-
-  inptr00 += (cols_asm_copy<<3);
-
-  inptr01 += (cols_asm_copy<<3);
-
-  outptr0 += cols_asm_copy*24;
-
-  outptr1 += cols_asm_copy*24;
-  		  
-  //for (col = cinfo->output_width >> 1; col > 0; col--) {
-      /* Do the chroma part of the calculation */
-    /*cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];*/
-    /* Fetch 4 Y values and emit 4 pixels */
-    /*y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
-  }	  */
-
-
-  for (col = diff >> 1; col > 0; col--) {
-      /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-    /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
-  }	  
-
-					  
-  /* If image width is odd, do the last output column separately */
-  //if (cinfo->output_width & 1) {
-  if (diff & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr00);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    y  = GETJSAMPLE(*inptr01);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-  }    
-}
-#else
-
-
 METHODDEF(void)
 h2v2_merged_upsample (j_decompress_ptr cinfo,
 		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
@@ -942,24 +322,24 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
     cblue = Cbbtab[cb];
     /* Fetch 4 Y values and emit 4 pixels */
     y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
+    outptr0[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr0[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr0[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
+    outptr0 += rgb_pixelsize[cinfo->out_color_space];
     y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
+    outptr0[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr0[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr0[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
+    outptr0 += rgb_pixelsize[cinfo->out_color_space];
     y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
+    outptr1[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr1[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr1[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
+    outptr1 += rgb_pixelsize[cinfo->out_color_space];
     y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
+    outptr1[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr1[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr1[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
+    outptr1 += rgb_pixelsize[cinfo->out_color_space];
   }
   /* If image width is odd, do the last output column separately */
@@ -970,16 +350,15 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
     cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     y  = GETJSAMPLE(*inptr00);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+    outptr0[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr0[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr0[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
     y  = GETJSAMPLE(*inptr01);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+    outptr1[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr1[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr1[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
   }
 }
-#endif
 
 
 /*
@@ -1006,14 +385,20 @@ jinit_merged_upsampler (j_decompress_ptr cinfo)
 
   if (cinfo->max_v_samp_factor == 2) {
     upsample->pub.upsample = merged_2v_upsample;
-    upsample->upmethod = h2v2_merged_upsample;
+    if (jsimd_can_h2v2_merged_upsample())
+      upsample->upmethod = jsimd_h2v2_merged_upsample;
+    else
+      upsample->upmethod = h2v2_merged_upsample;
     /* Allocate a spare row buffer */
     upsample->spare_row = (JSAMPROW)
       (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 		(size_t) (upsample->out_row_width * SIZEOF(JSAMPLE)));
   } else {
     upsample->pub.upsample = merged_1v_upsample;
-    upsample->upmethod = h2v1_merged_upsample;
+    if (jsimd_can_h2v1_merged_upsample())
+      upsample->upmethod = jsimd_h2v1_merged_upsample;
+    else
+      upsample->upmethod = h2v1_merged_upsample;
     /* No spare row needed */
     upsample->spare_row = NULL;
   }
diff --git a/jpeg/jdsample.c b/jpeg/jdsample.c
index 80ffefb2a1cc..1864dd6b8cd5 100644
--- a/jpeg/jdsample.c
+++ b/jpeg/jdsample.c
@@ -2,6 +2,8 @@
  * jdsample.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2010, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -21,6 +23,8 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"
+#include "jpegcomp.h"
 
 
 /* Pointer to routine to upsample a single component */
@@ -418,7 +422,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
   /* jdmainct.c doesn't support context rows when min_DCT_scaled_size = 1,
    * so don't ask for it.
    */
-  do_fancy = cinfo->do_fancy_upsampling && cinfo->min_DCT_scaled_size > 1;
+  do_fancy = cinfo->do_fancy_upsampling && cinfo->_min_DCT_scaled_size > 1;
 
   /* Verify we can handle the sampling factors, select per-component methods,
    * and create storage as needed.
@@ -428,10 +432,10 @@ jinit_upsampler (j_decompress_ptr cinfo)
     /* Compute size of an "input group" after IDCT scaling.  This many samples
      * are to be converted to max_h_samp_factor * max_v_samp_factor pixels.
      */
-    h_in_group = (compptr->h_samp_factor * compptr->DCT_scaled_size) /
-		 cinfo->min_DCT_scaled_size;
-    v_in_group = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-		 cinfo->min_DCT_scaled_size;
+    h_in_group = (compptr->h_samp_factor * compptr->_DCT_scaled_size) /
+		 cinfo->_min_DCT_scaled_size;
+    v_in_group = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+		 cinfo->_min_DCT_scaled_size;
     h_out_group = cinfo->max_h_samp_factor;
     v_out_group = cinfo->max_v_samp_factor;
     upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
@@ -447,18 +451,32 @@ jinit_upsampler (j_decompress_ptr cinfo)
     } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group == v_out_group) {
       /* Special cases for 2h1v upsampling */
-      if (do_fancy && compptr->downsampled_width > 2)
-	upsample->methods[ci] = h2v1_fancy_upsample;
-      else
-	upsample->methods[ci] = h2v1_upsample;
+      if (do_fancy && compptr->downsampled_width > 2) {
+	if (jsimd_can_h2v1_fancy_upsample())
+	  upsample->methods[ci] = jsimd_h2v1_fancy_upsample;
+	else
+	  upsample->methods[ci] = h2v1_fancy_upsample;
+      } else {
+	if (jsimd_can_h2v1_upsample())
+	  upsample->methods[ci] = jsimd_h2v1_upsample;
+	else
+	  upsample->methods[ci] = h2v1_upsample;
+      }
     } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group * 2 == v_out_group) {
       /* Special cases for 2h2v upsampling */
       if (do_fancy && compptr->downsampled_width > 2) {
-	upsample->methods[ci] = h2v2_fancy_upsample;
+	if (jsimd_can_h2v2_fancy_upsample())
+	  upsample->methods[ci] = jsimd_h2v2_fancy_upsample;
+	else
+	  upsample->methods[ci] = h2v2_fancy_upsample;
 	upsample->pub.need_context_rows = TRUE;
-      } else
-	upsample->methods[ci] = h2v2_upsample;
+      } else {
+	if (jsimd_can_h2v2_upsample())
+	  upsample->methods[ci] = jsimd_h2v2_upsample;
+	else
+	  upsample->methods[ci] = h2v2_upsample;
+      }
     } else if ((h_out_group % h_in_group) == 0 &&
 	       (v_out_group % v_in_group) == 0) {
       /* Generic integral-factors upsampling method */
diff --git a/jpeg/jdtrans.c b/jpeg/jdtrans.c
new file mode 100644
index 000000000000..9d9c1b1d5e0d
--- /dev/null
+++ b/jpeg/jdtrans.c
@@ -0,0 +1,147 @@
+/*
+ * jdtrans.c
+ *
+ * Copyright (C) 1995-1997, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains library routines for transcoding decompression,
+ * that is, reading raw DCT coefficient arrays from an input JPEG file.
+ * The routines in jdapimin.c will also be needed by a transcoder.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Forward declarations */
+LOCAL(void) transdecode_master_selection JPP((j_decompress_ptr cinfo));
+
+
+/*
+ * Read the coefficient arrays from a JPEG file.
+ * jpeg_read_header must be completed before calling this.
+ *
+ * The entire image is read into a set of virtual coefficient-block arrays,
+ * one per component.  The return value is a pointer to the array of
+ * virtual-array descriptors.  These can be manipulated directly via the
+ * JPEG memory manager, or handed off to jpeg_write_coefficients().
+ * To release the memory occupied by the virtual arrays, call
+ * jpeg_finish_decompress() when done with the data.
+ *
+ * An alternative usage is to simply obtain access to the coefficient arrays
+ * during a buffered-image-mode decompression operation.  This is allowed
+ * after any jpeg_finish_output() call.  The arrays can be accessed until
+ * jpeg_finish_decompress() is called.  (Note that any call to the library
+ * may reposition the arrays, so don't rely on access_virt_barray() results
+ * to stay valid across library calls.)
+ *
+ * Returns NULL if suspended.  This case need be checked only if
+ * a suspending data source is used.
+ */
+
+GLOBAL(jvirt_barray_ptr *)
+jpeg_read_coefficients (j_decompress_ptr cinfo)
+{
+  int status;			/* result of the latest consume_input call */
+
+  /* On the first call, select and start the transcoding modules */
+  if (cinfo->global_state == DSTATE_READY) {
+    transdecode_master_selection(cinfo);
+    cinfo->global_state = DSTATE_RDCOEFS;
+  }
+
+  /* Pull the entire file into the coefficient buffer */
+  if (cinfo->global_state == DSTATE_RDCOEFS) {
+    do {
+      /* Give the progress monitor, if any, a chance to run */
+      if (cinfo->progress != NULL)
+	(*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+      /* Consume another chunk of input */
+      status = (*cinfo->inputctl->consume_input) (cinfo);
+      if (status == JPEG_SUSPENDED)
+	return NULL;		/* input suspended */
+      /* Count finished rows and scans toward the current pass */
+      if ((status == JPEG_ROW_COMPLETED || status == JPEG_REACHED_SOS) &&
+	  cinfo->progress != NULL) {
+	/* Scan count was underestimated at startup: allow one more scan */
+	if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit)
+	  cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
+      }
+    } while (status != JPEG_REACHED_EOI);
+    /* Leave the state that jpeg_finish_decompress expects */
+    cinfo->global_state = DSTATE_STOPPING;
+  }
+
+  /* Standalone use ends up in DSTATE_STOPPING; a caller fetching the
+   * coefficients during buffered-image decompression is in DSTATE_BUFIMAGE.
+   * Any other state, or non-buffered operation, is improper usage.
+   */
+  if (! cinfo->buffered_image ||
+      (cinfo->global_state != DSTATE_STOPPING &&
+       cinfo->global_state != DSTATE_BUFIMAGE)) {
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+    return NULL;		/* keep compiler happy */
+  }
+  return cinfo->coef->coef_arrays;
+}
+
+
+/*
+ * Master selection of decompression modules for transcoding.
+ * This substitutes for jdmaster.c's initialization of the full decompressor.
+ */
+
+LOCAL(void)
+transdecode_master_selection (j_decompress_ptr cinfo)
+{
+  int est_scans = 1;		/* scan-count guess; stays 1 for a single scan */
+
+  /* Transcoding always runs as a buffered-image operation. */
+  cinfo->buffered_image = TRUE;
+
+  /* Pick the entropy decoder: arithmetic, progressive or plain Huffman. */
+  if (cinfo->arith_code) {
+#ifdef D_ARITH_CODING_SUPPORTED
+    jinit_arith_decoder(cinfo);
+#else
+    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+#endif
+  } else if (cinfo->progressive_mode) {
+#ifdef D_PROGRESSIVE_SUPPORTED
+    jinit_phuff_decoder(cinfo);
+#else
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+  } else {
+    jinit_huff_decoder(cinfo);
+  }
+
+  /* The coefficient controller is always asked for a full-image buffer. */
+  jinit_d_coef_controller(cinfo, TRUE);
+
+  /* Now the memory manager can allocate the virtual arrays. */
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+
+  /* Get the input side ready to consume the first scan. */
+  (*cinfo->inputctl->start_input_pass) (cinfo);
+
+  /* No progress monitor installed: nothing more to set up. */
+  if (cinfo->progress == NULL)
+    return;
+
+  /* Estimate the number of scans so that pass_limit can be set. */
+  if (cinfo->progressive_mode) {
+    /* Arbitrary guess: 2 interleaved DC scans + 3 AC scans per component. */
+    est_scans = 2 + 3 * cinfo->num_components;
+  } else if (cinfo->inputctl->has_multiple_scans) {
+    /* Nonprogressive multiscan file: assume 1 scan per component. */
+    est_scans = cinfo->num_components;
+  }
+
+  cinfo->progress->pass_counter = 0L;
+  cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows * est_scans;
+  cinfo->progress->completed_passes = 0;
+  cinfo->progress->total_passes = 1;
+}
diff --git a/jpeg/jerror.c b/jpeg/jerror.c
index 2a8c8eac33f1..3da7be86a00f 100644
--- a/jpeg/jerror.c
+++ b/jpeg/jerror.c
@@ -18,13 +18,6 @@
  * These routines are used by both the compression and decompression code.
  */
 
-/*
- * This file has been modified for the Mozilla/Netscape environment.
- * Modifications are distributed under the mozilla.org tri-license and are
- * Copyright (C) 1998 Netscape Communications Corporation.  All Rights
- * Reserved. See http://www.mozilla.org/MPL/
- */
-
 /* this is not a core library module, so it doesn't define JPEG_INTERNALS */
 #include "jinclude.h"
 #include "jpeglib.h"
@@ -82,15 +75,7 @@ error_exit (j_common_ptr cinfo)
   /* Let the memory manager delete any temp files before we die */
   jpeg_destroy(cinfo);
 
-/* Mozilla mod: in some Windows environments, the exit() function doesn't
- * even exist, so don't compile a reference to it.  Heaven help you if
- * you fail to provide a replacement error_exit function, because the
- * IJG library will NOT handle control returning from error_exit!
- */
-
-#ifndef XP_WIN
   exit(EXIT_FAILURE);
-#endif
 }
 
 
@@ -116,6 +101,15 @@ output_message (j_common_ptr cinfo)
 
   /* Create the message */
   (*cinfo->err->format_message) (cinfo, buffer);
+
+#ifdef USE_WINDOWS_MESSAGEBOX
+  /* Display it in a message dialog box */
+  MessageBox(GetActiveWindow(), buffer, "JPEG Library Error",
+	     MB_OK | MB_ICONERROR);
+#else
+  /* Send it to stderr, adding a newline */
+  fprintf(stderr, "%s\n", buffer);
+#endif
 }
 
 
diff --git a/jpeg/jerror.h b/jpeg/jerror.h
index fc2fffeac297..88f019e8f945 100644
--- a/jpeg/jerror.h
+++ b/jpeg/jerror.h
@@ -2,6 +2,7 @@
  * jerror.h
  *
  * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -40,11 +41,12 @@ JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */
 
 /* For maintenance convenience, list is alphabetical by message code name */
 JMESSAGE(JERR_ARITH_NOTIMPL,
-	 "Sorry, there are legal restrictions on arithmetic coding")
+	 "Sorry, arithmetic coding is not implemented")
 JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
 JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
 JMESSAGE(JERR_BAD_BUFFER_MODE, "Bogus buffer control mode")
 JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS")
+JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
 JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range")
 JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported")
 JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition")
@@ -93,6 +95,7 @@ JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
 JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
 JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
 JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
+JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
 JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported")
 JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined")
 JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image")
@@ -170,6 +173,7 @@ JMESSAGE(JTRC_UNKNOWN_IDS,
 JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u")
 JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u")
 JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d")
+JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 JMESSAGE(JWRN_BOGUS_PROGRESSION,
 	 "Inconsistent progression sequence for component %d coefficient %d")
 JMESSAGE(JWRN_EXTRANEOUS_DATA,
diff --git a/jpeg/jidctfst.c b/jpeg/jidctfst.c
dissimilarity index 74%
index a94455a0e50c..dba4216fb95e 100644
--- a/jpeg/jidctfst.c
+++ b/jpeg/jidctfst.c
@@ -1,1650 +1,368 @@
-/*
- * jidctfst.c
- *
- * Copyright (C) 1994-1998, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains a fast, not so accurate integer implementation of the
- * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
- * must also perform dequantization of the input coefficients.
- *
- * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
- * on each row (or vice versa, but it's more convenient to emit a row at
- * a time).  Direct algorithms are also available, but they are much more
- * complex and seem not to be any faster when reduced to code.
- *
- * This implementation is based on Arai, Agui, and Nakajima's algorithm for
- * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
- * Japanese, but the algorithm is described in the Pennebaker & Mitchell
- * JPEG textbook (see REFERENCES section in file README).  The following code
- * is based directly on figure 4-8 in P&M.
- * While an 8-point DCT cannot be done in less than 11 multiplies, it is
- * possible to arrange the computation so that many of the multiplies are
- * simple scalings of the final outputs.  These multiplies can then be
- * folded into the multiplications or divisions by the JPEG quantization
- * table entries.  The AA&N method leaves only 5 multiplies and 29 adds
- * to be done in the DCT itself.
- * The primary disadvantage of this method is that with fixed-point math,
- * accuracy is lost due to imprecise representation of the scaled
- * quantization values.  The smaller the quantization table entry, the less
- * precise the scaled value, so this implementation does worse with high-
- * quality-setting files than with low-quality ones.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
-
-
-#ifdef DCT_IFAST_SUPPORTED
-
-
-/*
- * This module is specialized to the case DCTSIZE = 8.
- */
-
-#if DCTSIZE != 8
-  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
-#endif
-
-
-/* Scaling decisions are generally the same as in the LL&M algorithm;
- * see jidctint.c for more details.  However, we choose to descale
- * (right shift) multiplication products as soon as they are formed,
- * rather than carrying additional fractional bits into subsequent additions.
- * This compromises accuracy slightly, but it lets us save a few shifts.
- * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
- * everywhere except in the multiplications proper; this saves a good deal
- * of work on 16-bit-int machines.
- *
- * The dequantized coefficients are not integers because the AA&N scaling
- * factors have been incorporated.  We represent them scaled up by PASS1_BITS,
- * so that the first and second IDCT rounds have the same input scaling.
- * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
- * avoid a descaling shift; this compromises accuracy rather drastically
- * for small quantization table entries, but it saves a lot of shifts.
- * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway,
- * so we use a much larger scaling factor to preserve accuracy.
- *
- * A final compromise is to represent the multiplicative constants to only
- * 8 fractional bits, rather than 13.  This saves some shifting work on some
- * machines, and may also reduce the cost of multiplication (since there
- * are fewer one-bits in the constants).
- */
-
-#if BITS_IN_JSAMPLE == 8
-#define CONST_BITS  8
-#define PASS1_BITS  2
-#else
-#define CONST_BITS  8
-#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
-#endif
-
-/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
- * causing a lot of useless floating-point operations at run time.
- * To get around this we use the following pre-calculated constants.
- * If you change CONST_BITS you may want to add appropriate values.
- * (With a reasonable C compiler, you can just rely on the FIX() macro...)
- */
-
-#if CONST_BITS == 8
-#define FIX_1_082392200  ((INT32)  277)		/* FIX(1.082392200) */
-#define FIX_1_414213562  ((INT32)  362)		/* FIX(1.414213562) */
-#define FIX_1_847759065  ((INT32)  473)		/* FIX(1.847759065) */
-#define FIX_2_613125930  ((INT32)  669)		/* FIX(2.613125930) */
-#else
-#define FIX_1_082392200  FIX(1.082392200)
-#define FIX_1_414213562  FIX(1.414213562)
-#define FIX_1_847759065  FIX(1.847759065)
-#define FIX_2_613125930  FIX(2.613125930)
-#endif
-
-
-/* We can gain a little more speed, with a further compromise in accuracy,
- * by omitting the addition in a descaling shift.  This yields an incorrectly
- * rounded result half the time...
- */
-
-#ifndef USE_ACCURATE_ROUNDING
-#undef DESCALE
-#define DESCALE(x,n)  RIGHT_SHIFT(x, n)
-#endif
-
-
-/* Multiply a DCTELEM variable by an INT32 constant, and immediately
- * descale to yield a DCTELEM result.
- */
-
-#define MULTIPLY(var,const)  ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
-
-
-/* Dequantize a coefficient by multiplying it by the multiplier-table
- * entry; produce a DCTELEM result.  For 8-bit data a 16x16->16
- * multiplication will do.  For 12-bit data, the multiplier table is
- * declared INT32, so a 32-bit multiply will be used.
- */
-
-#if BITS_IN_JSAMPLE == 8
-#define DEQUANTIZE(coef,quantval)  (((IFAST_MULT_TYPE) (coef)) * (quantval))
-#else
-#define DEQUANTIZE(coef,quantval)  \
-	DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
-#endif
-
-
-/* Like DESCALE, but applies to a DCTELEM and produces an int.
- * We assume that int right shift is unsigned if INT32 right shift is.
- */
-
-#ifdef RIGHT_SHIFT_IS_UNSIGNED
-#define ISHIFT_TEMPS	DCTELEM ishift_temp;
-#if BITS_IN_JSAMPLE == 8
-#define DCTELEMBITS  16		/* DCTELEM may be 16 or 32 bits */
-#else
-#define DCTELEMBITS  32		/* DCTELEM must be 32 bits */
-#endif
-#define IRIGHT_SHIFT(x,shft)  \
-    ((ishift_temp = (x)) < 0 ? \
-     (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \
-     (ishift_temp >> (shft)))
-#else
-#define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
-#endif
-
-#ifdef USE_ACCURATE_ROUNDING
-#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
-#else
-#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT(x, n))
-#endif
-
-#ifdef HAVE_MMX_INTEL_MNEMONICS
-__inline GLOBAL(void)
-jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col);
-__inline GLOBAL(void)
-jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col);
-#endif
-
-GLOBAL(void)
-jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col);
-
-
-#ifdef HAVE_MMX_INTEL_MNEMONICS
-GLOBAL(void)
-jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-if (MMXAvailable)
-	jpeg_idct_ifast_mmx(cinfo, compptr, coef_block, output_buf, output_col);
-else
-	jpeg_idct_ifast_orig(cinfo, compptr, coef_block, output_buf, output_col);
-}
-#else
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients.
- */
-
-GLOBAL (void)
-jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  DCTELEM tmp10, tmp11, tmp12, tmp13;
-  DCTELEM z5, z10, z11, z12, z13;
-  JCOEFPTR inptr;
-  IFAST_MULT_TYPE * quantptr;
-  int * wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
-  int ctr;
-  int workspace[DCTSIZE2];	/* buffers data between passes */
-  SHIFT_TEMPS			/* for DESCALE */
-  ISHIFT_TEMPS			/* for IDESCALE */
-
-  /* Pass 1: process columns from input, store into work array. */
-
-  inptr = coef_block;
-  quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
-  wsptr = workspace;
-  for (ctr = DCTSIZE; ctr > 0; ctr--) {
-    /* Due to quantization, we will usually find that many of the input
-     * coefficients are zero, especially the AC terms.  We can exploit this
-     * by short-circuiting the IDCT calculation for any column in which all
-     * the AC terms are zero.  In that case each output is equal to the
-     * DC coefficient (with scale factor as needed).
-     * With typical images and quantization tables, half or more of the
-     * column DCT calculations can be simplified this way.
-     */
-    
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
-	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
-	inptr[DCTSIZE*7] == 0) {
-      /* AC terms all zero */
-      int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
-      wsptr[DCTSIZE*4] = dcval;
-      wsptr[DCTSIZE*5] = dcval;
-      wsptr[DCTSIZE*6] = dcval;
-      wsptr[DCTSIZE*7] = dcval;
-      
-      inptr++;			/* advance pointers to next column */
-      quantptr++;
-      wsptr++;
-      continue;
-    }
-    
-    /* Even part */
-
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-
-    tmp10 = tmp0 + tmp2;	/* phase 3 */
-    tmp11 = tmp0 - tmp2;
-
-    tmp13 = tmp1 + tmp3;	/* phases 5-3 */
-    tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
-
-    tmp0 = tmp10 + tmp13;	/* phase 2 */
-    tmp3 = tmp10 - tmp13;
-    tmp1 = tmp11 + tmp12;
-    tmp2 = tmp11 - tmp12;
-    
-    /* Odd part */
-
-    tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-
-    z13 = tmp6 + tmp5;		/* phase 6 */
-    z10 = tmp6 - tmp5;
-    z11 = tmp4 + tmp7;
-    z12 = tmp4 - tmp7;
-
-    tmp7 = z11 + z13;		/* phase 5 */
-    tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
-
-    z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
-    tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
-    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
-
-    tmp6 = tmp12 - tmp7;	/* phase 2 */
-    tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
-
-    wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
-    wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
-    wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
-    wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
-    wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
-    wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
-    wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
-    wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
-
-    inptr++;			/* advance pointers to next column */
-    quantptr++;
-    wsptr++;
-  }
-  
-  /* Pass 2: process rows from work array, store into output array. */
-  /* Note that we must descale the results by a factor of 8 == 2**3, */
-  /* and also undo the PASS1_BITS scaling. */
-
-  wsptr = workspace;
-  for (ctr = 0; ctr < DCTSIZE; ctr++) {
-    outptr = output_buf[ctr] + output_col;
-    /* Rows of zeroes can be exploited in the same way as we did with columns.
-     * However, the column calculation has created many nonzero AC terms, so
-     * the simplification applies less often (typically 5% to 10% of the time).
-     * On machines with very fast multiplication, it's possible that the
-     * test takes more time than it's worth.  In that case this section
-     * may be commented out.
-     */
-    
-#ifndef NO_ZERO_ROW_TEST
-    if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
-	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
-      /* AC terms all zero */
-      JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
-				  & RANGE_MASK];
-      
-      outptr[0] = dcval;
-      outptr[1] = dcval;
-      outptr[2] = dcval;
-      outptr[3] = dcval;
-      outptr[4] = dcval;
-      outptr[5] = dcval;
-      outptr[6] = dcval;
-      outptr[7] = dcval;
-
-      wsptr += DCTSIZE;		/* advance pointer to next row */
-      continue;
-    }
-#endif
-    
-    /* Even part */
-
-    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
-    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
-
-    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
-    tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
-	    - tmp13;
-
-    tmp0 = tmp10 + tmp13;
-    tmp3 = tmp10 - tmp13;
-    tmp1 = tmp11 + tmp12;
-    tmp2 = tmp11 - tmp12;
-
-    /* Odd part */
-
-    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
-    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
-    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
-    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
-
-    tmp7 = z11 + z13;		/* phase 5 */
-    tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
-
-    z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
-    tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
-    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
-
-    tmp6 = tmp12 - tmp7;	/* phase 2 */
-    tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
-
-    /* Final output stage: scale down by a factor of 8 and range-limit */
-
-    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
-			    & RANGE_MASK];
-
-    wsptr += DCTSIZE;		/* advance pointer to next row */
-  }
-}
-
-#endif
-
-#ifdef HAVE_MMX_INTEL_MNEMONICS
-
-
-_inline GLOBAL(void)
-jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  DCTELEM tmp10, tmp11, tmp12, tmp13;
-  DCTELEM z5, z10, z11, z12, z13;
-  JCOEFPTR inptr;
-  IFAST_MULT_TYPE * quantptr;
-  int * wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
-  int ctr;
-  int workspace[DCTSIZE2];	/* buffers data between passes */
-  SHIFT_TEMPS			/* for DESCALE */
-  ISHIFT_TEMPS			/* for IDESCALE */
-
-  /* Pass 1: process columns from input, store into work array. */
-
-  inptr = coef_block;
-  quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
-  wsptr = workspace;
-  for (ctr = DCTSIZE; ctr > 0; ctr--) {
-    /* Due to quantization, we will usually find that many of the input
-     * coefficients are zero, especially the AC terms.  We can exploit this
-     * by short-circuiting the IDCT calculation for any column in which all
-     * the AC terms are zero.  In that case each output is equal to the
-     * DC coefficient (with scale factor as needed).
-     * With typical images and quantization tables, half or more of the
-     * column DCT calculations can be simplified this way.
-     */
-    
-    if ((inptr[DCTSIZE*1] | inptr[DCTSIZE*2] | inptr[DCTSIZE*3] |
-	 inptr[DCTSIZE*4] | inptr[DCTSIZE*5] | inptr[DCTSIZE*6] |
-	 inptr[DCTSIZE*7]) == 0) {
-      /* AC terms all zero */
-      int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
-      wsptr[DCTSIZE*4] = dcval;
-      wsptr[DCTSIZE*5] = dcval;
-      wsptr[DCTSIZE*6] = dcval;
-      wsptr[DCTSIZE*7] = dcval;
-      
-      inptr++;			/* advance pointers to next column */
-      quantptr++;
-      wsptr++;
-      continue;
-    }
-    
-    /* Even part */
-
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-
-    tmp10 = tmp0 + tmp2;	/* phase 3 */
-    tmp11 = tmp0 - tmp2;
-
-    tmp13 = tmp1 + tmp3;	/* phases 5-3 */
-    tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
-
-    tmp0 = tmp10 + tmp13;	/* phase 2 */
-    tmp3 = tmp10 - tmp13;
-    tmp1 = tmp11 + tmp12;
-    tmp2 = tmp11 - tmp12;
-    
-    /* Odd part */
-
-    tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-
-    z13 = tmp6 + tmp5;		/* phase 6 */
-    z10 = tmp6 - tmp5;
-    z11 = tmp4 + tmp7;
-    z12 = tmp4 - tmp7;
-
-    tmp7 = z11 + z13;		/* phase 5 */
-    tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
-
-    z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
-    tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
-    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
-
-    tmp6 = tmp12 - tmp7;	/* phase 2 */
-    tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
-
-    wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
-    wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
-    wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
-    wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
-    wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
-    wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
-    wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
-    wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
-
-    inptr++;			/* advance pointers to next column */
-    quantptr++;
-    wsptr++;
-  }
-  
-  /* Pass 2: process rows from work array, store into output array. */
-  /* Note that we must descale the results by a factor of 8 == 2**3, */
-  /* and also undo the PASS1_BITS scaling. */
-
-  wsptr = workspace;
-  for (ctr = 0; ctr < DCTSIZE; ctr++) {
-    outptr = output_buf[ctr] + output_col;
-    /* Rows of zeroes can be exploited in the same way as we did with columns.
-     * However, the column calculation has created many nonzero AC terms, so
-     * the simplification applies less often (typically 5% to 10% of the time).
-     * On machines with very fast multiplication, it's possible that the
-     * test takes more time than it's worth.  In that case this section
-     * may be commented out.
-     */
-    
-#ifndef NO_ZERO_ROW_TEST
-    if ((wsptr[1] | wsptr[2] | wsptr[3] | wsptr[4] | wsptr[5] | wsptr[6] |
-	 wsptr[7]) == 0) {
-      /* AC terms all zero */
-      JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
-				  & RANGE_MASK];
-      
-      outptr[0] = dcval;
-      outptr[1] = dcval;
-      outptr[2] = dcval;
-      outptr[3] = dcval;
-      outptr[4] = dcval;
-      outptr[5] = dcval;
-      outptr[6] = dcval;
-      outptr[7] = dcval;
-
-      wsptr += DCTSIZE;		/* advance pointer to next row */
-      continue;
-    }
-#endif
-    
-    /* Even part */
-
-    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
-    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
-
-    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
-    tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
-	    - tmp13;
-
-    tmp0 = tmp10 + tmp13;
-    tmp3 = tmp10 - tmp13;
-    tmp1 = tmp11 + tmp12;
-    tmp2 = tmp11 - tmp12;
-
-    /* Odd part */
-
-    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
-    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
-    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
-    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
-
-    tmp7 = z11 + z13;		/* phase 5 */
-    tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
-
-    z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
-    tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
-    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
-
-    tmp6 = tmp12 - tmp7;	/* phase 2 */
-    tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
-
-    /* Final output stage: scale down by a factor of 8 and range-limit */
-
-    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
-			    & RANGE_MASK];
-    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
-			    & RANGE_MASK];
-
-    wsptr += DCTSIZE;		/* advance pointer to next row */
-  }
-}
-
-
-	static	  __int64 fix_141		= 0x5a825a825a825a82;
-	static	  __int64 fix_184n261	= 0xcf04cf04cf04cf04;
-	static	  __int64 fix_184		= 0x7641764176417641;
-	static	  __int64 fix_n184		= 0x896f896f896f896f;
-	static	  __int64 fix_108n184	= 0xcf04cf04cf04cf04;
-	static	  __int64 const_0x0080	= 0x0080008000800080;
-
-
-__inline GLOBAL(void)
-jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR inptr,
-		 JSAMPARRAY outptr, JDIMENSION output_col)
-{
-
-  int16 workspace[DCTSIZE2 + 4];	/* buffers data between passes */
-  int16 *wsptr=workspace;
-  int16 *quantptr=compptr->dct_table;
-
-  __asm{ 
-    
-	mov		edi, quantptr
-	mov		ebx, inptr
-	mov		esi, wsptr
-	add		esi, 0x07		;align wsptr to qword
-	and		esi, 0xfffffff8	;align wsptr to qword
-
-	mov		eax, esi
-
-    /* Odd part */
-
-
-	movq		mm1, [ebx + 8*10]		;load inptr[DCTSIZE*5]
-
-	pmullw		mm1, [edi + 8*10]		;tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-
-	movq		mm0, [ebx + 8*6]		;load inptr[DCTSIZE*3]
-
-	pmullw		mm0, [edi + 8*6]		;tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-
-	movq		mm3, [ebx + 8*2]		;load inptr[DCTSIZE*1]
-	movq	mm2, mm1					;copy tmp6	/* phase 6 */
-
-	pmullw		mm3, [edi + 8*2]		;tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-
-	movq		mm4, [ebx + 8*14]		;load inptr[DCTSIZE*1]
-	paddw	mm1, mm0					;z13 = tmp6 + tmp5;
-
-	pmullw		mm4, [edi + 8*14]	    ;tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-	psubw	mm2, mm0					;z10 = tmp6 - tmp5   
-
-	psllw		mm2, 2				;shift z10
-	movq		mm0, mm2			;copy z10
-
-	pmulhw		mm2, fix_184n261	;MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
-	movq		mm5, mm3				;copy tmp4
-
-	pmulhw		mm0, fix_n184		;MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
-	paddw		mm3, mm4				;z11 = tmp4 + tmp7;
-
-	movq		mm6, mm3				;copy z11			/* phase 5 */
-	psubw		mm5, mm4				;z12 = tmp4 - tmp7;
-
-	psubw		mm6, mm1				;z11-z13
-	psllw		mm5, 2				;shift z12
-
-	movq		mm4, [ebx + 8*12]		;load inptr[DCTSIZE*6], even part
- 	movq		mm7, mm5			;copy z12
-
-	pmulhw		mm5, fix_108n184	;MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
-	paddw		mm3, mm1				;tmp7 = z11 + z13;	
-
-
-    /* Even part */
-	pmulhw		mm7, fix_184		;MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
-	psllw		mm6, 2
-
-	movq		mm1, [ebx + 8*4]		;load inptr[DCTSIZE*2]
-
-	pmullw		mm1, [edi + 8*4]		;tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-	paddw		mm0, mm5			;tmp10
-
-	pmullw		mm4, [edi + 8*12]		;tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-	paddw		mm2, mm7			;tmp12
-
-	pmulhw		mm6, fix_141			;tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
-	psubw		mm2, mm3		;tmp6 = tmp12 - tmp7
-
-	movq		mm5, mm1				;copy tmp1
-	paddw		mm1, mm4				;tmp13= tmp1 + tmp3;	/* phases 5-3 */
-
-	psubw		mm5, mm4				;tmp1-tmp3
-	psubw		mm6, mm2		;tmp5 = tmp11 - tmp6;
-
-	movq		[esi+8*0], mm1			;save tmp13 in workspace
-	psllw		mm5, 2					;shift tmp1-tmp3
-    
-	movq		mm7, [ebx + 8*0]		;load inptr[DCTSIZE*0]
-
-	pmulhw		mm5, fix_141			;MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
-	paddw		mm0, mm6		;tmp4 = tmp10 + tmp5;
-
-	pmullw		mm7, [edi + 8*0]		;tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-
-	movq		mm4, [ebx + 8*8]		;load inptr[DCTSIZE*4]
-	
-	pmullw		mm4, [edi + 8*8]		;tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-	psubw		mm5, mm1				;tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
-
-	movq		[esi+8*4], mm0		;save tmp4 in workspace
-	movq		mm1, mm7			;copy tmp0	/* phase 3 */
-
-	movq		[esi+8*2], mm5		;save tmp12 in workspace
-	psubw		mm1, mm4			;tmp11 = tmp0 - tmp2; 
-
-	paddw		mm7, mm4			;tmp10 = tmp0 + tmp2;
-    movq		mm5, mm1		;copy tmp11
-	
-	paddw		mm1, [esi+8*2]	;tmp1 = tmp11 + tmp12;
-	movq		mm4, mm7		;copy tmp10		/* phase 2 */
-
-	paddw		mm7, [esi+8*0]	;tmp0 = tmp10 + tmp13;	
-
-	psubw		mm4, [esi+8*0]	;tmp3 = tmp10 - tmp13;
-	movq		mm0, mm7		;copy tmp0
-
-	psubw		mm5, [esi+8*2]	;tmp2 = tmp11 - tmp12;
-	paddw		mm7, mm3		;wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
-	
-	psubw		mm0, mm3			;wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
-
-	movq		[esi + 8*0], mm7	;wsptr[DCTSIZE*0]
-	movq		mm3, mm1			;copy tmp1
-
-	movq		[esi + 8*14], mm0	;wsptr[DCTSIZE*7]
-	paddw		mm1, mm2			;wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
-
-	psubw		mm3, mm2			;wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
-
-	movq		[esi + 8*2], mm1	;wsptr[DCTSIZE*1]
-	movq		mm1, mm4			;copy tmp3
-
-	movq		[esi + 8*12], mm3	;wsptr[DCTSIZE*6]
-
-	paddw		mm4, [esi+8*4]		;wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
-
-	psubw		mm1, [esi+8*4]		;wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
-
-	movq		[esi + 8*8], mm4
-	movq		mm7, mm5			;copy tmp2
-
-	paddw		mm5, mm6			;wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
-
-	movq		[esi+8*6], mm1		;
-	psubw		mm7, mm6			;wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
-
-	movq		[esi + 8*4], mm5
-
-	movq		[esi + 8*10], mm7
-
-
-
-/*****************************************************************/
-	add		edi, 8
-	add		ebx, 8
-	add		esi, 8
-
-/*****************************************************************/
-
-
-
-
-	movq		mm1, [ebx + 8*10]		;load inptr[DCTSIZE*5]
-
-	pmullw		mm1, [edi + 8*10]		;tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-
-	movq		mm0, [ebx + 8*6]		;load inptr[DCTSIZE*3]
-
-	pmullw		mm0, [edi + 8*6]		;tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-
-	movq		mm3, [ebx + 8*2]		;load inptr[DCTSIZE*1]
-	movq	mm2, mm1					;copy tmp6	/* phase 6 */
-
-	pmullw		mm3, [edi + 8*2]		;tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-
-	movq		mm4, [ebx + 8*14]		;load inptr[DCTSIZE*1]
-	paddw	mm1, mm0					;z13 = tmp6 + tmp5;
-
-	pmullw		mm4, [edi + 8*14]	    ;tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-	psubw	mm2, mm0					;z10 = tmp6 - tmp5   
-
-	psllw		mm2, 2				;shift z10
-	movq		mm0, mm2			;copy z10
-
-	pmulhw		mm2, fix_184n261	;MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
-	movq		mm5, mm3				;copy tmp4
-
-	pmulhw		mm0, fix_n184		;MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
-	paddw		mm3, mm4				;z11 = tmp4 + tmp7;
-
-	movq		mm6, mm3				;copy z11			/* phase 5 */
-	psubw		mm5, mm4				;z12 = tmp4 - tmp7;
-
-	psubw		mm6, mm1				;z11-z13
-	psllw		mm5, 2				;shift z12
-
-	movq		mm4, [ebx + 8*12]		;load inptr[DCTSIZE*6], even part
- 	movq		mm7, mm5			;copy z12
-
-	pmulhw		mm5, fix_108n184	;MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
-	paddw		mm3, mm1				;tmp7 = z11 + z13;	
-
-
-    /* Even part */
-	pmulhw		mm7, fix_184		;MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
-	psllw		mm6, 2
-
-	movq		mm1, [ebx + 8*4]		;load inptr[DCTSIZE*2]
-
-	pmullw		mm1, [edi + 8*4]		;tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-	paddw		mm0, mm5			;tmp10
-
-	pmullw		mm4, [edi + 8*12]		;tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-	paddw		mm2, mm7			;tmp12
-
-	pmulhw		mm6, fix_141			;tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
-	psubw		mm2, mm3		;tmp6 = tmp12 - tmp7
-
-	movq		mm5, mm1				;copy tmp1
-	paddw		mm1, mm4				;tmp13= tmp1 + tmp3;	/* phases 5-3 */
-
-	psubw		mm5, mm4				;tmp1-tmp3
-	psubw		mm6, mm2		;tmp5 = tmp11 - tmp6;
-
-	movq		[esi+8*0], mm1			;save tmp13 in workspace
-	psllw		mm5, 2					;shift tmp1-tmp3
-    
-	movq		mm7, [ebx + 8*0]		;load inptr[DCTSIZE*0]
-	paddw		mm0, mm6		;tmp4 = tmp10 + tmp5;
-
-	pmulhw		mm5, fix_141			;MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
-
-	pmullw		mm7, [edi + 8*0]		;tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-
-	movq		mm4, [ebx + 8*8]		;load inptr[DCTSIZE*4]
-	
-	pmullw		mm4, [edi + 8*8]		;tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-	psubw		mm5, mm1				;tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
-
-	movq		[esi+8*4], mm0		;save tmp4 in workspace
-	movq		mm1, mm7			;copy tmp0	/* phase 3 */
-
-	movq		[esi+8*2], mm5		;save tmp12 in workspace
-	psubw		mm1, mm4			;tmp11 = tmp0 - tmp2; 
-
-	paddw		mm7, mm4			;tmp10 = tmp0 + tmp2;
-    movq		mm5, mm1		;copy tmp11
-	
-	paddw		mm1, [esi+8*2]	;tmp1 = tmp11 + tmp12;
-	movq		mm4, mm7		;copy tmp10		/* phase 2 */
-
-	paddw		mm7, [esi+8*0]	;tmp0 = tmp10 + tmp13;	
-
-	psubw		mm4, [esi+8*0]	;tmp3 = tmp10 - tmp13;
-	movq		mm0, mm7		;copy tmp0
-
-	psubw		mm5, [esi+8*2]	;tmp2 = tmp11 - tmp12;
-	paddw		mm7, mm3		;wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
-	
-	psubw		mm0, mm3			;wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
-
-	movq		[esi + 8*0], mm7	;wsptr[DCTSIZE*0]
-	movq		mm3, mm1			;copy tmp1
-
-	movq		[esi + 8*14], mm0	;wsptr[DCTSIZE*7]
-	paddw		mm1, mm2			;wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
-
-	psubw		mm3, mm2			;wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
-
-	movq		[esi + 8*2], mm1	;wsptr[DCTSIZE*1]
-	movq		mm1, mm4			;copy tmp3
-
-	movq		[esi + 8*12], mm3	;wsptr[DCTSIZE*6]
-
-	paddw		mm4, [esi+8*4]		;wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
-
-	psubw		mm1, [esi+8*4]		;wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
-
-	movq		[esi + 8*8], mm4
-	movq		mm7, mm5			;copy tmp2
-
-	paddw		mm5, mm6			;wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
-
-	movq		[esi+8*6], mm1		;
-	psubw		mm7, mm6			;wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
-
-	movq		[esi + 8*4], mm5
-
-	movq		[esi + 8*10], mm7
-
-
-
-
-/*****************************************************************/
-
-  /* Pass 2: process rows from work array, store into output array. */
-  /* Note that we must descale the results by a factor of 8 == 2**3, */
-  /* and also undo the PASS1_BITS scaling. */
-
-/*****************************************************************/
-    /* Even part */
-
-	mov			esi, eax
-	mov			eax, outptr
-
-//    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
-//    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
-//    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
-//    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
-	movq		mm0, [esi+8*0]		;wsptr[0,0],[0,1],[0,2],[0,3]
-
-	movq		mm1, [esi+8*1]		;wsptr[0,4],[0,5],[0,6],[0,7]
-	movq		mm2, mm0
-	
-	movq		mm3, [esi+8*2]		;wsptr[1,0],[1,1],[1,2],[1,3]
-	paddw		mm0, mm1			;wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
-
-	movq		mm4, [esi+8*3]		;wsptr[1,4],[1,5],[1,6],[1,7]
-	psubw		mm2, mm1			;wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
-
-	movq		mm6, mm0
-	movq		mm5, mm3
-	
-	paddw		mm3, mm4			;wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
-	movq		mm1, mm2
-
-	psubw		mm5, mm4			;wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
-	punpcklwd	mm0, mm3			;wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
-
-	movq		mm7, [esi+8*7]		;wsptr[3,4],[3,5],[3,6],[3,7]
-	punpckhwd	mm6, mm3			;wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
-
-	movq		mm3, [esi+8*4]		;wsptr[2,0],[2,1],[2,2],[2,3]
-	punpckldq	mm0, mm6	;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
-
-	punpcklwd	mm1, mm5			;wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
-	movq		mm4, mm3
-
-	movq		mm6, [esi+8*6]		;wsptr[3,0],[3,1],[3,2],[3,3]
-	punpckhwd	mm2, mm5			;wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
-
-	movq		mm5, [esi+8*5]		;wsptr[2,4],[2,5],[2,6],[2,7]
-	punpckldq	mm1, mm2	;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
-
-	
-	paddw		mm3, mm5			;wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
-	movq		mm2, mm6
-
-	psubw		mm4, mm5			;wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
-	paddw		mm6, mm7			;wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
-
-	movq		mm5, mm3
-	punpcklwd	mm3, mm6			;wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
-	
-	psubw		mm2, mm7			;wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
-	punpckhwd	mm5, mm6			;wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
-
-	movq		mm7, mm4
-	punpckldq	mm3, mm5	;wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
-
-	punpcklwd	mm4, mm2			;wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
-
-	punpckhwd	mm7, mm2			;wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
-
-	punpckldq	mm4, mm7	;wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
-	movq		mm6, mm1
-
-//	mm0 = 	;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
-//	mm1 =	;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
-
-
-	movq		mm2, mm0
-	punpckhdq	mm6, mm4	;wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
-
-	punpckldq	mm1, mm4	;wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
-	psllw		mm6, 2
-
-	pmulhw		mm6, fix_141
-	punpckldq	mm0, mm3	;wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
-
-	punpckhdq	mm2, mm3	;wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
-	movq		mm7, mm0
-
-//    tmp0 = tmp10 + tmp13;
-//    tmp3 = tmp10 - tmp13;
-	paddw		mm0, mm2	;[0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
-	psubw		mm7, mm2	;[0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
-
-//    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
-	psubw		mm6, mm2	;wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
-//    tmp1 = tmp11 + tmp12;
-//    tmp2 = tmp11 - tmp12;
-	movq		mm5, mm1
-
-
-
-    /* Odd part */
-
-//    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
-//    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
-//    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
-//    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
-	movq		mm3, [esi+8*0]		;wsptr[0,0],[0,1],[0,2],[0,3]
-	paddw		mm1, mm6	;[0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
-
-	movq		mm4, [esi+8*1]		;wsptr[0,4],[0,5],[0,6],[0,7]
-	psubw		mm5, mm6	;[0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
-
-	movq		mm6, mm3
-	punpckldq	mm3, mm4			;wsptr[0,0],[0,1],[0,4],[0,5]
-
-	punpckhdq	mm4, mm6			;wsptr[0,6],[0,7],[0,2],[0,3]
-	movq		mm2, mm3
-
-//Save tmp0 and tmp1 in wsptr
-	movq		[esi+8*0], mm0		;save tmp0
-	paddw		mm2, mm4			;wsptr[xxx],[0,z11],[xxx],[0,z13]
-
-	
-//Continue with z10 --- z13
-	movq		mm6, [esi+8*2]		;wsptr[1,0],[1,1],[1,2],[1,3]
-	psubw		mm3, mm4			;wsptr[xxx],[0,z12],[xxx],[0,z10]
-
-	movq		mm0, [esi+8*3]		;wsptr[1,4],[1,5],[1,6],[1,7]
-	movq		mm4, mm6
-
-	movq		[esi+8*1], mm1		;save tmp1
-	punpckldq	mm6, mm0			;wsptr[1,0],[1,1],[1,4],[1,5]
-
-	punpckhdq	mm0, mm4			;wsptr[1,6],[1,7],[1,2],[1,3]
-	movq		mm1, mm6
-	
-//Save tmp2 and tmp3 in wsptr
-	paddw		mm6, mm0		;wsptr[xxx],[1,z11],[xxx],[1,z13]
-	movq		mm4, mm2
-	
-//Continue with z10 --- z13
-	movq		[esi+8*2], mm5		;save tmp2
-	punpcklwd	mm2, mm6		;wsptr[xxx],[xxx],[0,z11],[1,z11]
-
-	psubw		mm1, mm0		;wsptr[xxx],[1,z12],[xxx],[1,z10]
-	punpckhwd	mm4, mm6		;wsptr[xxx],[xxx],[0,z13],[1,z13]
-
-	movq		mm0, mm3
-	punpcklwd	mm3, mm1		;wsptr[xxx],[xxx],[0,z12],[1,z12]
-
-	movq		[esi+8*3], mm7		;save tmp3
-	punpckhwd	mm0, mm1		;wsptr[xxx],[xxx],[0,z10],[1,z10]
-
-	movq		mm6, [esi+8*4]		;wsptr[2,0],[2,1],[2,2],[2,3]
-	punpckhdq	mm0, mm2		;wsptr[0,z10],[1,z10],[0,z11],[1,z11]
-
-	movq		mm7, [esi+8*5]		;wsptr[2,4],[2,5],[2,6],[2,7]
-	punpckhdq	mm3, mm4		;wsptr[0,z12],[1,z12],[0,z13],[1,z13]
-
-	movq		mm1, [esi+8*6]		;wsptr[3,0],[3,1],[3,2],[3,3]
-	movq		mm4, mm6
-
-	punpckldq	mm6, mm7			;wsptr[2,0],[2,1],[2,4],[2,5]
-	movq		mm5, mm1
-
-	punpckhdq	mm7, mm4			;wsptr[2,6],[2,7],[2,2],[2,3]
-	movq		mm2, mm6
-	
-	movq		mm4, [esi+8*7]		;wsptr[3,4],[3,5],[3,6],[3,7]
-	paddw		mm6, mm7		;wsptr[xxx],[2,z11],[xxx],[2,z13]
-
-	psubw		mm2, mm7		;wsptr[xxx],[2,z12],[xxx],[2,z10]
-	punpckldq	mm1, mm4			;wsptr[3,0],[3,1],[3,4],[3,5]
-
-	punpckhdq	mm4, mm5			;wsptr[3,6],[3,7],[3,2],[3,3]
-	movq		mm7, mm1
-
-	paddw		mm1, mm4		;wsptr[xxx],[3,z11],[xxx],[3,z13]
-	psubw		mm7, mm4		;wsptr[xxx],[3,z12],[xxx],[3,z10]
-
-	movq		mm5, mm6
-	punpcklwd	mm6, mm1		;wsptr[xxx],[xxx],[2,z11],[3,z11]
-
-	punpckhwd	mm5, mm1		;wsptr[xxx],[xxx],[2,z13],[3,z13]
-	movq		mm4, mm2
-
-	punpcklwd	mm2, mm7		;wsptr[xxx],[xxx],[2,z12],[3,z12]
-
-	punpckhwd	mm4, mm7		;wsptr[xxx],[xxx],[2,z10],[3,z10]
-
-	punpckhdq	mm4, mm6		;wsptr[2,z10],[3,z10],[2,z11],[3,z11]
-
-	punpckhdq	mm2, mm5		;wsptr[2,z12],[3,z12],[2,z13],[3,z13]
-	movq		mm5, mm0
-
-	punpckldq	mm0, mm4		;wsptr[0,z10],[1,z10],[2,z10],[3,z10]
-
-	punpckhdq	mm5, mm4		;wsptr[0,z11],[1,z11],[2,z11],[3,z11]
-	movq		mm4, mm3
-
-	punpckhdq	mm4, mm2		;wsptr[0,z13],[1,z13],[2,z13],[3,z13]
-	movq		mm1, mm5
-
-	punpckldq	mm3, mm2		;wsptr[0,z12],[1,z12],[2,z12],[3,z12]
-//    tmp7 = z11 + z13;		/* phase 5 */
-//    tmp8 = z11 - z13;		/* phase 5 */
-	psubw		mm1, mm4		;tmp8
-
-	paddw		mm5, mm4		;tmp7
-//    tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
-	psllw		mm1, 2
-
-	psllw		mm0, 2
-
-	pmulhw		mm1, fix_141	;tmp21
-//    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065))  /* 2*(c2-c6) */
-//			+ MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
-	psllw		mm3, 2
-	movq		mm7, mm0
-
-	pmulhw		mm7, fix_n184
-	movq		mm6, mm3
-
-	movq		mm2, [esi+8*0]	;tmp0,final1
-
-	pmulhw		mm6, fix_108n184
-//	 tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
-//			+ MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
-	movq		mm4, mm2		;final1
-  
-	pmulhw		mm0, fix_184n261
-	paddw		mm2, mm5		;tmp0+tmp7,final1
-
-	pmulhw		mm3, fix_184
-	psubw		mm4, mm5		;tmp0-tmp7,final1
-
-//    tmp6 = tmp22 - tmp7;	/* phase 2 */
-	psraw		mm2, 5			;outptr[0,0],[1,0],[2,0],[3,0],final1
-
-	paddsw		mm2, const_0x0080	;final1
-	paddw		mm7, mm6			;tmp20
-	psraw		mm4, 5			;outptr[0,7],[1,7],[2,7],[3,7],final1
-
-	paddsw		mm4, const_0x0080	;final1
-	paddw		mm3, mm0			;tmp22
-
-//    tmp5 = tmp21 - tmp6;
-	psubw		mm3, mm5		;tmp6
-
-//    tmp4 = tmp20 + tmp5;
-	movq		mm0, [esi+8*1]		;tmp1,final2
-	psubw		mm1, mm3		;tmp5
-
-	movq		mm6, mm0			;final2
-	paddw		mm0, mm3		;tmp1+tmp6,final2
-
-    /* Final output stage: scale down by a factor of 8 and range-limit */
-
-
-//    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
-//			    & RANGE_MASK];	final1
-
-
-//    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
-//			    & RANGE_MASK];	final2
-	psubw		mm6, mm3		;tmp1-tmp6,final2
-	psraw		mm0, 5			;outptr[0,1],[1,1],[2,1],[3,1]
-
-	paddsw		mm0, const_0x0080
-	psraw		mm6, 5			;outptr[0,6],[1,6],[2,6],[3,6]
-	
-	paddsw		mm6, const_0x0080		;need to check this value
-	packuswb	mm0, mm4	;out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
-	
-	movq		mm5, [esi+8*2]		;tmp2,final3
-	packuswb	mm2, mm6	;out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
-
-//    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
-//			    & RANGE_MASK];	final3
-	paddw		mm7, mm1		;tmp4
-	movq		mm3, mm5
-
-	paddw		mm5, mm1		;tmp2+tmp5
-	psubw		mm3, mm1		;tmp2-tmp5
-
-	psraw		mm5, 5			;outptr[0,2],[1,2],[2,2],[3,2]
-
-	paddsw		mm5, const_0x0080
-	movq		mm4, [esi+8*3]		;tmp3,final4
-	psraw		mm3, 5			;outptr[0,5],[1,5],[2,5],[3,5]
-
-	paddsw		mm3, const_0x0080
-
-
-//    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
-//			    & RANGE_MASK];	final4
-	movq		mm6, mm4
-	paddw		mm4, mm7		;tmp3+tmp4
-
-	psubw		mm6, mm7		;tmp3-tmp4
-	psraw		mm4, 5			;outptr[0,4],[1,4],[2,4],[3,4]
-	mov			ecx, [eax]
-
-	paddsw		mm4, const_0x0080
-	psraw		mm6, 5			;outptr[0,3],[1,3],[2,3],[3,3]
-
-	paddsw		mm6, const_0x0080
-	packuswb	mm5, mm4	;out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
-
-	packuswb	mm6, mm3	;out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
-	movq		mm4, mm2
-
-	movq		mm7, mm5
-	punpcklbw	mm2, mm0	;out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
-
-	punpckhbw	mm4, mm0	;out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
-	movq		mm1, mm2
-
-	punpcklbw	mm5, mm6	;out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
-	add		 	eax, 4
-
-	punpckhbw	mm7, mm6	;out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
-
-	punpcklwd	mm2, mm5	;out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
-	add			ecx, output_col
-
-	movq		mm6, mm7
-	punpckhwd	mm1, mm5	;out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
-
-	movq		mm0, mm2
-	punpcklwd	mm6, mm4	;out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
-
-	mov			ebx, [eax]
-	punpckldq	mm2, mm6	;out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
-
-	add		 	eax, 4
-	movq		mm3, mm1
-
-	add			ebx, output_col 
-	punpckhwd	mm7, mm4	;out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
-	
-	movq		[ecx], mm2
-	punpckhdq	mm0, mm6	;out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
-
-	mov			ecx, [eax]
-	add		 	eax, 4
-	add			ecx, output_col
-
-	movq		[ebx], mm0
-	punpckldq	mm1, mm7	;out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
-
-	mov			ebx, [eax]
-
-	add			ebx, output_col
-	punpckhdq	mm3, mm7	;out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
-	movq		[ecx], mm1
-
-
-	movq		[ebx], mm3
-
-
-		
-/*******************************************************************/
-	
-
-	add			esi, 64
-	add			eax, 4
-
-/*******************************************************************/
-
-//    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
-//    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
-//    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
-//    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
-	movq		mm0, [esi+8*0]		;wsptr[0,0],[0,1],[0,2],[0,3]
-
-	movq		mm1, [esi+8*1]		;wsptr[0,4],[0,5],[0,6],[0,7]
-	movq		mm2, mm0
-	
-	movq		mm3, [esi+8*2]		;wsptr[1,0],[1,1],[1,2],[1,3]
-	paddw		mm0, mm1			;wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
-
-	movq		mm4, [esi+8*3]		;wsptr[1,4],[1,5],[1,6],[1,7]
-	psubw		mm2, mm1			;wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
-
-	movq		mm6, mm0
-	movq		mm5, mm3
-	
-	paddw		mm3, mm4			;wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
-	movq		mm1, mm2
-
-	psubw		mm5, mm4			;wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
-	punpcklwd	mm0, mm3			;wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
-
-	movq		mm7, [esi+8*7]		;wsptr[3,4],[3,5],[3,6],[3,7]
-	punpckhwd	mm6, mm3			;wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
-
-	movq		mm3, [esi+8*4]		;wsptr[2,0],[2,1],[2,2],[2,3]
-	punpckldq	mm0, mm6	;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
-
-	punpcklwd	mm1, mm5			;wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
-	movq		mm4, mm3
-
-	movq		mm6, [esi+8*6]		;wsptr[3,0],[3,1],[3,2],[3,3]
-	punpckhwd	mm2, mm5			;wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
-
-	movq		mm5, [esi+8*5]		;wsptr[2,4],[2,5],[2,6],[2,7]
-	punpckldq	mm1, mm2	;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
-
-	
-	paddw		mm3, mm5			;wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
-	movq		mm2, mm6
-
-	psubw		mm4, mm5			;wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
-	paddw		mm6, mm7			;wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
-
-	movq		mm5, mm3
-	punpcklwd	mm3, mm6			;wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
-	
-	psubw		mm2, mm7			;wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
-	punpckhwd	mm5, mm6			;wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
-
-	movq		mm7, mm4
-	punpckldq	mm3, mm5	;wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
-
-	punpcklwd	mm4, mm2			;wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
-
-	punpckhwd	mm7, mm2			;wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
-
-	punpckldq	mm4, mm7	;wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
-	movq		mm6, mm1
-
-//	mm0 = 	;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
-//	mm1 =	;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
-
-
-	movq		mm2, mm0
-	punpckhdq	mm6, mm4	;wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
-
-	punpckldq	mm1, mm4	;wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
-	psllw		mm6, 2
-
-	pmulhw		mm6, fix_141
-	punpckldq	mm0, mm3	;wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
-
-	punpckhdq	mm2, mm3	;wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
-	movq		mm7, mm0
-
-//    tmp0 = tmp10 + tmp13;
-//    tmp3 = tmp10 - tmp13;
-	paddw		mm0, mm2	;[0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
-	psubw		mm7, mm2	;[0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
-
-//    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
-	psubw		mm6, mm2	;wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
-//    tmp1 = tmp11 + tmp12;
-//    tmp2 = tmp11 - tmp12;
-	movq		mm5, mm1
-
-
-
-    /* Odd part */
-
-//    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
-//    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
-//    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
-//    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
-	movq		mm3, [esi+8*0]		;wsptr[0,0],[0,1],[0,2],[0,3]
-	paddw		mm1, mm6	;[0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
-
-	movq		mm4, [esi+8*1]		;wsptr[0,4],[0,5],[0,6],[0,7]
-	psubw		mm5, mm6	;[0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
-
-	movq		mm6, mm3
-	punpckldq	mm3, mm4			;wsptr[0,0],[0,1],[0,4],[0,5]
-
-	punpckhdq	mm4, mm6			;wsptr[0,6],[0,7],[0,2],[0,3]
-	movq		mm2, mm3
-
-//Save tmp0 and tmp1 in wsptr
-	movq		[esi+8*0], mm0		;save tmp0
-	paddw		mm2, mm4			;wsptr[xxx],[0,z11],[xxx],[0,z13]
-
-	
-//Continue with z10 --- z13
-	movq		mm6, [esi+8*2]		;wsptr[1,0],[1,1],[1,2],[1,3]
-	psubw		mm3, mm4			;wsptr[xxx],[0,z12],[xxx],[0,z10]
-
-	movq		mm0, [esi+8*3]		;wsptr[1,4],[1,5],[1,6],[1,7]
-	movq		mm4, mm6
-
-	movq		[esi+8*1], mm1		;save tmp1
-	punpckldq	mm6, mm0			;wsptr[1,0],[1,1],[1,4],[1,5]
-
-	punpckhdq	mm0, mm4			;wsptr[1,6],[1,7],[1,2],[1,3]
-	movq		mm1, mm6
-	
-//Save tmp2 and tmp3 in wsptr
-	paddw		mm6, mm0		;wsptr[xxx],[1,z11],[xxx],[1,z13]
-	movq		mm4, mm2
-	
-//Continue with z10 --- z13
-	movq		[esi+8*2], mm5		;save tmp2
-	punpcklwd	mm2, mm6		;wsptr[xxx],[xxx],[0,z11],[1,z11]
-
-	psubw		mm1, mm0		;wsptr[xxx],[1,z12],[xxx],[1,z10]
-	punpckhwd	mm4, mm6		;wsptr[xxx],[xxx],[0,z13],[1,z13]
-
-	movq		mm0, mm3
-	punpcklwd	mm3, mm1		;wsptr[xxx],[xxx],[0,z12],[1,z12]
-
-	movq		[esi+8*3], mm7		;save tmp3
-	punpckhwd	mm0, mm1		;wsptr[xxx],[xxx],[0,z10],[1,z10]
-
-	movq		mm6, [esi+8*4]		;wsptr[2,0],[2,1],[2,2],[2,3]
-	punpckhdq	mm0, mm2		;wsptr[0,z10],[1,z10],[0,z11],[1,z11]
-
-	movq		mm7, [esi+8*5]		;wsptr[2,4],[2,5],[2,6],[2,7]
-	punpckhdq	mm3, mm4		;wsptr[0,z12],[1,z12],[0,z13],[1,z13]
-
-	movq		mm1, [esi+8*6]		;wsptr[3,0],[3,1],[3,2],[3,3]
-	movq		mm4, mm6
-
-	punpckldq	mm6, mm7			;wsptr[2,0],[2,1],[2,4],[2,5]
-	movq		mm5, mm1
-
-	punpckhdq	mm7, mm4			;wsptr[2,6],[2,7],[2,2],[2,3]
-	movq		mm2, mm6
-	
-	movq		mm4, [esi+8*7]		;wsptr[3,4],[3,5],[3,6],[3,7]
-	paddw		mm6, mm7		;wsptr[xxx],[2,z11],[xxx],[2,z13]
-
-	psubw		mm2, mm7		;wsptr[xxx],[2,z12],[xxx],[2,z10]
-	punpckldq	mm1, mm4			;wsptr[3,0],[3,1],[3,4],[3,5]
-
-	punpckhdq	mm4, mm5			;wsptr[3,6],[3,7],[3,2],[3,3]
-	movq		mm7, mm1
-
-	paddw		mm1, mm4		;wsptr[xxx],[3,z11],[xxx],[3,z13]
-	psubw		mm7, mm4		;wsptr[xxx],[3,z12],[xxx],[3,z10]
-
-	movq		mm5, mm6
-	punpcklwd	mm6, mm1		;wsptr[xxx],[xxx],[2,z11],[3,z11]
-
-	punpckhwd	mm5, mm1		;wsptr[xxx],[xxx],[2,z13],[3,z13]
-	movq		mm4, mm2
-
-	punpcklwd	mm2, mm7		;wsptr[xxx],[xxx],[2,z12],[3,z12]
-
-	punpckhwd	mm4, mm7		;wsptr[xxx],[xxx],[2,z10],[3,z10]
-
-	punpckhdq	mm4, mm6		;wsptr[2,z10],[3,z10],[2,z11],[3,z11]
-
-	punpckhdq	mm2, mm5		;wsptr[2,z12],[3,z12],[2,z13],[3,z13]
-	movq		mm5, mm0
-
-	punpckldq	mm0, mm4		;wsptr[0,z10],[1,z10],[2,z10],[3,z10]
-
-	punpckhdq	mm5, mm4		;wsptr[0,z11],[1,z11],[2,z11],[3,z11]
-	movq		mm4, mm3
-
-	punpckhdq	mm4, mm2		;wsptr[0,z13],[1,z13],[2,z13],[3,z13]
-	movq		mm1, mm5
-
-	punpckldq	mm3, mm2		;wsptr[0,z12],[1,z12],[2,z12],[3,z12]
-//    tmp7 = z11 + z13;		/* phase 5 */
-//    tmp8 = z11 - z13;		/* phase 5 */
-	psubw		mm1, mm4		;tmp8
-
-	paddw		mm5, mm4		;tmp7
-//    tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
-	psllw		mm1, 2
-
-	psllw		mm0, 2
-
-	pmulhw		mm1, fix_141	;tmp21
-//    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065))  /* 2*(c2-c6) */
-//			+ MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
-	psllw		mm3, 2
-	movq		mm7, mm0
-
-	pmulhw		mm7, fix_n184
-	movq		mm6, mm3
-
-	movq		mm2, [esi+8*0]	;tmp0,final1
-
-	pmulhw		mm6, fix_108n184
-//	 tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
-//			+ MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
-	movq		mm4, mm2		;final1
-  
-	pmulhw		mm0, fix_184n261
-	paddw		mm2, mm5		;tmp0+tmp7,final1
-
-	pmulhw		mm3, fix_184
-	psubw		mm4, mm5		;tmp0-tmp7,final1
-
-//    tmp6 = tmp22 - tmp7;	/* phase 2 */
-	psraw		mm2, 5			;outptr[0,0],[1,0],[2,0],[3,0],final1
-
-	paddsw		mm2, const_0x0080	;final1
-	paddw		mm7, mm6			;tmp20
-	psraw		mm4, 5			;outptr[0,7],[1,7],[2,7],[3,7],final1
-
-	paddsw		mm4, const_0x0080	;final1
-	paddw		mm3, mm0			;tmp22
-
-//    tmp5 = tmp21 - tmp6;
-	psubw		mm3, mm5		;tmp6
-
-//    tmp4 = tmp20 + tmp5;
-	movq		mm0, [esi+8*1]		;tmp1,final2
-	psubw		mm1, mm3		;tmp5
-
-	movq		mm6, mm0			;final2
-	paddw		mm0, mm3		;tmp1+tmp6,final2
-
-    /* Final output stage: scale down by a factor of 8 and range-limit */
-
-
-//    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
-//			    & RANGE_MASK];	final1
-
-
-//    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
-//			    & RANGE_MASK];	final2
-	psubw		mm6, mm3		;tmp1-tmp6,final2
-	psraw		mm0, 5			;outptr[0,1],[1,1],[2,1],[3,1]
-
-	paddsw		mm0, const_0x0080
-	psraw		mm6, 5			;outptr[0,6],[1,6],[2,6],[3,6]
-	
-	paddsw		mm6, const_0x0080		;need to check this value
-	packuswb	mm0, mm4	;out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
-	
-	movq		mm5, [esi+8*2]		;tmp2,final3
-	packuswb	mm2, mm6	;out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
-
-//    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
-//			    & RANGE_MASK];	final3
-	paddw		mm7, mm1		;tmp4
-	movq		mm3, mm5
-
-	paddw		mm5, mm1		;tmp2+tmp5
-	psubw		mm3, mm1		;tmp2-tmp5
-
-	psraw		mm5, 5			;outptr[0,2],[1,2],[2,2],[3,2]
-
-	paddsw		mm5, const_0x0080
-	movq		mm4, [esi+8*3]		;tmp3,final4
-	psraw		mm3, 5			;outptr[0,5],[1,5],[2,5],[3,5]
-
-	paddsw		mm3, const_0x0080
-
-
-//    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
-//			    & RANGE_MASK];
-//    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
-//			    & RANGE_MASK];	final4
-	movq		mm6, mm4
-	paddw		mm4, mm7		;tmp3+tmp4
-
-	psubw		mm6, mm7		;tmp3-tmp4
-	psraw		mm4, 5			;outptr[0,4],[1,4],[2,4],[3,4]
-	mov			ecx, [eax]
-
-	paddsw		mm4, const_0x0080
-	psraw		mm6, 5			;outptr[0,3],[1,3],[2,3],[3,3]
-
-	paddsw		mm6, const_0x0080
-	packuswb	mm5, mm4	;out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
-
-	packuswb	mm6, mm3	;out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
-	movq		mm4, mm2
-
-	movq		mm7, mm5
-	punpcklbw	mm2, mm0	;out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
-
-	punpckhbw	mm4, mm0	;out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
-	movq		mm1, mm2
-
-	punpcklbw	mm5, mm6	;out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
-	add		 	eax, 4
-
-	punpckhbw	mm7, mm6	;out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
-
-	punpcklwd	mm2, mm5	;out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
-	add			ecx, output_col
-
-	movq		mm6, mm7
-	punpckhwd	mm1, mm5	;out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
-
-	movq		mm0, mm2
-	punpcklwd	mm6, mm4	;out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
-
-	mov			ebx, [eax]
-	punpckldq	mm2, mm6	;out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
-
-	add		 	eax, 4
-	movq		mm3, mm1
-
-	add			ebx, output_col 
-	punpckhwd	mm7, mm4	;out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
-	
-	movq		[ecx], mm2
-	punpckhdq	mm0, mm6	;out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
-
-	mov			ecx, [eax]
-	add		 	eax, 4
-	add			ecx, output_col
-
-	movq		[ebx], mm0
-	punpckldq	mm1, mm7	;out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
-
-	mov			ebx, [eax]
-
-	add			ebx, output_col
-	punpckhdq	mm3, mm7	;out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
-	movq		[ecx], mm1
-
-	movq		[ebx], mm3
-
-	emms
-	}
-}
-#endif
-
-#endif /* DCT_IFAST_SUPPORTED */
+/*
+ * jidctfst.c
+ *
+ * Copyright (C) 1994-1998, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains a fast, not so accurate integer implementation of the
+ * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
+ * must also perform dequantization of the input coefficients.
+ *
+ * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
+ * on each row (or vice versa, but it's more convenient to emit a row at
+ * a time).  Direct algorithms are also available, but they are much more
+ * complex and seem not to be any faster when reduced to code.
+ *
+ * This implementation is based on Arai, Agui, and Nakajima's algorithm for
+ * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
+ * Japanese, but the algorithm is described in the Pennebaker & Mitchell
+ * JPEG textbook (see REFERENCES section in file README).  The following code
+ * is based directly on figure 4-8 in P&M.
+ * While an 8-point DCT cannot be done in fewer than 11 multiplies, it is
+ * possible to arrange the computation so that many of the multiplies are
+ * simple scalings of the final outputs.  These multiplies can then be
+ * folded into the multiplications or divisions by the JPEG quantization
+ * table entries.  The AA&N method leaves only 5 multiplies and 29 adds
+ * to be done in the DCT itself.
+ * The primary disadvantage of this method is that with fixed-point math,
+ * accuracy is lost due to imprecise representation of the scaled
+ * quantization values.  The smaller the quantization table entry, the less
+ * precise the scaled value, so this implementation does worse with high-
+ * quality-setting files than with low-quality ones.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jdct.h"		/* Private declarations for DCT subsystem */
+
+#ifdef DCT_IFAST_SUPPORTED
+
+
+/*
+ * This module is specialized to the case DCTSIZE = 8.
+ */
+
+#if DCTSIZE != 8
+  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+#endif
+
+
+/* Scaling decisions are generally the same as in the LL&M algorithm;
+ * see jidctint.c for more details.  However, we choose to descale
+ * (right shift) multiplication products as soon as they are formed,
+ * rather than carrying additional fractional bits into subsequent additions.
+ * This compromises accuracy slightly, but it lets us save a few shifts.
+ * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
+ * everywhere except in the multiplications proper; this saves a good deal
+ * of work on 16-bit-int machines.
+ *
+ * The dequantized coefficients are not integers because the AA&N scaling
+ * factors have been incorporated.  We represent them scaled up by PASS1_BITS,
+ * so that the first and second IDCT rounds have the same input scaling.
+ * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
+ * avoid a descaling shift; this compromises accuracy rather drastically
+ * for small quantization table entries, but it saves a lot of shifts.
+ * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway,
+ * so we use a much larger scaling factor to preserve accuracy.
+ *
+ * A final compromise is to represent the multiplicative constants to only
+ * 8 fractional bits, rather than 13.  This saves some shifting work on some
+ * machines, and may also reduce the cost of multiplication (since there
+ * are fewer one-bits in the constants).
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define CONST_BITS  8		/* fractional bits in the FIX_* constants */
+#define PASS1_BITS  2		/* dequantized coefficients are scaled up by this */
+#else
+#define CONST_BITS  8		/* fractional bits in the FIX_* constants */
+#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
+#endif
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+ * causing a lot of useless floating-point operations at run time.
+ * To get around this we use the following pre-calculated constants.
+ * If you change CONST_BITS you may want to add appropriate values.
+ * (With a reasonable C compiler, you can just rely on the FIX() macro...)
+ */
+
+#if CONST_BITS == 8
+#define FIX_1_082392200  ((INT32)  277)		/* FIX(1.082392200) */
+#define FIX_1_414213562  ((INT32)  362)		/* FIX(1.414213562) */
+#define FIX_1_847759065  ((INT32)  473)		/* FIX(1.847759065) */
+#define FIX_2_613125930  ((INT32)  669)		/* FIX(2.613125930) */
+#else
+#define FIX_1_082392200  FIX(1.082392200)
+#define FIX_1_414213562  FIX(1.414213562)
+#define FIX_1_847759065  FIX(1.847759065)
+#define FIX_2_613125930  FIX(2.613125930)
+#endif
+
+
+/* We can gain a little more speed, with a further compromise in accuracy,
+ * by omitting the addition in a descaling shift.  This yields an incorrectly
+ * rounded result half the time...
+ */
+
+#ifndef USE_ACCURATE_ROUNDING
+#undef DESCALE
+#define DESCALE(x,n)  RIGHT_SHIFT(x, n)	/* shift only; rounding add omitted */
+#endif
+
+
+/* Multiply a DCTELEM variable by an INT32 constant, and immediately
+ * descale to yield a DCTELEM result.
+ */
+
+#define MULTIPLY(var,const)  ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
+
+
+/* Dequantize a coefficient by multiplying it by the multiplier-table
+ * entry; produce a DCTELEM result.  For 8-bit data a 16x16->16
+ * multiplication will do.  For 12-bit data, the multiplier table is
+ * declared INT32, so a 32-bit multiply will be used.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define DEQUANTIZE(coef,quantval)  (((IFAST_MULT_TYPE) (coef)) * (quantval))
+#else
+#define DEQUANTIZE(coef,quantval)  \
+	DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
+#endif
+
+
+/* Like DESCALE, but applies to a DCTELEM and produces an int.
+ * We assume that int right shift is unsigned if INT32 right shift is.
+ */
+
+#ifdef RIGHT_SHIFT_IS_UNSIGNED	/* negative inputs get one-bits OR'ed into the high bits */
+#define ISHIFT_TEMPS	DCTELEM ishift_temp;	/* scratch for IRIGHT_SHIFT */
+#if BITS_IN_JSAMPLE == 8
+#define DCTELEMBITS  16		/* DCTELEM may be 16 or 32 bits */
+#else
+#define DCTELEMBITS  32		/* DCTELEM must be 32 bits */
+#endif
+#define IRIGHT_SHIFT(x,shft)  \
+    ((ishift_temp = (x)) < 0 ? \
+     (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \
+     (ishift_temp >> (shft)))
+#else
+#define ISHIFT_TEMPS			/* no scratch variable in this case */
+#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
+#endif
+
+#ifdef USE_ACCURATE_ROUNDING	/* add 2**(n-1) before shifting */
+#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
+#else				/* shift without the rounding add */
+#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT(x, n))
+#endif
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ * Writes eight samples to each of DCTSIZE output_buf rows at output_col.
+ */
+
+GLOBAL(void)
+jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  DCTELEM tmp10, tmp11, tmp12, tmp13;
+  DCTELEM z5, z10, z11, z12, z13;
+  JCOEFPTR inptr;
+  IFAST_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[DCTSIZE2];	/* buffers data between passes */
+  SHIFT_TEMPS			/* for DESCALE */
+  ISHIFT_TEMPS			/* for IDESCALE */
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = DCTSIZE; ctr > 0; ctr--) {	/* one pass per column */
+    /* Due to quantization, we will usually find that many of the input
+     * coefficients are zero, especially the AC terms.  We can exploit this
+     * by short-circuiting the IDCT calculation for any column in which all
+     * the AC terms are zero.  In that case each output is equal to the
+     * DC coefficient (with scale factor as needed).
+     * With typical images and quantization tables, half or more of the
+     * column DCT calculations can be simplified this way.
+     */
+    
+    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
+	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
+	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
+	inptr[DCTSIZE*7] == 0) {
+      /* AC terms all zero */
+      int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+
+      wsptr[DCTSIZE*0] = dcval;
+      wsptr[DCTSIZE*1] = dcval;
+      wsptr[DCTSIZE*2] = dcval;
+      wsptr[DCTSIZE*3] = dcval;
+      wsptr[DCTSIZE*4] = dcval;
+      wsptr[DCTSIZE*5] = dcval;
+      wsptr[DCTSIZE*6] = dcval;
+      wsptr[DCTSIZE*7] = dcval;
+      
+      inptr++;			/* advance pointers to next column */
+      quantptr++;
+      wsptr++;
+      continue;			/* full column transform not needed */
+    }
+    
+    /* Even part: rows 0, 2, 4, 6 of this column */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp10 = tmp0 + tmp2;	/* phase 3 */
+    tmp11 = tmp0 - tmp2;
+
+    tmp13 = tmp1 + tmp3;	/* phases 5-3 */
+    tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
+
+    tmp0 = tmp10 + tmp13;	/* phase 2 */
+    tmp3 = tmp10 - tmp13;
+    tmp1 = tmp11 + tmp12;
+    tmp2 = tmp11 - tmp12;
+    
+    /* Odd part: rows 1, 3, 5, 7 of this column */
+
+    tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    z13 = tmp6 + tmp5;		/* phase 6 */
+    z10 = tmp6 - tmp5;
+    z11 = tmp4 + tmp7;
+    z12 = tmp4 - tmp7;
+
+    tmp7 = z11 + z13;		/* phase 5 */
+    tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
+
+    z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
+    tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
+    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
+
+    tmp6 = tmp12 - tmp7;	/* phase 2 */
+    tmp5 = tmp11 - tmp6;
+    tmp4 = tmp10 + tmp5;
+    /* Each sum/difference pair below fills mirrored rows k and 7-k. */
+    wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
+    wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
+    wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
+    wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
+    wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
+    wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
+    wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
+    wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
+
+    inptr++;			/* advance pointers to next column */
+    quantptr++;
+    wsptr++;
+  }
+  
+  /* Pass 2: process rows from work array, store into output array. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {	/* one pass per row */
+    outptr = output_buf[ctr] + output_col;
+    /* Rows of zeroes can be exploited in the same way as we did with columns.
+     * However, the column calculation has created many nonzero AC terms, so
+     * the simplification applies less often (typically 5% to 10% of the time).
+     * On machines with very fast multiplication, it's possible that the
+     * test takes more time than it's worth.  In that case this section
+     * may be commented out.
+     */
+    
+#ifndef NO_ZERO_ROW_TEST
+    if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
+	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
+      /* AC terms all zero */
+      JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
+				  & RANGE_MASK];
+      
+      outptr[0] = dcval;
+      outptr[1] = dcval;
+      outptr[2] = dcval;
+      outptr[3] = dcval;
+      outptr[4] = dcval;
+      outptr[5] = dcval;
+      outptr[6] = dcval;
+      outptr[7] = dcval;
+
+      wsptr += DCTSIZE;		/* advance pointer to next row */
+      continue;			/* full row transform not needed */
+    }
+#endif /* NO_ZERO_ROW_TEST */
+    
+    /* Even part: workspace elements 0, 2, 4, 6 of this row */
+
+    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
+    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
+
+    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
+    tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
+	    - tmp13;
+
+    tmp0 = tmp10 + tmp13;
+    tmp3 = tmp10 - tmp13;
+    tmp1 = tmp11 + tmp12;
+    tmp2 = tmp11 - tmp12;
+
+    /* Odd part: workspace elements 1, 3, 5, 7 of this row */
+
+    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
+    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
+    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
+    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
+
+    tmp7 = z11 + z13;		/* phase 5 */
+    tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
+
+    z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
+    tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
+    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
+
+    tmp6 = tmp12 - tmp7;	/* phase 2 */
+    tmp5 = tmp11 - tmp6;
+    tmp4 = tmp10 + tmp5;
+
+    /* Final output stage: scale down by a factor of 8 and range-limit */
+
+    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += DCTSIZE;		/* advance pointer to next row */
+  }
+}
+
+#endif /* DCT_IFAST_SUPPORTED */
diff --git a/jpeg/jidctint.c b/jpeg/jidctint.c
index df1041ed0245..a72b3207caf5 100644
--- a/jpeg/jidctint.c
+++ b/jpeg/jidctint.c
@@ -386,578 +386,4 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   }
 }
 
-
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
-
-/*
-* Intel SSE2 optimized Inverse Discrete Cosine Transform
-*
-*
-* Copyright (c) 2001-2002 Intel Corporation
-* All Rights Reserved
-*
-*
-*  Authors:
-*      Danilov G.
-*
-*
-*-----------------------------------------------------------------------------
-*
-* References:
-*    K.R. Rao and P. Yip
-*       Discrete Cosine Transform.
-*       Algorithms, Advantages, Applications.
-*       Academic Press, Inc, London, 1990.
-*    JPEG Group's software.
-*       This implementation is based on Appendix A.2 of the book (R&Y) ...
-*
-*-----------------------------------------------------------------------------
-*/
-
-typedef unsigned char   Ipp8u;
-typedef unsigned short  Ipp16u;
-typedef unsigned int    Ipp32u;
-
-typedef signed char    Ipp8s;
-typedef signed short   Ipp16s;
-typedef signed int     Ipp32s;
-
-#define BITS_INV_ACC  4			
-#define SHIFT_INV_ROW  16 - BITS_INV_ACC
-#define SHIFT_INV_COL 1 + BITS_INV_ACC
-
-#define RND_INV_ROW  1024 * (6 - BITS_INV_ACC)	/* 1 << (SHIFT_INV_ROW-1)		*/
-#define RND_INV_COL = 16 * (BITS_INV_ACC - 3)   /* 1 << (SHIFT_INV_COL-1)		*/
-#define RND_INV_CORR = RND_INV_COL - 1          /* correction -1.0 and round	*/
-
-#define c_inv_corr_0 -1024 * (6 - BITS_INV_ACC) + 65536		/* -0.5 + (16.0 or 32.0)	*/
-#define c_inv_corr_1 1877 * (6 - BITS_INV_ACC)				/* 0.9167	*/	
-#define c_inv_corr_2 1236 * (6 - BITS_INV_ACC)				/* 0.6035	*/					
-#define c_inv_corr_3 680  * (6 - BITS_INV_ACC)				/* 0.3322	*/
-#define c_inv_corr_4 0    * (6 - BITS_INV_ACC)				/* 0.0		*/	
-#define c_inv_corr_5 -569  * (6 - BITS_INV_ACC)				/* -0.278	*/
-#define c_inv_corr_6 -512  * (6 - BITS_INV_ACC)				/* -0.25	*/	
-#define c_inv_corr_7 -651  * (6 - BITS_INV_ACC)				/* -0.3176	*/	
-
-#define RND_INV_ROW_0 RND_INV_ROW + c_inv_corr_0
-#define RND_INV_ROW_1 RND_INV_ROW + c_inv_corr_1
-#define RND_INV_ROW_2 RND_INV_ROW + c_inv_corr_2
-#define RND_INV_ROW_3 RND_INV_ROW + c_inv_corr_3
-#define RND_INV_ROW_4 RND_INV_ROW + c_inv_corr_4
-#define RND_INV_ROW_5 RND_INV_ROW + c_inv_corr_5
-#define RND_INV_ROW_6 RND_INV_ROW + c_inv_corr_6
-#define RND_INV_ROW_7 RND_INV_ROW + c_inv_corr_7
-
-/* Table for rows 0,4 - constants are multiplied on cos_4_16 */
-
-__declspec(align(16)) short tab_i_04[] = { 
-	16384, 21407, 16384, 8867,		
-	-16384, 21407, 16384, -8867,	
-	16384,  -8867,  16384, -21407,  
-    16384,   8867, -16384, -21407,  
-    22725,  19266,  19266,  -4520,  
-    4520,  19266,  19266, -22725,   
-    12873, -22725,   4520, -12873,  
-    12873,   4520, -22725, -12873}; 
-
-/* Table for rows 1,7 - constants are multiplied on cos_1_16 */
-
-__declspec(align(16)) short tab_i_17[] = {
-	22725,  29692,  22725,  12299,   
-    -22725,  29692,  22725, -12299,  
-    22725, -12299,  22725, -29692,   
-    22725,  12299, -22725, -29692,   
-    31521,  26722,  26722,  -6270,   
-    6270,  26722,  26722, -31521,    
-    17855, -31521,   6270, -17855,   
-    17855,   6270, -31521, -17855};  
-
-/* Table for rows 2,6 - constants are multiplied on cos_2_16 */
-
-__declspec(align(16)) short tab_i_26[] = {
-	21407,  27969,  21407,  11585,	
-    -21407,  27969,  21407, -11585,	
-    21407, -11585,  21407, -27969,	
-    21407,  11585, -21407, -27969,	
-    29692,  25172,  25172,  -5906,	
-    5906,  25172,  25172, -29692,	
-    16819, -29692,   5906, -16819,	
-    16819,   5906, -29692, -16819};	
-
-/* Table for rows 3,5 - constants are multiplied on cos_3_16 */
-
-__declspec(align(16)) short tab_i_35[] = {
-	19266,  25172,  19266,  10426,	
-    -19266,  25172,  19266, -10426,	
-    19266, -10426,  19266, -25172,	
-    19266,  10426, -19266, -25172,	
-    26722,  22654,  22654,  -5315,	
-    5315,  22654,  22654, -26722,	
-    15137, -26722,   5315, -15137,	
-    15137,   5315, -26722, -15137};	
-	
-__declspec(align(16)) long round_i_0[] = {RND_INV_ROW_0,RND_INV_ROW_0,
-	RND_INV_ROW_0,RND_INV_ROW_0};
-__declspec(align(16)) long round_i_1[] = {RND_INV_ROW_1,RND_INV_ROW_1,
-	RND_INV_ROW_1,RND_INV_ROW_1};
-__declspec(align(16)) long round_i_2[] = {RND_INV_ROW_2,RND_INV_ROW_2,
-	RND_INV_ROW_2,RND_INV_ROW_2};
-__declspec(align(16)) long round_i_3[] = {RND_INV_ROW_3,RND_INV_ROW_3,
-	RND_INV_ROW_3,RND_INV_ROW_3};
-__declspec(align(16)) long round_i_4[] = {RND_INV_ROW_4,RND_INV_ROW_4,
-	RND_INV_ROW_4,RND_INV_ROW_4};
-__declspec(align(16)) long round_i_5[] = {RND_INV_ROW_5,RND_INV_ROW_5,
-	RND_INV_ROW_5,RND_INV_ROW_5};
-__declspec(align(16)) long round_i_6[] = {RND_INV_ROW_6,RND_INV_ROW_6,
-	RND_INV_ROW_6,RND_INV_ROW_6};
-__declspec(align(16)) long round_i_7[] = {RND_INV_ROW_7,RND_INV_ROW_7,
-	RND_INV_ROW_7,RND_INV_ROW_7};
-
-__declspec(align(16)) short tg_1_16[] = {
-	13036,  13036,  13036,  13036,	/* tg * (2<<16) + 0.5 */
-	13036,  13036,  13036,  13036};
-__declspec(align(16)) short tg_2_16[] = {
-	27146,  27146,  27146,  27146,	/* tg * (2<<16) + 0.5 */
-	27146,  27146,  27146,  27146};
-__declspec(align(16)) short tg_3_16[] = {
-	-21746, -21746, -21746, -21746,	/* tg * (2<<16) + 0.5 */
-	-21746, -21746, -21746, -21746};
-__declspec(align(16)) short cos_4_16[] = {
-	-19195, -19195, -19195, -19195,	/* cos * (2<<16) + 0.5 */
-	-19195, -19195, -19195, -19195};
-
-/*
-* In this implementation the outputs of the iDCT-1D are multiplied
-*    for rows 0,4 - on cos_4_16,
-*    for rows 1,7 - on cos_1_16,
-*    for rows 2,6 - on cos_2_16,
-*    for rows 3,5 - on cos_3_16
-* and are shifted to the left for rise of accuracy
-*
-* For used constants
-*    FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
-*
-*-----------------------------------------------------------------------------
-*
-* On the first stage the calculation is executed at once for two rows.
-* The permutation for each output row is done on second stage
-*    t7 t6 t5 t4 t3 t2 t1 t0 -> t4 t5 t6 t7 t3 t2 t1 t0
-*
-*-----------------------------------------------------------------------------
-*/
-	
-#define DCT_8_INV_ROW_2R(TABLE, ROUND1, ROUND2) __asm {	\
-	__asm pshuflw  xmm1, xmm0, 10001000b				\
-    __asm pshuflw  xmm0, xmm0, 11011101b    			\
-    __asm pshufhw  xmm1, xmm1, 10001000b    			\
-	__asm pshufhw  xmm0, xmm0, 11011101b				\
-	__asm movdqa   xmm2, XMMWORD PTR [TABLE]			\
-	__asm pmaddwd  xmm2, xmm1							\
-	__asm movdqa   xmm3, XMMWORD PTR [TABLE + 32]		\
-	__asm pmaddwd  xmm3, xmm0               			\
-	__asm pmaddwd  xmm1, XMMWORD PTR [TABLE + 16]		\
-	__asm pmaddwd  xmm0, XMMWORD PTR [TABLE + 48]		\
-	__asm pshuflw  xmm5, xmm4, 10001000b				\
-	__asm pshuflw  xmm4, xmm4, 11011101b    			\
-	__asm pshufhw  xmm5, xmm5, 10001000b    			\
-	__asm pshufhw  xmm4, xmm4, 11011101b    			\
-	__asm movdqa   xmm6, XMMWORD PTR [TABLE]			\
-	__asm pmaddwd  xmm6, xmm5               			\
-	__asm movdqa   xmm7, XMMWORD PTR [TABLE + 32]		\
-	__asm pmaddwd  xmm7, xmm4               			\
-	__asm pmaddwd  xmm5, XMMWORD PTR [TABLE + 16]		\
-	__asm pmaddwd  xmm4, XMMWORD PTR [TABLE + 48]		\
-	__asm pshufd   xmm1, xmm1, 01001110b    			\
-	__asm pshufd   xmm0, xmm0, 01001110b    			\
-	__asm paddd    xmm2, XMMWORD PTR [ROUND1]			\
-	__asm paddd    xmm3, xmm0							\
-	__asm paddd    xmm1, xmm2							\
-	__asm pshufd   xmm5, xmm5, 01001110b    			\
-	__asm pshufd   xmm4, xmm4, 01001110b    			\
-	__asm movdqa   xmm2, xmm1             				\
-	__asm psubd    xmm2, xmm3             				\
-	__asm psrad    xmm2, SHIFT_INV_ROW    				\
-	__asm paddd    xmm1, xmm3							\
-	__asm psrad    xmm1, SHIFT_INV_ROW      			\
-	__asm packssdw xmm1, xmm2							\
-	__asm paddd    xmm6, XMMWORD PTR [ROUND2]			\
-	__asm paddd    xmm7, xmm4							\
-	__asm paddd    xmm5, xmm6							\
-	__asm movdqa   xmm6, xmm5	            			\
-	__asm psubd    xmm6, xmm7               			\
-	__asm psrad    xmm6, SHIFT_INV_ROW      			\
-	__asm paddd    xmm5, xmm7							\
-	__asm psrad    xmm5, SHIFT_INV_ROW      			\
-	__asm packssdw xmm5, xmm6							\
-	}
-
-/*
-*
-* The second stage - inverse DCTs of columns
-*
-* The inputs are multiplied
-*    for rows 0,4 - on cos_4_16,
-*    for rows 1,7 - on cos_1_16,
-*    for rows 2,6 - on cos_2_16,
-*    for rows 3,5 - on cos_3_16
-* and are shifted to the left for rise of accuracy
-*/
-
-#define DCT_8_INV_COL_8R(INP, OUTP) __asm {		\
-	__asm movdqa   xmm0, [INP + 5*16]			\
-    __asm movdqa   xmm1, XMMWORD PTR tg_3_16	\
-    __asm movdqa   xmm2, xmm0            		\
-    __asm movdqa   xmm3, [INP + 3*16]   		\
-    __asm pmulhw   xmm0, xmm1           		\
-    __asm movdqa   xmm4, [INP + 7*16]   		\
-    __asm pmulhw   xmm1, xmm3           		\
-    __asm movdqa   xmm5, XMMWORD PTR tg_1_16   	\
-    __asm movdqa   xmm6, xmm4            		\
-    __asm pmulhw   xmm4, xmm5           		\
-    __asm paddsw   xmm0, xmm2           		\
-    __asm pmulhw   xmm5, [INP + 1*16]   		\
-    __asm paddsw   xmm1, xmm3           		\
-    __asm movdqa   xmm7, [INP + 6*16]    		\
-    __asm paddsw   xmm0, xmm3					\
-    __asm movdqa   xmm3, XMMWORD PTR tg_2_16	\
-    __asm psubsw   xmm2, xmm1					\
-    __asm pmulhw   xmm7, xmm3            		\
-    __asm movdqa   xmm1, xmm0            		\
-    __asm pmulhw   xmm3, [INP + 2*16]   		\
-    __asm psubsw   xmm5, xmm6					\
-    __asm paddsw   xmm4, [INP + 1*16]    		\
-    __asm paddsw   xmm0, xmm4            		\
-    __asm psubsw   xmm4, xmm1					\
-    __asm pshufhw  xmm0, xmm0, 00011011b		\
-    __asm paddsw   xmm7, [INP + 2*16]    		\
-    __asm movdqa   xmm6, xmm5					\
-    __asm psubsw   xmm3, [INP + 6*16]    		\
-    __asm psubsw   xmm5, xmm2            		\
-    __asm paddsw   xmm6, xmm2					\
-	__asm movdqa   [OUTP + 7*16], xmm0    		\
-    __asm movdqa   xmm1, xmm4            		\
-    __asm movdqa   xmm2, XMMWORD PTR cos_4_16  	\
-    __asm paddsw   xmm4, xmm5            		\
-    __asm movdqa   xmm0, XMMWORD PTR cos_4_16  	\
-    __asm pmulhw   xmm2, xmm4					\
-    __asm pshufhw  xmm6, xmm6, 00011011b		\
-    __asm movdqa   [OUTP + 3*16], xmm6    		\
-    __asm psubsw   xmm1, xmm5            		\
-    __asm movdqa   xmm6, [INP + 0*16]   		\
-    __asm pmulhw   xmm0, xmm1					\
-    __asm movdqa   xmm5, [INP + 4*16]    		\
-    __asm paddsw   xmm4, xmm2					\
-    __asm paddsw   xmm5, xmm6       			\
-    __asm psubsw   xmm6, [INP + 4*16]   		\
-    __asm paddsw   xmm0, xmm1					\
-    __asm pshufhw  xmm4, xmm4, 00011011b		\
-    __asm movdqa   xmm2, xmm5            		\
-    __asm paddsw   xmm5, xmm7            		\
-    __asm movdqa   xmm1, xmm6					\
-    __asm psubsw   xmm2, xmm7					\
-    __asm movdqa   xmm7, [OUTP + 7*16]    		\
-    __asm paddsw   xmm6, xmm3            		\
-    __asm pshufhw  xmm5, xmm5, 00011011b		\
-	__asm paddsw   xmm7, xmm5					\
-    __asm psubsw   xmm1, xmm3					\
-    __asm pshufhw  xmm6, xmm6, 00011011b		\
-	__asm movdqa   xmm3, xmm6					\
-    __asm paddsw   xmm6, xmm4            		\
-    __asm pshufhw  xmm2, xmm2, 00011011b		\
-    __asm psraw    xmm7, SHIFT_INV_COL   		\
-    __asm movdqa   [OUTP + 0*16], xmm7    		\
-    __asm movdqa   xmm7, xmm1            		\
-    __asm paddsw   xmm1, xmm0					\
-    __asm psraw    xmm6, SHIFT_INV_COL			\
-    __asm movdqa   [OUTP + 1*16], xmm6    		\
-    __asm pshufhw  xmm1, xmm1, 00011011b		\
-	__asm movdqa   xmm6, [OUTP + 3*16]			\
-    __asm psubsw   xmm7, xmm0            		\
-    __asm psraw    xmm1, SHIFT_INV_COL   		\
-    __asm movdqa   [OUTP + 2*16], xmm1    		\
-    __asm psubsw   xmm5, [OUTP + 7*16]			\
-    __asm paddsw   xmm6, xmm2            		\
-    __asm psubsw   xmm2, [OUTP + 3*16]			\
-    __asm psubsw   xmm3, xmm4            		\
-    __asm psraw    xmm7, SHIFT_INV_COL  		\
-    __asm pshufhw  xmm7, xmm7, 00011011b		\
-    __asm movdqa   [OUTP + 5*16], xmm7    		\
-    __asm psraw    xmm5, SHIFT_INV_COL			\
-    __asm movdqa   [OUTP + 7*16], xmm5    		\
-    __asm psraw    xmm6, SHIFT_INV_COL			\
-    __asm movdqa   [OUTP + 3*16], xmm6    		\
-    __asm psraw    xmm2, SHIFT_INV_COL			\
-    __asm movdqa   [OUTP + 4*16], xmm2    		\
-    __asm psraw    xmm3, SHIFT_INV_COL			\
-    __asm movdqa   [OUTP + 6*16], xmm3    		\
-	}
-
-/*
-*
-*  Name:      dct_8x8_inv_16s
-*  Purpose:   Inverse Discrete Cosine Transform 8x8 with
-*             2D buffer of short int data
-*  Context:
-*      void dct_8x8_inv_16s ( short *src, short *dst )
-*  Parameters:
-*      src  - Pointer to the source buffer
-*      dst  - Pointer to the destination buffer
-*
-*/
-
-GLOBAL(void)
-dct_8x8_inv_16s ( short *src, short *dst ) {
-	
-	__asm {
-
-		mov     ecx,  src
-		mov     edx,  dst
-
-		movdqa  xmm0, [ecx+0*16]
-		movdqa  xmm4, [ecx+4*16]
-		DCT_8_INV_ROW_2R(tab_i_04, round_i_0, round_i_4)
-		movdqa     [edx+0*16], xmm1 
-		movdqa     [edx+4*16], xmm5 
-
-		movdqa  xmm0, [ecx+1*16]
-		movdqa  xmm4, [ecx+7*16]
-		DCT_8_INV_ROW_2R(tab_i_17, round_i_1, round_i_7)
-		movdqa     [edx+1*16], xmm1 
-		movdqa     [edx+7*16], xmm5 
-
-		movdqa  xmm0, [ecx+3*16]
-		movdqa  xmm4, [ecx+5*16]
-		DCT_8_INV_ROW_2R(tab_i_35, round_i_3, round_i_5);
-		movdqa     [edx+3*16], xmm1 
-		movdqa     [edx+5*16], xmm5 
-
-		movdqa  xmm0, [ecx+2*16]
-		movdqa  xmm4, [ecx+6*16]
-		DCT_8_INV_ROW_2R(tab_i_26, round_i_2, round_i_6);
-		movdqa     [edx+2*16], xmm1
-		movdqa     [edx+6*16], xmm5    
-
-		DCT_8_INV_COL_8R(edx+0, edx+0);
-	}
-}
-
-
-/*
-*  Name:
-*    ownpj_QuantInv_8x8_16s
-*
-*  Purpose:
-*    Dequantize 8x8 block of DCT coefficients
-*
-*  Context:
-*    void ownpj_QuantInv_8x8_16s
-*            Ipp16s*  pSrc,
-*            Ipp16s*  pDst,
-*      const Ipp16u*  pQTbl)*
-*
-*/
-
-GLOBAL(void)
-ownpj_QuantInv_8x8_16s(short * pSrc, short * pDst, const unsigned short * pQTbl)
-{
-	__asm {
-
-		push        ebx
-		push        ecx
-		push        edx
-		push        esi
-		push        edi
-
-		mov         esi, pSrc
-		mov         edi, pDst
-		mov         edx, pQTbl
-		mov         ecx, 4
-		mov         ebx, 32
-
-	again:
-
-		movq        mm0, QWORD PTR [esi+0]
-		movq        mm1, QWORD PTR [esi+8]
-		movq        mm2, QWORD PTR [esi+16]
-		movq        mm3, QWORD PTR [esi+24]
-
-		prefetcht0  [esi+ebx] ; fetch next cache line
-
-		pmullw      mm0, QWORD PTR [edx+0]
-		pmullw      mm1, QWORD PTR [edx+8]
-		pmullw      mm2, QWORD PTR [edx+16]
-		pmullw      mm3, QWORD PTR [edx+24]
-
-		movq        QWORD PTR [edi+0], mm0
-		movq        QWORD PTR [edi+8], mm1
-		movq        QWORD PTR [edi+16], mm2
-		movq        QWORD PTR [edi+24], mm3
-
-		add         esi, ebx
-		add         edi, ebx
-		add         edx, ebx
-		dec         ecx
-		jnz         again
-
-		emms
-
-		pop         edi
-		pop         esi
-		pop         edx
-		pop         ecx
-		pop         ebx
-	}
-}
-
-
-/*
-*  Name:
-*    ownpj_Add128_8x8_16s8u
-*
-*  Purpose:
-*    signed to unsigned conversion (level shift)
-*    for 8x8 block of DCT coefficients
-*
-*  Context:
-*    void ownpj_Add128_8x8_16s8u
-*      const Ipp16s* pSrc,
-*            Ipp8u*  pDst,
-*            int     DstStep);
-*
-*/
-
-__declspec(align(16)) long const_128[]= {0x00800080, 0x00800080, 0x00800080, 0x00800080};
-
-GLOBAL(void)
-ownpj_Add128_8x8_16s8u(const short * pSrc, unsigned char * pDst, int DstStep)
-{
-	__asm {
-		push        eax
-		push        ebx
-		push        ecx
-		push        edx
-		push        esi
-		push        edi
-
-		mov         esi, pSrc
-		mov         edi, pDst
-		mov         edx, DstStep
-		mov         ecx, 2
-		mov         ebx, edx
-		mov         eax, edx
-		sal         ebx, 1
-		add         eax, ebx
-		movdqa      xmm7, XMMWORD PTR const_128
-
-	again:
-
-		movdqa      xmm0, XMMWORD PTR [esi+0]  ; line 0
-		movdqa      xmm1, XMMWORD PTR [esi+16] ; line 1
-		movdqa      xmm2, XMMWORD PTR [esi+32] ; line 2
-		movdqa      xmm3, XMMWORD PTR [esi+48] ; line 3
-
-		paddw     xmm0, xmm7
-		paddw     xmm1, xmm7
-		paddw     xmm2, xmm7
-		paddw     xmm3, xmm7
-
-		packuswb  xmm0, xmm1
-		packuswb  xmm2, xmm3
-
-		movq      QWORD PTR [edi], xmm0      ;0*DstStep
-		movq      QWORD PTR [edi+ebx], xmm2  ;2*DstStep
-
-		psrldq      xmm0, 8
-		psrldq      xmm2, 8
-
-		movq      QWORD PTR [edi+edx], xmm0  ;1*DstStep
-		movq      QWORD PTR [edi+eax], xmm2  ;3*DstStep
-
-		add         edi, ebx
-		add         esi, 64
-		add         edi, ebx
-		dec         ecx
-		jnz         again
-
-		pop         edi
-		pop         esi
-		pop         edx
-		pop         ecx
-		pop         ebx
-		pop         eax
-	}
-}
-
-
-/* 
-*  Name:
-*    ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R
-*
-*  Purpose:
-*    Inverse DCT transform, de-quantization and level shift
-*
-*  Parameters:
-*    pSrc               - pointer to source
-*    pDst               - pointer to output array
-*    DstStep            - line offset for output data
-*    pEncoderQuantTable - pointer to Quantization table
-*
-*/
-
-GLOBAL(void)
-ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(
-  short * pSrc,
-  unsigned char *  pDst,
-  int     DstStep,
-  const unsigned short * pQuantInvTable)
-{
-
-	__declspec(align(16)) Ipp8u buf[DCTSIZE2*sizeof(Ipp16s)];
-	Ipp16s * workbuf = (Ipp16s *)buf;	
-
-	ownpj_QuantInv_8x8_16s(pSrc,workbuf,pQuantInvTable);
-	dct_8x8_inv_16s(workbuf,workbuf);
-	ownpj_Add128_8x8_16s8u(workbuf,pDst,DstStep);
-  
-} 
-
-GLOBAL(void)
-jpeg_idct_islow_sse2 (
-	j_decompress_ptr cinfo, 
-	jpeg_component_info * compptr,
-	JCOEFPTR coef_block,
-	JSAMPARRAY output_buf, 
-	JDIMENSION output_col)
-{
-	int			ctr;
-	JCOEFPTR	inptr;
-	Ipp16u*		quantptr;
-	Ipp8u*		wsptr;
-	__declspec(align(16)) Ipp8u workspace[DCTSIZE2];  	
-	JSAMPROW	outptr;
-
-	inptr = coef_block;
-	quantptr = (Ipp16u*)compptr->dct_table;
-	wsptr = workspace;
-	
-	ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(inptr, workspace, 8, quantptr);
-
-	for(ctr = 0; ctr < DCTSIZE; ctr++)
-	{
-		outptr = output_buf[ctr] + output_col;
-
-		outptr[0] = wsptr[0];
-		outptr[1] = wsptr[1];
-		outptr[2] = wsptr[2];
-		outptr[3] = wsptr[3];
-		outptr[4] = wsptr[4];
-		outptr[5] = wsptr[5];
-		outptr[6] = wsptr[6];
-		outptr[7] = wsptr[7];
-
-		wsptr += DCTSIZE;
-	}
-}
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
-
 #endif /* DCT_ISLOW_SUPPORTED */
diff --git a/jpeg/jmemansi.c b/jpeg/jmemansi.c
deleted file mode 100644
index 2d93e496251c..000000000000
--- a/jpeg/jmemansi.c
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * jmemansi.c
- *
- * Copyright (C) 1992-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file provides a simple generic implementation of the system-
- * dependent portion of the JPEG memory manager.  This implementation
- * assumes that you have the ANSI-standard library routine tmpfile().
- * Also, the problem of determining the amount of memory available
- * is shoved onto the user.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h"		/* import the system-dependent declarations */
-
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
-#endif
-
-#ifndef SEEK_SET		/* pre-ANSI systems may not define this; */
-#define SEEK_SET  0		/* if not, assume 0 is correct */
-#endif
-
-
-/*
- * Memory allocation and freeing are controlled by the regular library
- * routines malloc() and free().
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * "Large" objects are treated the same as "small" ones.
- * NB: although we include FAR keywords in the routine declarations,
- * this file won't actually work in 80x86 small/medium model; at least,
- * you probably won't be able to process useful-size images in only 64KB.
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void FAR *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- * It's impossible to do this in a portable way; our current solution is
- * to make the user tell us (with a default value set at compile time).
- * If you can actually get the available space, it's a good idea to subtract
- * a slop factor of 5% or so.
- */
-
-#ifndef DEFAULT_MAX_MEM		/* so can override from makefile */
-#define DEFAULT_MAX_MEM		1000000L /* default: one megabyte */
-#endif
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
-		    long max_bytes_needed, long already_allocated)
-{
-  return cinfo->mem->max_memory_to_use - already_allocated;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed.  You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-
-METHODDEF(void)
-read_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		    void FAR * buffer_address,
-		    long file_offset, long byte_count)
-{
-  if (fseek(info->temp_file, file_offset, SEEK_SET))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  if (JFREAD(info->temp_file, buffer_address, byte_count)
-      != (size_t) byte_count)
-    ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		     void FAR * buffer_address,
-		     long file_offset, long byte_count)
-{
-  if (fseek(info->temp_file, file_offset, SEEK_SET))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  if (JFWRITE(info->temp_file, buffer_address, byte_count)
-      != (size_t) byte_count)
-    ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_backing_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  fclose(info->temp_file);
-  /* Since this implementation uses tmpfile() to create the file,
-   * no explicit file deletion is needed.
-   */
-}
-
-
-/*
- * Initial opening of a backing-store object.
- *
- * This version uses tmpfile(), which constructs a suitable file name
- * behind the scenes.  We don't have to use info->temp_name[] at all;
- * indeed, we can't even find out the actual name of the temp file.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-			 long total_bytes_needed)
-{
-  if ((info->temp_file = tmpfile()) == NULL)
-    ERREXITS(cinfo, JERR_TFILE_CREATE, "");
-  info->read_backing_store = read_backing_store;
-  info->write_backing_store = write_backing_store;
-  info->close_backing_store = close_backing_store;
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
-  return DEFAULT_MAX_MEM;	/* default for max_memory_to_use */
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
-  /* no work */
-}
diff --git a/jpeg/jmemdos.c b/jpeg/jmemdos.c
deleted file mode 100644
index 60b45c693884..000000000000
--- a/jpeg/jmemdos.c
+++ /dev/null
@@ -1,638 +0,0 @@
-/*
- * jmemdos.c
- *
- * Copyright (C) 1992-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file provides an MS-DOS-compatible implementation of the system-
- * dependent portion of the JPEG memory manager.  Temporary data can be
- * stored in extended or expanded memory as well as in regular DOS files.
- *
- * If you use this file, you must be sure that NEED_FAR_POINTERS is defined
- * if you compile in a small-data memory model; it should NOT be defined if
- * you use a large-data memory model.  This file is not recommended if you
- * are using a flat-memory-space 386 environment such as DJGCC or Watcom C.
- * Also, this code will NOT work if struct fields are aligned on greater than
- * 2-byte boundaries.
- *
- * Based on code contributed by Ge' Weijers.
- */
-
-/*
- * If you have both extended and expanded memory, you may want to change the
- * order in which they are tried in jopen_backing_store.  On a 286 machine
- * expanded memory is usually faster, since extended memory access involves
- * an expensive protected-mode-and-back switch.  On 386 and better, extended
- * memory is usually faster.  As distributed, the code tries extended memory
- * first (what? not everyone has a 386? :-).
- *
- * You can disable use of extended/expanded memory entirely by altering these
- * definitions or overriding them from the Makefile (eg, -DEMS_SUPPORTED=0).
- */
-
-#ifndef XMS_SUPPORTED
-#define XMS_SUPPORTED  1
-#endif
-#ifndef EMS_SUPPORTED
-#define EMS_SUPPORTED  1
-#endif
-
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h"		/* import the system-dependent declarations */
-
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare these */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
-extern char * getenv JPP((const char * name));
-#endif
-
-#ifdef NEED_FAR_POINTERS
-
-#ifdef __TURBOC__
-/* These definitions work for Borland C (Turbo C) */
-#include <alloc.h>		/* need farmalloc(), farfree() */
-#define far_malloc(x)	farmalloc(x)
-#define far_free(x)	farfree(x)
-#else
-/* These definitions work for Microsoft C and compatible compilers */
-#include <malloc.h>		/* need _fmalloc(), _ffree() */
-#define far_malloc(x)	_fmalloc(x)
-#define far_free(x)	_ffree(x)
-#endif
-
-#else /* not NEED_FAR_POINTERS */
-
-#define far_malloc(x)	malloc(x)
-#define far_free(x)	free(x)
-
-#endif /* NEED_FAR_POINTERS */
-
-#ifdef DONT_USE_B_MODE		/* define mode parameters for fopen() */
-#define READ_BINARY	"r"
-#else
-#define READ_BINARY	"rb"
-#endif
-
-#ifndef USE_MSDOS_MEMMGR	/* make sure user got configuration right */
-  You forgot to define USE_MSDOS_MEMMGR in jconfig.h. /* deliberate syntax error */
-#endif
-
-#if MAX_ALLOC_CHUNK >= 65535L	/* make sure jconfig.h got this right */
-  MAX_ALLOC_CHUNK should be less than 64K. /* deliberate syntax error */
-#endif
-
-
-/*
- * Declarations for assembly-language support routines (see jmemdosa.asm).
- *
- * The functions are declared "far" as are all their pointer arguments;
- * this ensures the assembly source code will work regardless of the
- * compiler memory model.  We assume "short" is 16 bits, "long" is 32.
- */
-
-typedef void far * XMSDRIVER;	/* actually a pointer to code */
-typedef struct {		/* registers for calling XMS driver */
-	unsigned short ax, dx, bx;
-	void far * ds_si;
-      } XMScontext;
-typedef struct {		/* registers for calling EMS driver */
-	unsigned short ax, dx, bx;
-	void far * ds_si;
-      } EMScontext;
-
-extern short far jdos_open JPP((short far * handle, char far * filename));
-extern short far jdos_close JPP((short handle));
-extern short far jdos_seek JPP((short handle, long offset));
-extern short far jdos_read JPP((short handle, void far * buffer,
-				unsigned short count));
-extern short far jdos_write JPP((short handle, void far * buffer,
-				 unsigned short count));
-extern void far jxms_getdriver JPP((XMSDRIVER far *));
-extern void far jxms_calldriver JPP((XMSDRIVER, XMScontext far *));
-extern short far jems_available JPP((void));
-extern void far jems_calldriver JPP((EMScontext far *));
-
-
-/*
- * Selection of a file name for a temporary file.
- * This is highly system-dependent, and you may want to customize it.
- */
-
-static int next_file_num;	/* to distinguish among several temp files */
-
-LOCAL(void)
-select_file_name (char * fname)
-{
-  const char * env;
-  char * ptr;
-  FILE * tfile;
-
-  /* Keep generating file names till we find one that's not in use */
-  for (;;) {
-    /* Get temp directory name from environment TMP or TEMP variable;
-     * if none, use "."
-     */
-    if ((env = (const char *) getenv("TMP")) == NULL)
-      if ((env = (const char *) getenv("TEMP")) == NULL)
-	env = ".";
-    if (*env == '\0')		/* null string means "." */
-      env = ".";
-    ptr = fname;		/* copy name to fname */
-    while (*env != '\0')
-      *ptr++ = *env++;
-    if (ptr[-1] != '\\' && ptr[-1] != '/')
-      *ptr++ = '\\';		/* append backslash if not in env variable */
-    /* Append a suitable file name */
-    next_file_num++;		/* advance counter */
-    sprintf(ptr, "JPG%03d.TMP", next_file_num);
-    /* Probe to see if file name is already in use */
-    if ((tfile = fopen(fname, READ_BINARY)) == NULL)
-      break;
-    fclose(tfile);		/* oops, it's there; close tfile & try again */
-  }
-}
-
-
-/*
- * Near-memory allocation and freeing are controlled by the regular library
- * routines malloc() and free().
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * "Large" objects are allocated in far memory, if possible
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void FAR *) far_malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
-  far_free(object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- * It's impossible to do this in a portable way; our current solution is
- * to make the user tell us (with a default value set at compile time).
- * If you can actually get the available space, it's a good idea to subtract
- * a slop factor of 5% or so.
- */
-
-#ifndef DEFAULT_MAX_MEM		/* so can override from makefile */
-#define DEFAULT_MAX_MEM		300000L /* for total usage about 450K */
-#endif
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
-		    long max_bytes_needed, long already_allocated)
-{
-  return cinfo->mem->max_memory_to_use - already_allocated;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed.  You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-/*
- * For MS-DOS we support three types of backing storage:
- *   1. Conventional DOS files.  We access these by direct DOS calls rather
- *      than via the stdio package.  This provides a bit better performance,
- *      but the real reason is that the buffers to be read or written are FAR.
- *      The stdio library for small-data memory models can't cope with that.
- *   2. Extended memory, accessed per the XMS V2.0 specification.
- *   3. Expanded memory, accessed per the LIM/EMS 4.0 specification.
- * You'll need copies of those specs to make sense of the related code.
- * The specs are available by Internet FTP from the SIMTEL archives 
- * (oak.oakland.edu and its various mirror sites).  See files
- * pub/msdos/microsoft/xms20.arc and pub/msdos/info/limems41.zip.
- */
-
-
-/*
- * Access methods for a DOS file.
- */
-
-
-METHODDEF(void)
-read_file_store (j_common_ptr cinfo, backing_store_ptr info,
-		 void FAR * buffer_address,
-		 long file_offset, long byte_count)
-{
-  if (jdos_seek(info->handle.file_handle, file_offset))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  /* Since MAX_ALLOC_CHUNK is less than 64K, byte_count will be too. */
-  if (byte_count > 65535L)	/* safety check */
-    ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
-  if (jdos_read(info->handle.file_handle, buffer_address,
-		(unsigned short) byte_count))
-    ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_file_store (j_common_ptr cinfo, backing_store_ptr info,
-		  void FAR * buffer_address,
-		  long file_offset, long byte_count)
-{
-  if (jdos_seek(info->handle.file_handle, file_offset))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  /* Since MAX_ALLOC_CHUNK is less than 64K, byte_count will be too. */
-  if (byte_count > 65535L)	/* safety check */
-    ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
-  if (jdos_write(info->handle.file_handle, buffer_address,
-		 (unsigned short) byte_count))
-    ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_file_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  jdos_close(info->handle.file_handle);	/* close the file */
-  remove(info->temp_name);	/* delete the file */
-/* If your system doesn't have remove(), try unlink() instead.
- * remove() is the ANSI-standard name for this function, but
- * unlink() was more common in pre-ANSI systems.
- */
-  TRACEMSS(cinfo, 1, JTRC_TFILE_CLOSE, info->temp_name);
-}
-
-
-LOCAL(boolean)
-open_file_store (j_common_ptr cinfo, backing_store_ptr info,
-		 long total_bytes_needed)
-{
-  short handle;
-
-  select_file_name(info->temp_name);
-  if (jdos_open((short far *) & handle, (char far *) info->temp_name)) {
-    /* might as well exit since jpeg_open_backing_store will fail anyway */
-    ERREXITS(cinfo, JERR_TFILE_CREATE, info->temp_name);
-    return FALSE;
-  }
-  info->handle.file_handle = handle;
-  info->read_backing_store = read_file_store;
-  info->write_backing_store = write_file_store;
-  info->close_backing_store = close_file_store;
-  TRACEMSS(cinfo, 1, JTRC_TFILE_OPEN, info->temp_name);
-  return TRUE;			/* succeeded */
-}
-
-
-/*
- * Access methods for extended memory.
- */
-
-#if XMS_SUPPORTED
-
-static XMSDRIVER xms_driver;	/* saved address of XMS driver */
-
-typedef union {			/* either long offset or real-mode pointer */
-	long offset;
-	void far * ptr;
-      } XMSPTR;
-
-typedef struct {		/* XMS move specification structure */
-	long length;
-	XMSH src_handle;
-	XMSPTR src;
-	XMSH dst_handle;
-	XMSPTR dst;
-      } XMSspec;
-
-#define ODD(X)	(((X) & 1L) != 0)
-
-
-METHODDEF(void)
-read_xms_store (j_common_ptr cinfo, backing_store_ptr info,
-		void FAR * buffer_address,
-		long file_offset, long byte_count)
-{
-  XMScontext ctx;
-  XMSspec spec;
-  char endbuffer[2];
-
-  /* The XMS driver can't cope with an odd length, so handle the last byte
-   * specially if byte_count is odd.  We don't expect this to be common.
-   */
-
-  spec.length = byte_count & (~ 1L);
-  spec.src_handle = info->handle.xms_handle;
-  spec.src.offset = file_offset;
-  spec.dst_handle = 0;
-  spec.dst.ptr = buffer_address;
-  
-  ctx.ds_si = (void far *) & spec;
-  ctx.ax = 0x0b00;		/* EMB move */
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  if (ctx.ax != 1)
-    ERREXIT(cinfo, JERR_XMS_READ);
-
-  if (ODD(byte_count)) {
-    read_xms_store(cinfo, info, (void FAR *) endbuffer,
-		   file_offset + byte_count - 1L, 2L);
-    ((char FAR *) buffer_address)[byte_count - 1L] = endbuffer[0];
-  }
-}
-
-
-METHODDEF(void)
-write_xms_store (j_common_ptr cinfo, backing_store_ptr info,
-		 void FAR * buffer_address,
-		 long file_offset, long byte_count)
-{
-  XMScontext ctx;
-  XMSspec spec;
-  char endbuffer[2];
-
-  /* The XMS driver can't cope with an odd length, so handle the last byte
-   * specially if byte_count is odd.  We don't expect this to be common.
-   */
-
-  spec.length = byte_count & (~ 1L);
-  spec.src_handle = 0;
-  spec.src.ptr = buffer_address;
-  spec.dst_handle = info->handle.xms_handle;
-  spec.dst.offset = file_offset;
-
-  ctx.ds_si = (void far *) & spec;
-  ctx.ax = 0x0b00;		/* EMB move */
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  if (ctx.ax != 1)
-    ERREXIT(cinfo, JERR_XMS_WRITE);
-
-  if (ODD(byte_count)) {
-    read_xms_store(cinfo, info, (void FAR *) endbuffer,
-		   file_offset + byte_count - 1L, 2L);
-    endbuffer[0] = ((char FAR *) buffer_address)[byte_count - 1L];
-    write_xms_store(cinfo, info, (void FAR *) endbuffer,
-		    file_offset + byte_count - 1L, 2L);
-  }
-}
-
-
-METHODDEF(void)
-close_xms_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  XMScontext ctx;
-
-  ctx.dx = info->handle.xms_handle;
-  ctx.ax = 0x0a00;
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  TRACEMS1(cinfo, 1, JTRC_XMS_CLOSE, info->handle.xms_handle);
-  /* we ignore any error return from the driver */
-}
-
-
-LOCAL(boolean)
-open_xms_store (j_common_ptr cinfo, backing_store_ptr info,
-		long total_bytes_needed)
-{
-  XMScontext ctx;
-
-  /* Get address of XMS driver */
-  jxms_getdriver((XMSDRIVER far *) & xms_driver);
-  if (xms_driver == NULL)
-    return FALSE;		/* no driver to be had */
-
-  /* Get version number, must be >= 2.00 */
-  ctx.ax = 0x0000;
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  if (ctx.ax < (unsigned short) 0x0200)
-    return FALSE;
-
-  /* Try to get space (expressed in kilobytes) */
-  ctx.dx = (unsigned short) ((total_bytes_needed + 1023L) >> 10);
-  ctx.ax = 0x0900;
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  if (ctx.ax != 1)
-    return FALSE;
-
-  /* Succeeded, save the handle and away we go */
-  info->handle.xms_handle = ctx.dx;
-  info->read_backing_store = read_xms_store;
-  info->write_backing_store = write_xms_store;
-  info->close_backing_store = close_xms_store;
-  TRACEMS1(cinfo, 1, JTRC_XMS_OPEN, ctx.dx);
-  return TRUE;			/* succeeded */
-}
-
-#endif /* XMS_SUPPORTED */
-
-
-/*
- * Access methods for expanded memory.
- */
-
-#if EMS_SUPPORTED
-
-/* The EMS move specification structure requires word and long fields aligned
- * at odd byte boundaries.  Some compilers will align struct fields at even
- * byte boundaries.  While it's usually possible to force byte alignment,
- * that causes an overall performance penalty and may pose problems in merging
- * JPEG into a larger application.  Instead we accept some rather dirty code
- * here.  Note this code would fail if the hardware did not allow odd-byte
- * word & long accesses, but all 80x86 CPUs do.
- */
-
-typedef void far * EMSPTR;
-
-typedef union {			/* EMS move specification structure */
-	long length;		/* It's easy to access first 4 bytes */
-	char bytes[18];		/* Misaligned fields in here! */
-      } EMSspec;
-
-/* Macros for accessing misaligned fields */
-#define FIELD_AT(spec,offset,type)  (*((type *) &(spec.bytes[offset])))
-#define SRC_TYPE(spec)		FIELD_AT(spec,4,char)
-#define SRC_HANDLE(spec)	FIELD_AT(spec,5,EMSH)
-#define SRC_OFFSET(spec)	FIELD_AT(spec,7,unsigned short)
-#define SRC_PAGE(spec)		FIELD_AT(spec,9,unsigned short)
-#define SRC_PTR(spec)		FIELD_AT(spec,7,EMSPTR)
-#define DST_TYPE(spec)		FIELD_AT(spec,11,char)
-#define DST_HANDLE(spec)	FIELD_AT(spec,12,EMSH)
-#define DST_OFFSET(spec)	FIELD_AT(spec,14,unsigned short)
-#define DST_PAGE(spec)		FIELD_AT(spec,16,unsigned short)
-#define DST_PTR(spec)		FIELD_AT(spec,14,EMSPTR)
-
-#define EMSPAGESIZE	16384L	/* gospel, see the EMS specs */
-
-#define HIBYTE(W)  (((W) >> 8) & 0xFF)
-#define LOBYTE(W)  ((W) & 0xFF)
-
-
-METHODDEF(void)
-read_ems_store (j_common_ptr cinfo, backing_store_ptr info,
-		void FAR * buffer_address,
-		long file_offset, long byte_count)
-{
-  EMScontext ctx;
-  EMSspec spec;
-
-  spec.length = byte_count;
-  SRC_TYPE(spec) = 1;
-  SRC_HANDLE(spec) = info->handle.ems_handle;
-  SRC_PAGE(spec)   = (unsigned short) (file_offset / EMSPAGESIZE);
-  SRC_OFFSET(spec) = (unsigned short) (file_offset % EMSPAGESIZE);
-  DST_TYPE(spec) = 0;
-  DST_HANDLE(spec) = 0;
-  DST_PTR(spec)    = buffer_address;
-  
-  ctx.ds_si = (void far *) & spec;
-  ctx.ax = 0x5700;		/* move memory region */
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0)
-    ERREXIT(cinfo, JERR_EMS_READ);
-}
-
-
-METHODDEF(void)
-write_ems_store (j_common_ptr cinfo, backing_store_ptr info,
-		 void FAR * buffer_address,
-		 long file_offset, long byte_count)
-{
-  EMScontext ctx;
-  EMSspec spec;
-
-  spec.length = byte_count;
-  SRC_TYPE(spec) = 0;
-  SRC_HANDLE(spec) = 0;
-  SRC_PTR(spec)    = buffer_address;
-  DST_TYPE(spec) = 1;
-  DST_HANDLE(spec) = info->handle.ems_handle;
-  DST_PAGE(spec)   = (unsigned short) (file_offset / EMSPAGESIZE);
-  DST_OFFSET(spec) = (unsigned short) (file_offset % EMSPAGESIZE);
-  
-  ctx.ds_si = (void far *) & spec;
-  ctx.ax = 0x5700;		/* move memory region */
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0)
-    ERREXIT(cinfo, JERR_EMS_WRITE);
-}
-
-
-METHODDEF(void)
-close_ems_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  EMScontext ctx;
-
-  ctx.ax = 0x4500;
-  ctx.dx = info->handle.ems_handle;
-  jems_calldriver((EMScontext far *) & ctx);
-  TRACEMS1(cinfo, 1, JTRC_EMS_CLOSE, info->handle.ems_handle);
-  /* we ignore any error return from the driver */
-}
-
-
-LOCAL(boolean)
-open_ems_store (j_common_ptr cinfo, backing_store_ptr info,
-		long total_bytes_needed)
-{
-  EMScontext ctx;
-
-  /* Is EMS driver there? */
-  if (! jems_available())
-    return FALSE;
-
-  /* Get status, make sure EMS is OK */
-  ctx.ax = 0x4000;
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0)
-    return FALSE;
-
-  /* Get version, must be >= 4.0 */
-  ctx.ax = 0x4600;
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0 || LOBYTE(ctx.ax) < 0x40)
-    return FALSE;
-
-  /* Try to allocate requested space */
-  ctx.ax = 0x4300;
-  ctx.bx = (unsigned short) ((total_bytes_needed + EMSPAGESIZE-1L) / EMSPAGESIZE);
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0)
-    return FALSE;
-
-  /* Succeeded, save the handle and away we go */
-  info->handle.ems_handle = ctx.dx;
-  info->read_backing_store = read_ems_store;
-  info->write_backing_store = write_ems_store;
-  info->close_backing_store = close_ems_store;
-  TRACEMS1(cinfo, 1, JTRC_EMS_OPEN, ctx.dx);
-  return TRUE;			/* succeeded */
-}
-
-#endif /* EMS_SUPPORTED */
-
-
-/*
- * Initial opening of a backing-store object.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-			 long total_bytes_needed)
-{
-  /* Try extended memory, then expanded memory, then regular file. */
-#if XMS_SUPPORTED
-  if (open_xms_store(cinfo, info, total_bytes_needed))
-    return;
-#endif
-#if EMS_SUPPORTED
-  if (open_ems_store(cinfo, info, total_bytes_needed))
-    return;
-#endif
-  if (open_file_store(cinfo, info, total_bytes_needed))
-    return;
-  ERREXITS(cinfo, JERR_TFILE_CREATE, "");
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
-  next_file_num = 0;		/* initialize temp file name generator */
-  return DEFAULT_MAX_MEM;	/* default for max_memory_to_use */
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
-  /* Microsoft C, at least in v6.00A, will not successfully reclaim freed
-   * blocks of size > 32Kbytes unless we give it a kick in the rear, like so:
-   */
-#ifdef NEED_FHEAPMIN
-  _fheapmin();
-#endif
-}
diff --git a/jpeg/jmemdosa.asm b/jpeg/jmemdosa.asm
deleted file mode 100644
index ecd43729fe5e..000000000000
--- a/jpeg/jmemdosa.asm
+++ /dev/null
@@ -1,379 +0,0 @@
-;
-; jmemdosa.asm
-;
-; Copyright (C) 1992, Thomas G. Lane.
-; This file is part of the Independent JPEG Group's software.
-; For conditions of distribution and use, see the accompanying README file.
-;
-; This file contains low-level interface routines to support the MS-DOS
-; backing store manager (jmemdos.c).  Routines are provided to access disk
-; files through direct DOS calls, and to access XMS and EMS drivers.
-;
-; This file should assemble with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).  If you haven't got
-; a compatible assembler, better fall back to jmemansi.c or jmemname.c.
-;
-; To minimize dependence on the C compiler's register usage conventions,
-; we save and restore all 8086 registers, even though most compilers only
-; require SI,DI,DS to be preserved.  Also, we use only 16-bit-wide return
-; values, which everybody returns in AX.
-;
-; Based on code contributed by Ge' Weijers.
-;
-
-JMEMDOSA_TXT	segment byte public 'CODE'
-
-		assume	cs:JMEMDOSA_TXT
-
-		public	_jdos_open
-		public	_jdos_close
-		public	_jdos_seek
-		public	_jdos_read
-		public	_jdos_write
-		public	_jxms_getdriver
-		public	_jxms_calldriver
-		public	_jems_available
-		public	_jems_calldriver
-
-;
-; short far jdos_open (short far * handle, char far * filename)
-;
-; Create and open a temporary file
-;
-_jdos_open	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	cx,0			; normal file attributes
-		lds	dx,dword ptr [bp+10]	; get filename pointer
-		mov	ah,3ch			; create file
-		int	21h
-		jc	open_err		; if failed, return error code
-		lds	bx,dword ptr [bp+6]	; get handle pointer
-		mov	word ptr [bx],ax	; save the handle
-		xor	ax,ax			; return zero for OK
-open_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_open	endp
-
-
-;
-; short far jdos_close (short handle)
-;
-; Close the file handle
-;
-_jdos_close	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	bx,word ptr [bp+6]	; file handle
-		mov	ah,3eh			; close file
-		int	21h
-		jc	close_err		; if failed, return error code
-		xor	ax,ax			; return zero for OK
-close_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_close	endp
-
-
-;
-; short far jdos_seek (short handle, long offset)
-;
-; Set file position
-;
-_jdos_seek	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	bx,word ptr [bp+6]	; file handle
-		mov	dx,word ptr [bp+8]	; LS offset
-		mov	cx,word ptr [bp+10]	; MS offset
-		mov	ax,4200h		; absolute seek
-		int	21h
-		jc	seek_err		; if failed, return error code
-		xor	ax,ax			; return zero for OK
-seek_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_seek	endp
-
-
-;
-; short far jdos_read (short handle, void far * buffer, unsigned short count)
-;
-; Read from file
-;
-_jdos_read	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	bx,word ptr [bp+6]	; file handle
-		lds	dx,dword ptr [bp+8]	; buffer address
-		mov	cx,word ptr [bp+12]	; number of bytes
-		mov	ah,3fh			; read file
-		int	21h
-		jc	read_err		; if failed, return error code
-		cmp	ax,word ptr [bp+12]	; make sure all bytes were read
-		je	read_ok
-		mov	ax,1			; else return 1 for not OK
-		jmp	short read_err
-read_ok:	xor	ax,ax			; return zero for OK
-read_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_read	endp
-
-
-;
-; short far jdos_write (short handle, void far * buffer, unsigned short count)
-;
-; Write to file
-;
-_jdos_write	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	bx,word ptr [bp+6]	; file handle
-		lds	dx,dword ptr [bp+8]	; buffer address
-		mov	cx,word ptr [bp+12]	; number of bytes
-		mov	ah,40h			; write file
-		int	21h
-		jc	write_err		; if failed, return error code
-		cmp	ax,word ptr [bp+12]	; make sure all bytes written
-		je	write_ok
-		mov	ax,1			; else return 1 for not OK
-		jmp	short write_err
-write_ok:	xor	ax,ax			; return zero for OK
-write_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_write	endp
-
-
-;
-; void far jxms_getdriver (XMSDRIVER far *)
-;
-; Get the address of the XMS driver, or NULL if not available
-;
-_jxms_getdriver	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov 	ax,4300h		; call multiplex interrupt with
-		int	2fh			; a magic cookie, hex 4300
-		cmp 	al,80h			; AL should contain hex 80
-		je	xmsavail
-		xor 	dx,dx			; no XMS driver available
-		xor 	ax,ax			; return a nil pointer
-		jmp	short xmsavail_done
-xmsavail:	mov 	ax,4310h		; fetch driver address with
-		int	2fh			; another magic cookie
-		mov 	dx,es			; copy address to dx:ax
-		mov 	ax,bx
-xmsavail_done:	les 	bx,dword ptr [bp+6]	; get pointer to return value
-		mov	word ptr es:[bx],ax
-		mov	word ptr es:[bx+2],dx
-		pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop	bp
-		ret
-_jxms_getdriver	endp
-
-
-;
-; void far jxms_calldriver (XMSDRIVER, XMScontext far *)
-;
-; The XMScontext structure contains values for the AX,DX,BX,SI,DS registers.
-; These are loaded, the XMS call is performed, and the new values of the
-; AX,DX,BX registers are written back to the context structure.
-;
-_jxms_calldriver 	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		les 	bx,dword ptr [bp+10]	; get XMScontext pointer
-		mov 	ax,word ptr es:[bx]	; load registers
-		mov 	dx,word ptr es:[bx+2]
-		mov 	si,word ptr es:[bx+6]
-		mov 	ds,word ptr es:[bx+8]
-		mov 	bx,word ptr es:[bx+4]
-		call	dword ptr [bp+6]	; call the driver
-		mov	cx,bx			; save returned BX for a sec
-		les 	bx,dword ptr [bp+10]	; get XMScontext pointer
-		mov 	word ptr es:[bx],ax	; put back ax,dx,bx
-		mov 	word ptr es:[bx+2],dx
-		mov 	word ptr es:[bx+4],cx
-		pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jxms_calldriver 	endp
-
-
-;
-; short far jems_available (void)
-;
-; Have we got an EMS driver? (this comes straight from the EMS 4.0 specs)
-;
-_jems_available	proc	far
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	ax,3567h		; get interrupt vector 67h
-		int	21h
-		push	cs
-		pop	ds
-		mov	di,000ah		; check offs 10 in returned seg
-		lea	si,ASCII_device_name	; against literal string
-		mov	cx,8
-		cld
-		repe cmpsb
-		jne	no_ems
-		mov	ax,1			; match, it's there
-		jmp	short avail_done
-no_ems:		xor	ax,ax			; it's not there
-avail_done:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		ret
-
-ASCII_device_name	db	"EMMXXXX0"
-
-_jems_available	endp
-
-
-;
-; void far jems_calldriver (EMScontext far *)
-;
-; The EMScontext structure contains values for the AX,DX,BX,SI,DS registers.
-; These are loaded, the EMS trap is performed, and the new values of the
-; AX,DX,BX registers are written back to the context structure.
-;
-_jems_calldriver	proc far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		les 	bx,dword ptr [bp+6]	; get EMScontext pointer
-		mov 	ax,word ptr es:[bx]	; load registers
-		mov 	dx,word ptr es:[bx+2]
-		mov 	si,word ptr es:[bx+6]
-		mov 	ds,word ptr es:[bx+8]
-		mov 	bx,word ptr es:[bx+4]
-		int	67h			; call the EMS driver
-		mov	cx,bx			; save returned BX for a sec
-		les 	bx,dword ptr [bp+6]	; get EMScontext pointer
-		mov 	word ptr es:[bx],ax	; put back ax,dx,bx
-		mov 	word ptr es:[bx+2],dx
-		mov 	word ptr es:[bx+4],cx
-		pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jems_calldriver	endp
-
-JMEMDOSA_TXT	ends
-
-		end
diff --git a/jpeg/jmemmgr.c b/jpeg/jmemmgr.c
index d801b322da05..54589521a625 100644
--- a/jpeg/jmemmgr.c
+++ b/jpeg/jmemmgr.c
@@ -57,22 +57,25 @@ extern char * getenv JPP((const char * name));
  * requirement, and we had better do so too.
  * There isn't any really portable way to determine the worst-case alignment
  * requirement.  This module assumes that the alignment requirement is
- * multiples of sizeof(ALIGN_TYPE).
- * By default, we define ALIGN_TYPE as double.  This is necessary on some
+ * multiples of ALIGN_SIZE.
+ * By default, we define ALIGN_SIZE as sizeof(double).  This is necessary on some
  * workstations (where doubles really do need 8-byte alignment) and will work
  * fine on nearly everything.  If your machine has lesser alignment needs,
- * you can save a few bytes by making ALIGN_TYPE smaller.
+ * you can save a few bytes by making ALIGN_SIZE smaller.
  * The only place I know of where this will NOT work is certain Macintosh
  * 680x0 compilers that define double as a 10-byte IEEE extended float.
  * Doing 10-byte alignment is counterproductive because longwords won't be
- * aligned well.  Put "#define ALIGN_TYPE long" in jconfig.h if you have
+ * aligned well.  Put "#define ALIGN_SIZE 4" in jconfig.h if you have
  * such a compiler.
  */
 
-#ifndef ALIGN_TYPE		/* so can override from jconfig.h */
-#define ALIGN_TYPE  double
+#ifndef ALIGN_SIZE		/* so can override from jconfig.h */
+#ifndef WITH_SIMD
+#define ALIGN_SIZE  SIZEOF(double)
+#else
+#define ALIGN_SIZE  16 /* Most SIMD implementations require this */
+#endif
 #endif
-
 
 /*
  * We allocate objects from "pools", where each pool is gotten with a single
@@ -81,34 +84,24 @@ extern char * getenv JPP((const char * name));
  * header with a link to the next pool of the same class.
  * Small and large pool headers are identical except that the latter's
  * link pointer must be FAR on 80x86 machines.
- * Notice that the "real" header fields are union'ed with a dummy ALIGN_TYPE
- * field.  This forces the compiler to make SIZEOF(small_pool_hdr) a multiple
- * of the alignment requirement of ALIGN_TYPE.
  */
 
-typedef union small_pool_struct * small_pool_ptr;
+typedef struct small_pool_struct * small_pool_ptr;
 
-typedef union small_pool_struct {
-  struct {
-    small_pool_ptr next;	/* next in list of pools */
-    size_t bytes_used;		/* how many bytes already used within pool */
-    size_t bytes_left;		/* bytes still available in this pool */
-  } hdr;
-  ALIGN_TYPE dummy;		/* included in union to ensure alignment */
+typedef struct small_pool_struct {
+  small_pool_ptr next;	/* next in list of pools */
+  size_t bytes_used;		/* how many bytes already used within pool */
+  size_t bytes_left;		/* bytes still available in this pool */
 } small_pool_hdr;
 
-typedef union large_pool_struct FAR * large_pool_ptr;
+typedef struct large_pool_struct FAR * large_pool_ptr;
 
-typedef union large_pool_struct {
-  struct {
-    large_pool_ptr next;	/* next in list of pools */
-    size_t bytes_used;		/* how many bytes already used within pool */
-    size_t bytes_left;		/* bytes still available in this pool */
-  } hdr;
-  ALIGN_TYPE dummy;		/* included in union to ensure alignment */
+typedef struct large_pool_struct {
+  large_pool_ptr next;	/* next in list of pools */
+  size_t bytes_used;		/* how many bytes already used within pool */
+  size_t bytes_left;		/* bytes still available in this pool */
 } large_pool_hdr;
 
-
 /*
  * Here is the full definition of a memory manager object.
  */
@@ -129,7 +122,7 @@ typedef struct {
   jvirt_barray_ptr virt_barray_list;
 
   /* This counts total space obtained from jpeg_get_small/large */
-  long total_space_allocated;
+  size_t total_space_allocated;
 
   /* alloc_sarray and alloc_barray set this value for use by virtual
    * array routines.
@@ -197,16 +190,16 @@ print_mem_stats (j_common_ptr cinfo, int pool_id)
 	  pool_id, mem->total_space_allocated);
 
   for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL;
-       lhdr_ptr = lhdr_ptr->hdr.next) {
+       lhdr_ptr = lhdr_ptr->next) {
     fprintf(stderr, "  Large chunk used %ld\n",
-	    (long) lhdr_ptr->hdr.bytes_used);
+	    (long) lhdr_ptr->bytes_used);
   }
 
   for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL;
-       shdr_ptr = shdr_ptr->hdr.next) {
+       shdr_ptr = shdr_ptr->next) {
     fprintf(stderr, "  Small chunk used %ld free %ld\n",
-	    (long) shdr_ptr->hdr.bytes_used,
-	    (long) shdr_ptr->hdr.bytes_left);
+	    (long) shdr_ptr->bytes_used,
+	    (long) shdr_ptr->bytes_left);
   }
 }
 
@@ -236,6 +229,10 @@ out_of_memory (j_common_ptr cinfo, int which)
  * and we also distinguish the first pool of a class from later ones.
  * NOTE: the values given work fairly well on both 16- and 32-bit-int
  * machines, but may be too small if longs are 64 bits or more.
+ *
+ * Since we do not know what alignment malloc() gives us, we have to
+ * allocate ALIGN_SIZE-1 extra space per pool to have room for alignment
+ * adjustment.
  */
 
 static const size_t first_pool_slop[JPOOL_NUMPOOLS] = 
@@ -260,33 +257,36 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
   small_pool_ptr hdr_ptr, prev_hdr_ptr;
   char * data_ptr;
-  size_t odd_bytes, min_request, slop;
+  size_t min_request, slop;
+
+  /*
+   * Round up the requested size to a multiple of ALIGN_SIZE so that the
+   * next object allocated from the same pool is also aligned, and so
+   * that algorithms may safely access memory past the end of the
+   * object, up to the next alignment boundary.
+   */
+  sizeofobject = jround_up(sizeofobject, ALIGN_SIZE);
 
   /* Check for unsatisfiable request (do now to ensure no overflow below) */
-  if (sizeofobject > (size_t) (MAX_ALLOC_CHUNK-SIZEOF(small_pool_hdr)))
+  if ((SIZEOF(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > MAX_ALLOC_CHUNK)
     out_of_memory(cinfo, 1);	/* request exceeds malloc's ability */
 
-  /* Round up the requested size to a multiple of SIZEOF(ALIGN_TYPE) */
-  odd_bytes = sizeofobject % SIZEOF(ALIGN_TYPE);
-  if (odd_bytes > 0)
-    sizeofobject += SIZEOF(ALIGN_TYPE) - odd_bytes;
-
   /* See if space is available in any existing pool */
   if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
     ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id);	/* safety check */
   prev_hdr_ptr = NULL;
   hdr_ptr = mem->small_list[pool_id];
   while (hdr_ptr != NULL) {
-    if (hdr_ptr->hdr.bytes_left >= sizeofobject)
+    if (hdr_ptr->bytes_left >= sizeofobject)
       break;			/* found pool with enough space */
     prev_hdr_ptr = hdr_ptr;
-    hdr_ptr = hdr_ptr->hdr.next;
+    hdr_ptr = hdr_ptr->next;
   }
 
   /* Time to make a new pool? */
   if (hdr_ptr == NULL) {
     /* min_request is what we need now, slop is what will be leftover */
-    min_request = sizeofobject + SIZEOF(small_pool_hdr);
+    min_request = SIZEOF(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1;
     if (prev_hdr_ptr == NULL)	/* first pool in class? */
       slop = first_pool_slop[pool_id];
     else
@@ -305,20 +305,23 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
     }
     mem->total_space_allocated += min_request + slop;
     /* Success, initialize the new pool header and add to end of list */
-    hdr_ptr->hdr.next = NULL;
-    hdr_ptr->hdr.bytes_used = 0;
-    hdr_ptr->hdr.bytes_left = sizeofobject + slop;
+    hdr_ptr->next = NULL;
+    hdr_ptr->bytes_used = 0;
+    hdr_ptr->bytes_left = sizeofobject + slop;
     if (prev_hdr_ptr == NULL)	/* first pool in class? */
       mem->small_list[pool_id] = hdr_ptr;
     else
-      prev_hdr_ptr->hdr.next = hdr_ptr;
+      prev_hdr_ptr->next = hdr_ptr;
   }
 
   /* OK, allocate the object from the current pool */
-  data_ptr = (char *) (hdr_ptr + 1); /* point to first data byte in pool */
-  data_ptr += hdr_ptr->hdr.bytes_used; /* point to place for object */
-  hdr_ptr->hdr.bytes_used += sizeofobject;
-  hdr_ptr->hdr.bytes_left -= sizeofobject;
+  data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+  data_ptr += SIZEOF(small_pool_hdr); /* ...by skipping the header... */
+  if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
+    data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
+  data_ptr += hdr_ptr->bytes_used; /* point to place for object */
+  hdr_ptr->bytes_used += sizeofobject;
+  hdr_ptr->bytes_left -= sizeofobject;
 
   return (void *) data_ptr;
 }
@@ -344,37 +347,45 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
   large_pool_ptr hdr_ptr;
-  size_t odd_bytes;
+  char FAR * data_ptr;
+
+  /*
+   * Round up the requested size to a multiple of ALIGN_SIZE so that
+   * algorithms may safely access memory past the end of the object,
+   * up to the next alignment boundary.
+   */
+  sizeofobject = jround_up(sizeofobject, ALIGN_SIZE);
 
   /* Check for unsatisfiable request (do now to ensure no overflow below) */
-  if (sizeofobject > (size_t) (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)))
+  if ((SIZEOF(large_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > MAX_ALLOC_CHUNK)
     out_of_memory(cinfo, 3);	/* request exceeds malloc's ability */
 
-  /* Round up the requested size to a multiple of SIZEOF(ALIGN_TYPE) */
-  odd_bytes = sizeofobject % SIZEOF(ALIGN_TYPE);
-  if (odd_bytes > 0)
-    sizeofobject += SIZEOF(ALIGN_TYPE) - odd_bytes;
-
   /* Always make a new pool */
   if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
     ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id);	/* safety check */
 
   hdr_ptr = (large_pool_ptr) jpeg_get_large(cinfo, sizeofobject +
-					    SIZEOF(large_pool_hdr));
+					    SIZEOF(large_pool_hdr) +
+					    ALIGN_SIZE - 1);
   if (hdr_ptr == NULL)
     out_of_memory(cinfo, 4);	/* jpeg_get_large failed */
-  mem->total_space_allocated += sizeofobject + SIZEOF(large_pool_hdr);
+  mem->total_space_allocated += sizeofobject + SIZEOF(large_pool_hdr) + ALIGN_SIZE - 1;
 
   /* Success, initialize the new pool header and add to list */
-  hdr_ptr->hdr.next = mem->large_list[pool_id];
+  hdr_ptr->next = mem->large_list[pool_id];
   /* We maintain space counts in each pool header for statistical purposes,
    * even though they are not needed for allocation.
    */
-  hdr_ptr->hdr.bytes_used = sizeofobject;
-  hdr_ptr->hdr.bytes_left = 0;
+  hdr_ptr->bytes_used = sizeofobject;
+  hdr_ptr->bytes_left = 0;
   mem->large_list[pool_id] = hdr_ptr;
 
-  return (void FAR *) (hdr_ptr + 1); /* point to first data byte in pool */
+  data_ptr = (char FAR *) hdr_ptr; /* keep FAR: large pools live in far memory */
+  data_ptr += SIZEOF(large_pool_hdr); /* skip this pool's (large) header... */
+  if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
+    data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
+
+  return (void FAR *) data_ptr;
 }
 
 
@@ -389,6 +400,10 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
  * this chunking of rows.  The rowsperchunk value is left in the mem manager
  * object so that it can be saved away if this sarray is the workspace for
  * a virtual array.
+ *
+ * Since we are often upsampling with a factor 2, we align the size (not
+ * the start) to 2 * ALIGN_SIZE so that the upsampling routines don't have
+ * to be as careful about size.
  */
 
 METHODDEF(JSAMPARRAY)
@@ -402,6 +417,11 @@ alloc_sarray (j_common_ptr cinfo, int pool_id,
   JDIMENSION rowsperchunk, currow, i;
   long ltemp;
 
+  /* Make sure each row is properly aligned */
+  if ((ALIGN_SIZE % SIZEOF(JSAMPLE)) != 0)
+    out_of_memory(cinfo, 5);	/* safety check */
+  samplesperrow = (JDIMENSION)jround_up(samplesperrow, (2 * ALIGN_SIZE) / SIZEOF(JSAMPLE));
+
   /* Calculate max # of rows allowed in one allocation chunk */
   ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
 	  ((long) samplesperrow * SIZEOF(JSAMPLE));
@@ -450,6 +470,10 @@ alloc_barray (j_common_ptr cinfo, int pool_id,
   JDIMENSION rowsperchunk, currow, i;
   long ltemp;
 
+  /* Make sure each row is properly aligned */
+  if ((SIZEOF(JBLOCK) % ALIGN_SIZE) != 0)
+    out_of_memory(cinfo, 6);	/* safety check */
+
   /* Calculate max # of rows allowed in one allocation chunk */
   ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
 	  ((long) blocksperrow * SIZEOF(JBLOCK));
@@ -584,8 +608,8 @@ realize_virt_arrays (j_common_ptr cinfo)
 /* Allocate the in-memory buffers for any unrealized virtual arrays */
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
-  long space_per_minheight, maximum_space, avail_mem;
-  long minheights, max_minheights;
+  size_t space_per_minheight, maximum_space, avail_mem;
+  size_t minheights, max_minheights;
   jvirt_sarray_ptr sptr;
   jvirt_barray_ptr bptr;
 
@@ -968,9 +992,9 @@ free_pool (j_common_ptr cinfo, int pool_id)
   mem->large_list[pool_id] = NULL;
 
   while (lhdr_ptr != NULL) {
-    large_pool_ptr next_lhdr_ptr = lhdr_ptr->hdr.next;
-    space_freed = lhdr_ptr->hdr.bytes_used +
-		  lhdr_ptr->hdr.bytes_left +
-		  SIZEOF(large_pool_hdr);
+    large_pool_ptr next_lhdr_ptr = lhdr_ptr->next;
+    space_freed = lhdr_ptr->bytes_used +
+		  lhdr_ptr->bytes_left +
+		  SIZEOF(large_pool_hdr) + ALIGN_SIZE - 1; /* same size alloc_large requested */
     jpeg_free_large(cinfo, (void FAR *) lhdr_ptr, space_freed);
     mem->total_space_allocated -= space_freed;
@@ -982,9 +1006,9 @@ free_pool (j_common_ptr cinfo, int pool_id)
   mem->small_list[pool_id] = NULL;
 
   while (shdr_ptr != NULL) {
-    small_pool_ptr next_shdr_ptr = shdr_ptr->hdr.next;
-    space_freed = shdr_ptr->hdr.bytes_used +
-		  shdr_ptr->hdr.bytes_left +
-		  SIZEOF(small_pool_hdr);
+    small_pool_ptr next_shdr_ptr = shdr_ptr->next;
+    space_freed = shdr_ptr->bytes_used +
+		  shdr_ptr->bytes_left +
+		  SIZEOF(small_pool_hdr) + ALIGN_SIZE - 1; /* same size alloc_small requested */
     jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed);
     mem->total_space_allocated -= space_freed;
@@ -1041,16 +1065,16 @@ jinit_memory_mgr (j_common_ptr cinfo)
    * in common if and only if X is a power of 2, ie has only one one-bit.
    * Some compilers may give an "unreachable code" warning here; ignore it.
    */
-  if ((SIZEOF(ALIGN_TYPE) & (SIZEOF(ALIGN_TYPE)-1)) != 0)
+  if ((ALIGN_SIZE & (ALIGN_SIZE-1)) != 0)
     ERREXIT(cinfo, JERR_BAD_ALIGN_TYPE);
   /* MAX_ALLOC_CHUNK must be representable as type size_t, and must be
-   * a multiple of SIZEOF(ALIGN_TYPE).
+   * a multiple of ALIGN_SIZE.
    * Again, an "unreachable code" warning may be ignored here.
    * But a "constant too large" warning means you need to fix MAX_ALLOC_CHUNK.
    */
   test_mac = (size_t) MAX_ALLOC_CHUNK;
   if ((long) test_mac != MAX_ALLOC_CHUNK ||
-      (MAX_ALLOC_CHUNK % SIZEOF(ALIGN_TYPE)) != 0)
+      (MAX_ALLOC_CHUNK % ALIGN_SIZE) != 0)
     ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
 
   max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */
diff --git a/jpeg/jmemname.c b/jpeg/jmemname.c
deleted file mode 100644
index ed96dee1bc8d..000000000000
--- a/jpeg/jmemname.c
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * jmemname.c
- *
- * Copyright (C) 1992-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file provides a generic implementation of the system-dependent
- * portion of the JPEG memory manager.  This implementation assumes that
- * you must explicitly construct a name for each temp file.
- * Also, the problem of determining the amount of memory available
- * is shoved onto the user.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h"		/* import the system-dependent declarations */
-
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
-#endif
-
-#ifndef SEEK_SET		/* pre-ANSI systems may not define this; */
-#define SEEK_SET  0		/* if not, assume 0 is correct */
-#endif
-
-#ifdef DONT_USE_B_MODE		/* define mode parameters for fopen() */
-#define READ_BINARY	"r"
-#define RW_BINARY	"w+"
-#else
-#ifdef VMS			/* VMS is very nonstandard */
-#define READ_BINARY	"rb", "ctx=stm"
-#define RW_BINARY	"w+b", "ctx=stm"
-#else				/* standard ANSI-compliant case */
-#define READ_BINARY	"rb"
-#define RW_BINARY	"w+b"
-#endif
-#endif
-
-
-/*
- * Selection of a file name for a temporary file.
- * This is system-dependent!
- *
- * The code as given is suitable for most Unix systems, and it is easily
- * modified for most non-Unix systems.  Some notes:
- *  1.  The temp file is created in the directory named by TEMP_DIRECTORY.
- *      The default value is /usr/tmp, which is the conventional place for
- *      creating large temp files on Unix.  On other systems you'll probably
- *      want to change the file location.  You can do this by editing the
- *      #define, or (preferred) by defining TEMP_DIRECTORY in jconfig.h.
- *
- *  2.  If you need to change the file name as well as its location,
- *      you can override the TEMP_FILE_NAME macro.  (Note that this is
- *      actually a printf format string; it must contain %s and %d.)
- *      Few people should need to do this.
- *
- *  3.  mktemp() is used to ensure that multiple processes running
- *      simultaneously won't select the same file names.  If your system
- *      doesn't have mktemp(), define NO_MKTEMP to do it the hard way.
- *      (If you don't have <errno.h>, also define NO_ERRNO_H.)
- *
- *  4.  You probably want to define NEED_SIGNAL_CATCHER so that cjpeg.c/djpeg.c
- *      will cause the temp files to be removed if you stop the program early.
- */
-
-#ifndef TEMP_DIRECTORY		/* can override from jconfig.h or Makefile */
-#define TEMP_DIRECTORY  "/usr/tmp/" /* recommended setting for Unix */
-#endif
-
-static int next_file_num;	/* to distinguish among several temp files */
-
-#ifdef NO_MKTEMP
-
-#ifndef TEMP_FILE_NAME		/* can override from jconfig.h or Makefile */
-#define TEMP_FILE_NAME  "%sJPG%03d.TMP"
-#endif
-
-#ifndef NO_ERRNO_H
-#include <errno.h>		/* to define ENOENT */
-#endif
-
-/* ANSI C specifies that errno is a macro, but on older systems it's more
- * likely to be a plain int variable.  And not all versions of errno.h
- * bother to declare it, so we have to in order to be most portable.  Thus:
- */
-#ifndef errno
-extern int errno;
-#endif
-
-
-LOCAL(void)
-select_file_name (char * fname)
-{
-  FILE * tfile;
-
-  /* Keep generating file names till we find one that's not in use */
-  for (;;) {
-    next_file_num++;		/* advance counter */
-    sprintf(fname, TEMP_FILE_NAME, TEMP_DIRECTORY, next_file_num);
-    if ((tfile = fopen(fname, READ_BINARY)) == NULL) {
-      /* fopen could have failed for a reason other than the file not
-       * being there; for example, file there but unreadable.
-       * If <errno.h> isn't available, then we cannot test the cause.
-       */
-#ifdef ENOENT
-      if (errno != ENOENT)
-	continue;
-#endif
-      break;
-    }
-    fclose(tfile);		/* oops, it's there; close tfile & try again */
-  }
-}
-
-#else /* ! NO_MKTEMP */
-
-/* Note that mktemp() requires the initial filename to end in six X's */
-#ifndef TEMP_FILE_NAME		/* can override from jconfig.h or Makefile */
-#define TEMP_FILE_NAME  "%sJPG%dXXXXXX"
-#endif
-
-LOCAL(void)
-select_file_name (char * fname)
-{
-  next_file_num++;		/* advance counter */
-  sprintf(fname, TEMP_FILE_NAME, TEMP_DIRECTORY, next_file_num);
-  mktemp(fname);		/* make sure file name is unique */
-  /* mktemp replaces the trailing XXXXXX with a unique string of characters */
-}
-
-#endif /* NO_MKTEMP */
-
-
-/*
- * Memory allocation and freeing are controlled by the regular library
- * routines malloc() and free().
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * "Large" objects are treated the same as "small" ones.
- * NB: although we include FAR keywords in the routine declarations,
- * this file won't actually work in 80x86 small/medium model; at least,
- * you probably won't be able to process useful-size images in only 64KB.
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void FAR *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- * It's impossible to do this in a portable way; our current solution is
- * to make the user tell us (with a default value set at compile time).
- * If you can actually get the available space, it's a good idea to subtract
- * a slop factor of 5% or so.
- */
-
-#ifndef DEFAULT_MAX_MEM		/* so can override from makefile */
-#define DEFAULT_MAX_MEM		1000000L /* default: one megabyte */
-#endif
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
-		    long max_bytes_needed, long already_allocated)
-{
-  return cinfo->mem->max_memory_to_use - already_allocated;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed.  You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-
-METHODDEF(void)
-read_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		    void FAR * buffer_address,
-		    long file_offset, long byte_count)
-{
-  if (fseek(info->temp_file, file_offset, SEEK_SET))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  if (JFREAD(info->temp_file, buffer_address, byte_count)
-      != (size_t) byte_count)
-    ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		     void FAR * buffer_address,
-		     long file_offset, long byte_count)
-{
-  if (fseek(info->temp_file, file_offset, SEEK_SET))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  if (JFWRITE(info->temp_file, buffer_address, byte_count)
-      != (size_t) byte_count)
-    ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_backing_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  fclose(info->temp_file);	/* close the file */
-  unlink(info->temp_name);	/* delete the file */
-/* If your system doesn't have unlink(), use remove() instead.
- * remove() is the ANSI-standard name for this function, but if
- * your system was ANSI you'd be using jmemansi.c, right?
- */
-  TRACEMSS(cinfo, 1, JTRC_TFILE_CLOSE, info->temp_name);
-}
-
-
-/*
- * Initial opening of a backing-store object.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-			 long total_bytes_needed)
-{
-  select_file_name(info->temp_name);
-  if ((info->temp_file = fopen(info->temp_name, RW_BINARY)) == NULL)
-    ERREXITS(cinfo, JERR_TFILE_CREATE, info->temp_name);
-  info->read_backing_store = read_backing_store;
-  info->write_backing_store = write_backing_store;
-  info->close_backing_store = close_backing_store;
-  TRACEMSS(cinfo, 1, JTRC_TFILE_OPEN, info->temp_name);
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
-  next_file_num = 0;		/* initialize temp file name generator */
-  return DEFAULT_MAX_MEM;	/* default for max_memory_to_use */
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
-  /* no work */
-}
diff --git a/jpeg/jmemnobs.c b/jpeg/jmemnobs.c
index eb8c337725fd..34b189563621 100644
--- a/jpeg/jmemnobs.c
+++ b/jpeg/jmemnobs.c
@@ -69,9 +69,9 @@ jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
  * Here we always say, "we got all you want bud!"
  */
 
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
-		    long max_bytes_needed, long already_allocated)
+GLOBAL(size_t)
+jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
+		    size_t max_bytes_needed, size_t already_allocated)
 {
   return max_bytes_needed;
 }
diff --git a/jpeg/jmemsys.h b/jpeg/jmemsys.h
index 6c3c6d348f2c..b190945963b0 100644
--- a/jpeg/jmemsys.h
+++ b/jpeg/jmemsys.h
@@ -100,10 +100,10 @@ EXTERN(void) jpeg_free_large JPP((j_common_ptr cinfo, void FAR * object,
  * Conversely, zero may be returned to always use the minimum amount of memory.
  */
 
-EXTERN(long) jpeg_mem_available JPP((j_common_ptr cinfo,
-				     long min_bytes_needed,
-				     long max_bytes_needed,
-				     long already_allocated));
+EXTERN(size_t) jpeg_mem_available JPP((j_common_ptr cinfo,
+				     size_t min_bytes_needed,
+				     size_t max_bytes_needed,
+				     size_t already_allocated));
 
 
 /*
diff --git a/jpeg/jmorecfg.h b/jpeg/jmorecfg.h
index de6d39318400..c91399732dd3 100644
--- a/jpeg/jmorecfg.h
+++ b/jpeg/jmorecfg.h
@@ -2,6 +2,7 @@
  * jmorecfg.h
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -10,13 +11,7 @@
  * optimizations.  Most users will not need to touch this file.
  */
 
-/*
- * This file has been modified for the Mozilla/Netscape environment.
- * Modifications are distributed under the mozilla.org tri-license and are
- * Copyright (C) 1998 Netscape Communications Corporation.  All Rights
- * Reserved. See http://www.mozilla.org/MPL/
- */
-
+#include "prtypes.h"
 
 /*
  * Define BITS_IN_JSAMPLE as either
@@ -69,11 +64,11 @@ typedef unsigned char JSAMPLE;
 #else /* not HAVE_UNSIGNED_CHAR */
 
 typedef char JSAMPLE;
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
 #define GETJSAMPLE(value)  ((int) (value))
 #else
 #define GETJSAMPLE(value)  ((int) (value) & 0xFF)
-#endif /* CHAR_IS_UNSIGNED */
+#endif /* __CHAR_UNSIGNED__ */
 
 #endif /* HAVE_UNSIGNED_CHAR */
 
@@ -105,24 +100,6 @@ typedef short JSAMPLE;
 
 typedef short JCOEF;
 
-/* Defines for MMX/SSE2 support. */
-
-#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
-#define HAVE_MMX_INTEL_MNEMONICS 
-
-/* SSE2 code appears broken for some cpus (bug 247437) */
-#define HAVE_SSE2_INTEL_MNEMONICS
-#define HAVE_SSE2_INTRINSICS
-#endif
-
-#if defined(__GNUC__) && defined(__i386__)
-#if defined(XP_MACOSX)
-#define HAVE_SSE2_INTRINSICS
-#endif /* ! XP_MACOSX */
-#endif /* ! GNUC && i386 */
-
-/* Add support for other platforms here */
-
 
 /* Compressed datastreams are represented as arrays of JOCTET.
  * These must be EXACTLY 8 bits wide, at least once they are written to
@@ -138,11 +115,11 @@ typedef unsigned char JOCTET;
 #else /* not HAVE_UNSIGNED_CHAR */
 
 typedef char JOCTET;
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
 #define GETJOCTET(value)  (value)
 #else
 #define GETJOCTET(value)  ((value) & 0xFF)
-#endif /* CHAR_IS_UNSIGNED */
+#endif /* __CHAR_UNSIGNED__ */
 
 #endif /* HAVE_UNSIGNED_CHAR */
 
@@ -156,39 +133,19 @@ typedef char JOCTET;
 
 /* UINT8 must hold at least the values 0..255. */
 
-#ifdef HAVE_UNSIGNED_CHAR
-typedef unsigned char UINT8;
-#else /* not HAVE_UNSIGNED_CHAR */
-#ifdef CHAR_IS_UNSIGNED
-typedef char UINT8;
-#else /* not CHAR_IS_UNSIGNED */
-typedef short UINT8;
-#endif /* CHAR_IS_UNSIGNED */
-#endif /* HAVE_UNSIGNED_CHAR */
+typedef PRUint8 UINT8;
 
 /* UINT16 must hold at least the values 0..65535. */
 
-#ifdef HAVE_UNSIGNED_SHORT
-typedef unsigned short UINT16;
-#else /* not HAVE_UNSIGNED_SHORT */
-typedef unsigned int UINT16;
-#endif /* HAVE_UNSIGNED_SHORT */
+typedef PRUint16 UINT16;
 
 /* INT16 must hold at least the values -32768..32767. */
 
-#ifndef XMD_H			/* X11/xmd.h correctly defines INT16 */
-typedef short INT16;
-#endif
+typedef PRInt16 INT16;
 
 /* INT32 must hold at least signed 32-bit values. */
 
-#ifndef XMD_H			/* X11/xmd.h correctly defines INT32 */
-#ifndef _BASETSD_H_		/* basetsd.h correctly defines INT32 */
-#ifndef _BASETSD_H
-typedef long INT32;
-#endif
-#endif
-#endif
+typedef PRInt32 INT32;
 
 /* Datatype used for image dimensions.  The JPEG standard only supports
  * images up to 64K*64K due to 16-bit fields in SOF markers.  Therefore
@@ -209,21 +166,14 @@ typedef unsigned int JDIMENSION;
  * or code profilers that require it.
  */
 
-/* Mozilla mod: make external functions be DLL-able via JRI_PUBLIC_API(),
- * and supply extern "C" for C++ users of the C-compiled IJG library.
- * (Well, not anymore, but there's still a modification here.)
- */
-#include "prtypes.h"
-
 /* a function called through method pointers: */
 #define METHODDEF(type)		static type
 /* a function used only in its module: */
 #define LOCAL(type)		static type
-
-PR_BEGIN_EXTERN_C
-#define GLOBAL(type) type
-#define EXTERN(type) extern type
-PR_END_EXTERN_C
+/* a function referenced thru EXTERNs: */
+#define GLOBAL(type)		type
+/* a reference to a GLOBAL function: */
+#define EXTERN(type)		extern type
 
 
 /* This macro is used to declare a "method", that is, a function pointer.
@@ -245,13 +195,11 @@ PR_END_EXTERN_C
  * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
  */
 
-#ifndef FAR
 #ifdef NEED_FAR_POINTERS
 #define FAR  far
 #else
 #define FAR
 #endif
-#endif
 
 
 /*
@@ -261,20 +209,8 @@ PR_END_EXTERN_C
  * Defining HAVE_BOOLEAN before including jpeglib.h should make it work.
  */
 
-/* Mozilla mod: IJG distribution makes boolean = int, but on Windows
- * it's far safer to define boolean = unsigned char.  Easier to switch
- * than fight.
- */
-
-/* For some reason, on SunOS 5.3 HAVE_BOOLEAN gets defined when using
- * gcc, but boolean doesn't.  Even if you use -UHAVE_BOOLEAN, it still
- * gets reset somewhere.
- */
-#if defined(MUST_UNDEF_HAVE_BOOLEAN_AFTER_INCLUDES) && defined(HAVE_BOOLEAN)
-#undef HAVE_BOOLEAN
-#endif
 #ifndef HAVE_BOOLEAN
-typedef unsigned char boolean;
+typedef int boolean;
 #endif
 #ifndef FALSE			/* in case these macros already exist */
 #define FALSE	0		/* values of boolean */
@@ -306,22 +242,13 @@ typedef unsigned char boolean;
  * (You may HAVE to do that if your compiler doesn't like null source files.)
  */
 
-/*
- * Mozilla mods here: undef some features not actually used by the browser.
- * This reduces object code size and more importantly allows us to compile
- * even with broken compilers that crash when fed certain modules of the
- * IJG sources.  Currently we undef:
- * DCT_FLOAT_SUPPORTED INPUT_SMOOTHING_SUPPORTED IDCT_SCALING_SUPPORTED
- * QUANT_1PASS_SUPPORTED QUANT_2PASS_SUPPORTED
- */
-
 /* Arithmetic coding is unsupported for legal reasons.  Complaints to IBM. */
 
 /* Capability options common to encoder and decoder: */
 
 #define DCT_ISLOW_SUPPORTED	/* slow but accurate integer algorithm */
-#undef  DCT_IFAST_SUPPORTED	/* faster, less accurate integer method */
-#undef  DCT_FLOAT_SUPPORTED	/* floating-point: accurate, fast on fast HW */
+#define DCT_IFAST_SUPPORTED	/* faster, less accurate integer method */
+#define DCT_FLOAT_SUPPORTED	/* floating-point: accurate, fast on fast HW */
 
 /* Encoder capability options: */
 
@@ -337,7 +264,7 @@ typedef unsigned char boolean;
  * The exact same statements apply for progressive JPEG: the default tables
  * don't work for progressive mode.  (This may get fixed, however.)
  */
-#undef  INPUT_SMOOTHING_SUPPORTED   /* Input image smoothing option? */
+#define INPUT_SMOOTHING_SUPPORTED   /* Input image smoothing option? */
 
 /* Decoder capability options: */
 
@@ -346,11 +273,11 @@ typedef unsigned char boolean;
 #define D_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
 #define SAVE_MARKERS_SUPPORTED	    /* jpeg_save_markers() needed? */
 #define BLOCK_SMOOTHING_SUPPORTED   /* Block smoothing? (Progressive only) */
-#undef  IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
+#define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
 #undef  UPSAMPLE_SCALING_SUPPORTED  /* Output rescaling at upsample stage? */
 #define UPSAMPLE_MERGING_SUPPORTED  /* Fast path for sloppy upsampling? */
-#undef  QUANT_1PASS_SUPPORTED	    /* 1-pass color quantization? */
-#undef  QUANT_2PASS_SUPPORTED	    /* 2-pass color quantization? */
+#define QUANT_1PASS_SUPPORTED	    /* 1-pass color quantization? */
+#define QUANT_2PASS_SUPPORTED	    /* 2-pass color quantization? */
 
 /* more capability options later, no doubt */
 
@@ -375,32 +302,25 @@ typedef unsigned char boolean;
 #define RGB_BLUE	2	/* Offset of Blue */
 #define RGB_PIXELSIZE	3	/* JSAMPLEs per RGB scanline element */
 
+#define JPEG_NUMCS 12
 
-/* Definitions for speed-related optimizations. */
-
+static const int rgb_red[JPEG_NUMCS] = {
+	-1, -1, RGB_RED, -1, -1, -1, 0, 0, 2, 2, 3, 1
+};
 
-/* If your compiler supports inline functions, define INLINE
- * as the inline keyword; otherwise define it as empty.
- */
+static const int rgb_green[JPEG_NUMCS] = {
+	-1, -1, RGB_GREEN, -1, -1, -1, 1, 1, 1, 1, 2, 2
+};
 
-/* Mozilla mods here: add more ways of defining INLINE */
+static const int rgb_blue[JPEG_NUMCS] = {
+	-1, -1, RGB_BLUE, -1, -1, -1, 2, 2, 0, 0, 1, 3
+};
 
-#ifndef INLINE
-#ifdef __GNUC__			/* for instance, GNU C knows about inline */
-#define INLINE __inline__
-#endif
-#if defined( __IBMC__ ) || defined (__IBMCPP__)
-#define INLINE _Inline
-#endif
-#ifndef INLINE
-#ifdef __cplusplus
-#define INLINE inline		/* a C++ compiler should have it too */
-#else
-#define INLINE			/* default is to define it as empty */
-#endif
-#endif
-#endif
+static const int rgb_pixelsize[JPEG_NUMCS] = {
+	-1, -1, RGB_PIXELSIZE, -1, -1, -1, 3, 4, 3, 4, 4, 4
+};
 
+/* Definitions for speed-related optimizations. */
 
 /* On some machines (notably 68000 series) "int" is 32 bits, but multiplying
  * two 16-bit shorts is faster than multiplying two ints.  Define MULTIPLIER
@@ -408,7 +328,11 @@ typedef unsigned char boolean;
  */
 
 #ifndef MULTIPLIER
-#define MULTIPLIER  int16		/* type for fastest integer multiply */
+#ifndef WITH_SIMD
+#define MULTIPLIER  int		/* type for fastest integer multiply */
+#else
+#define MULTIPLIER short  /* prefer 16-bit with SIMD for parallelism */
+#endif
 #endif
 
 
diff --git a/jpeg/jos2fig.h b/jpeg/jos2fig.h
deleted file mode 100644
index 0bef25f2deef..000000000000
--- a/jpeg/jos2fig.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* ***** BEGIN LICENSE BLOCK *****
- * Version: MPL 1.1/GPL 2.0/LGPL 2.1
- *
- * The contents of this file are subject to the Mozilla Public License Version
- * 1.1 (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * http://www.mozilla.org/MPL/
- *
- * Software distributed under the License is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
- * for the specific language governing rights and limitations under the
- * License.
- *
- * The Original Code is mozilla.org code.
- *
- * The Initial Developer of the Original Code is
- * Netscape Communications Corporation.
- * Portions created by the Initial Developer are Copyright (C) 1998
- * the Initial Developer. All Rights Reserved.
- *
- * Contributor(s):
- *
- * Alternatively, the contents of this file may be used under the terms of
- * either the GNU General Public License Version 2 or later (the "GPL"), or
- * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
- * in which case the provisions of the GPL or the LGPL are applicable instead
- * of those above. If you wish to allow use of your version of this file only
- * under the terms of either the GPL or the LGPL, and not to allow others to
- * use your version of this file under the terms of the MPL, indicate your
- * decision by deleting the provisions above and replace them with the notice
- * and other provisions required by the GPL or the LGPL. If you do not delete
- * the provisions above, a recipient may use your version of this file under
- * the terms of any one of the MPL, the GPL or the LGPL.
- *
- * ***** END LICENSE BLOCK ***** */
-
-#ifndef __jos2fig_h__
-#define __jos2fig_h__
-
-/*
-** Place holder for the OS/2 code that might actually make it into the trunk someday.  Maybe.
-*/
-
-#endif  /* __jos2fig_h__ */
diff --git a/jpeg/jpegcomp.h b/jpeg/jpegcomp.h
new file mode 100644
index 000000000000..1b9e0a4fac4e
--- /dev/null
+++ b/jpeg/jpegcomp.h
@@ -0,0 +1,26 @@
+/*
+ * jpegcomp.h
+ *
+ * Copyright (C) 2010, D. R. Commander
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * JPEG compatibility macros: each _name below expands to the field name
+ * used by the configured JPEG_LIB_VERSION (v7+ layout vs. the older one).
+ * Internal to the JPEG library; most applications shouldn't include this file.
+ */
+
+#if JPEG_LIB_VERSION >= 70	/* v7+ names: separate h/v scaled sizes */
+#define _DCT_scaled_size DCT_h_scaled_size	/* horizontal field stands in for the single size */
+#define _min_DCT_scaled_size min_DCT_h_scaled_size	/* likewise, the horizontal minimum */
+#define _min_DCT_h_scaled_size min_DCT_h_scaled_size
+#define _min_DCT_v_scaled_size min_DCT_v_scaled_size
+#define _jpeg_width jpeg_width
+#define _jpeg_height jpeg_height
+#else	/* JPEG_LIB_VERSION < 70, or undefined (an undefined name is 0 in #if) */
+#define _DCT_scaled_size DCT_scaled_size
+#define _min_DCT_scaled_size min_DCT_scaled_size
+#define _min_DCT_h_scaled_size min_DCT_scaled_size	/* one field serves both directions */
+#define _min_DCT_v_scaled_size min_DCT_scaled_size
+#define _jpeg_width image_width	/* jpeg_width/jpeg_height are v7+ only */
+#define _jpeg_height image_height
+#endif	/* JPEG_LIB_VERSION >= 70 */
diff --git a/jpeg/jpegint.h b/jpeg/jpegint.h
index 95b00d405cae..3ba7be0827fd 100644
--- a/jpeg/jpegint.h
+++ b/jpeg/jpegint.h
@@ -2,6 +2,7 @@
  * jpegint.h
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -304,6 +305,7 @@ struct jpeg_color_quantizer {
 #define jinit_forward_dct	jIFDCT
 #define jinit_huff_encoder	jIHEncoder
 #define jinit_phuff_encoder	jIPHEncoder
+#define jinit_arith_encoder	jIAEncoder
 #define jinit_marker_writer	jIMWriter
 #define jinit_master_decompress	jIDMaster
 #define jinit_d_main_controller	jIDMainC
@@ -313,6 +315,7 @@ struct jpeg_color_quantizer {
 #define jinit_marker_reader	jIMReader
 #define jinit_huff_decoder	jIHDecoder
 #define jinit_phuff_decoder	jIPHDecoder
+#define jinit_arith_decoder	jIADecoder
 #define jinit_inverse_dct	jIIDCT
 #define jinit_upsampler		jIUpsampler
 #define jinit_color_deconverter	jIDColor
@@ -327,6 +330,7 @@ struct jpeg_color_quantizer {
 #define jzero_far		jZeroFar
 #define jpeg_zigzag_order	jZIGTable
 #define jpeg_natural_order	jZAGTable
+#define jpeg_aritab		jAriTab
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
 
@@ -345,6 +349,7 @@ EXTERN(void) jinit_downsampler JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_forward_dct JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_huff_encoder JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_phuff_encoder JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_arith_encoder JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_marker_writer JPP((j_compress_ptr cinfo));
 /* Decompression module initialization routines */
 EXTERN(void) jinit_master_decompress JPP((j_decompress_ptr cinfo));
@@ -358,6 +363,7 @@ EXTERN(void) jinit_input_controller JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_marker_reader JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_huff_decoder JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_phuff_decoder JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_arith_decoder JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_inverse_dct JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_upsampler JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_color_deconverter JPP((j_decompress_ptr cinfo));
@@ -369,7 +375,7 @@ EXTERN(void) jinit_memory_mgr JPP((j_common_ptr cinfo));
 
 /* Utility routines in jutils.c */
 EXTERN(long) jdiv_round_up JPP((long a, long b));
-EXTERN(long) jround_up JPP((long a, long b));
+EXTERN(size_t) jround_up JPP((size_t a, size_t b));
 EXTERN(void) jcopy_sample_rows JPP((JSAMPARRAY input_array, int source_row,
 				    JSAMPARRAY output_array, int dest_row,
 				    int num_rows, JDIMENSION num_cols));
@@ -382,6 +388,9 @@ extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
 #endif
 extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
 
+/* Arithmetic coding probability estimation tables in jaricom.c */
+extern const INT32 jpeg_aritab[];
+
 /* Suppress undefined-structure complaints if necessary. */
 
 #ifdef INCOMPLETE_TYPES_BROKEN
diff --git a/jpeg/jpeglib.h b/jpeg/jpeglib.h
index 1c63ebfe9022..cb3acaf910a9 100644
--- a/jpeg/jpeglib.h
+++ b/jpeg/jpeglib.h
@@ -2,6 +2,8 @@
  * jpeglib.h
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
+ * Copyright (C) 2009-2010, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -13,23 +15,6 @@
 #ifndef JPEGLIB_H
 #define JPEGLIB_H
 
-#ifdef XP_OS2
-/*
- * On OS/2, the system will have defined RGB_* so we #undef 'em to avoid warnings
- * from jmorecfg.h.
- */
-#ifdef RGB_RED
-	#undef RGB_RED
-#endif
-#ifdef RGB_GREEN
-	#undef RGB_GREEN
-#endif
-#ifdef RGB_BLUE
-	#undef RGB_BLUE
-#endif
-
-#endif
-
 /*
  * First we include the configuration files that record how this
  * installation of the JPEG library is set up.  jconfig.h can be
@@ -43,16 +28,11 @@
 #include "jmorecfg.h"		/* seldom changed options */
 
 
-#ifdef HAVE_MMX_INTEL_MNEMONICS
-	extern int MMXAvailable;
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C	/* defining this omits the extern "C" wrapper */
+extern "C" {			/* closed just before the final #endif of this header */
+#endif /* !DONT_USE_EXTERN_C */
 #endif
-
-
-/* Version ID for the JPEG library.
- * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
- */
-
-#define JPEG_LIB_VERSION  62	/* Version 6b */
 
 
 /* Various constants determining the sizes of things.
@@ -166,12 +146,17 @@ typedef struct {
    * Values of 1,2,4,8 are likely to be supported.  Note that different
    * components may receive different IDCT scalings.
    */
+#if JPEG_LIB_VERSION >= 70
+  int DCT_h_scaled_size;
+  int DCT_v_scaled_size;
+#else
   int DCT_scaled_size;
+#endif
   /* The downsampled dimensions are the component's actual, unpadded number
    * of samples at the main buffer (preprocessing/compression interface), thus
    * downsampled_width = ceil(image_width * Hi/Hmax)
    * and similarly for height.  For decompression, IDCT scaling is included, so
-   * downsampled_width = ceil(image_width * Hi/Hmax * DCT_scaled_size/DCTSIZE)
+   * downsampled_width = ceil(image_width * Hi/Hmax * DCT_[h_]scaled_size/DCTSIZE)
    */
   JDIMENSION downsampled_width;	 /* actual width in samples */
   JDIMENSION downsampled_height; /* actual height in samples */
@@ -186,7 +171,7 @@ typedef struct {
   int MCU_width;		/* number of blocks per MCU, horizontally */
   int MCU_height;		/* number of blocks per MCU, vertically */
   int MCU_blocks;		/* MCU_width * MCU_height */
-  int MCU_sample_width;		/* MCU width in samples, MCU_width*DCT_scaled_size */
+  int MCU_sample_width;		/* MCU width in samples, MCU_width*DCT_[h_]scaled_size */
   int last_col_width;		/* # of non-dummy blocks across in last MCU */
   int last_row_height;		/* # of non-dummy blocks down in last MCU */
 
@@ -225,13 +210,22 @@ struct jpeg_marker_struct {
 
 /* Known color spaces. */
 
+#define JCS_EXTENSIONS 1	/* presumably a feature-test macro for the JCS_EXT_* values below */
+
 typedef enum {
 	JCS_UNKNOWN,		/* error/unspecified */
 	JCS_GRAYSCALE,		/* monochrome */
-	JCS_RGB,		/* red/green/blue */
+	JCS_RGB,		/* red/green/blue as specified by the RGB_RED, RGB_GREEN,
+				   RGB_BLUE, and RGB_PIXELSIZE macros */
 	JCS_YCbCr,		/* Y/Cb/Cr (also known as YUV) */
 	JCS_CMYK,		/* C/M/Y/K */
-	JCS_YCCK		/* Y/Cb/Cr/K */
+	JCS_YCCK,		/* Y/Cb/Cr/K */
+	JCS_EXT_RGB,		/* red/green/blue (presumably a fixed order, unlike JCS_RGB -- confirm) */
+	JCS_EXT_RGBX,		/* red/green/blue/x (x: presumably an unused filler sample -- confirm) */
+	JCS_EXT_BGR,		/* blue/green/red */
+	JCS_EXT_BGRX,		/* blue/green/red/x */
+	JCS_EXT_XBGR,		/* x/blue/green/red */
+	JCS_EXT_XRGB		/* x/red/green/blue */
 } J_COLOR_SPACE;
 
 /* DCT/IDCT algorithm options. */
@@ -313,6 +307,19 @@ struct jpeg_compress_struct {
    * helper routines to simplify changing parameters.
    */
 
+#if JPEG_LIB_VERSION >= 70
+  unsigned int scale_num, scale_denom; /* fraction by which to scale image */
+
+  JDIMENSION jpeg_width;	/* scaled JPEG image width */
+  JDIMENSION jpeg_height;	/* scaled JPEG image height */
+  /* Dimensions of actual JPEG image that will be written to file,
+   * derived from input dimensions by scaling factors above.
+   * These fields are computed by jpeg_start_compress().
+   * You can also use jpeg_calc_jpeg_dimensions() to determine these values
+   * in advance of calling jpeg_start_compress().
+   */
+#endif
+
   int data_precision;		/* bits of precision in image data */
 
   int num_components;		/* # of color components in JPEG image */
@@ -320,14 +327,19 @@ struct jpeg_compress_struct {
 
   jpeg_component_info * comp_info;
   /* comp_info[i] describes component that appears i'th in SOF */
-  
+
   JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS];
-  /* ptrs to coefficient quantization tables, or NULL if not defined */
-  
+#if JPEG_LIB_VERSION >= 70
+  int q_scale_factor[NUM_QUANT_TBLS];
+#endif
+  /* ptrs to coefficient quantization tables, or NULL if not defined,
+   * and corresponding scale factors (percentage, initialized 100).
+   */
+
   JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
   JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
   /* ptrs to Huffman coding tables, or NULL if not defined */
-  
+
   UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
   UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
   UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
@@ -343,6 +355,9 @@ struct jpeg_compress_struct {
   boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */
   boolean optimize_coding;	/* TRUE=optimize entropy encoding parms */
   boolean CCIR601_sampling;	/* TRUE=first samples are cosited */
+#if JPEG_LIB_VERSION >= 70
+  boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */
+#endif
   int smoothing_factor;		/* 1..100, or 0 for no input smoothing */
   J_DCT_METHOD dct_method;	/* DCT algorithm selector */
 
@@ -386,6 +401,11 @@ struct jpeg_compress_struct {
   int max_h_samp_factor;	/* largest h_samp_factor */
   int max_v_samp_factor;	/* largest v_samp_factor */
 
+#if JPEG_LIB_VERSION >= 70
+  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+#endif
+
   JDIMENSION total_iMCU_rows;	/* # of iMCU rows to be input to coef ctlr */
   /* The coefficient controller receives data in units of MCU rows as defined
    * for fully interleaved scans (whether the JPEG file is interleaved or not).
@@ -411,6 +431,12 @@ struct jpeg_compress_struct {
 
   int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */
 
+#if JPEG_LIB_VERSION >= 80
+  int block_size;		/* the basic DCT block size: 1..16 */
+  const int * natural_order;	/* natural-order position array */
+  int lim_Se;			/* min( Se, DCTSIZE2-1 ) */
+#endif
+
   /*
    * Links to compression subobjects (methods and private variables of modules)
    */
@@ -557,6 +583,9 @@ struct jpeg_decompress_struct {
   jpeg_component_info * comp_info;
   /* comp_info[i] describes component that appears i'th in SOF */
 
+#if JPEG_LIB_VERSION >= 80
+  boolean is_baseline;		/* TRUE if Baseline SOF0 encountered */
+#endif
   boolean progressive_mode;	/* TRUE if SOFn specifies progressive mode */
   boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */
 
@@ -597,7 +626,12 @@ struct jpeg_decompress_struct {
   int max_h_samp_factor;	/* largest h_samp_factor */
   int max_v_samp_factor;	/* largest v_samp_factor */
 
+#if JPEG_LIB_VERSION >= 70
+  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+#else
   int min_DCT_scaled_size;	/* smallest DCT_scaled_size of any component */
+#endif
 
   JDIMENSION total_iMCU_rows;	/* # of iMCU rows in image */
   /* The coefficient controller's input and output progress is measured in
@@ -605,7 +639,7 @@ struct jpeg_decompress_struct {
    * in fully interleaved JPEG scans, but are used whether the scan is
    * interleaved or not.  We define an iMCU row as v_samp_factor DCT block
    * rows of each component.  Therefore, the IDCT output contains
-   * v_samp_factor*DCT_scaled_size sample rows of a component per iMCU row.
+   * v_samp_factor*DCT_[v_]scaled_size sample rows of a component per iMCU row.
    */
 
   JSAMPLE * sample_range_limit; /* table for fast range-limiting */
@@ -629,6 +663,14 @@ struct jpeg_decompress_struct {
 
   int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */
 
+#if JPEG_LIB_VERSION >= 80
+  /* These fields are derived from Se of first SOS marker.
+   */
+  int block_size;		/* the basic DCT block size: 1..16 */
+  const int * natural_order; /* natural-order position array for entropy decode */
+  int lim_Se;			/* min( Se, DCTSIZE2-1 ) for entropy decode */
+#endif
+
   /* This field is shared between entropy decoder and marker parser.
    * It is either zero or the code of a JPEG marker that has been
    * read from the data source, but has not yet been processed.
@@ -858,11 +900,18 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_destroy_decompress	jDestDecompress
 #define jpeg_stdio_dest		jStdDest
 #define jpeg_stdio_src		jStdSrc
+#if JPEG_LIB_VERSION >= 80
+#define jpeg_mem_dest		jMemDest
+#define jpeg_mem_src		jMemSrc
+#endif
 #define jpeg_set_defaults	jSetDefaults
 #define jpeg_set_colorspace	jSetColorspace
 #define jpeg_default_colorspace	jDefColorspace
 #define jpeg_set_quality	jSetQuality
 #define jpeg_set_linear_quality	jSetLQuality
+#if JPEG_LIB_VERSION >= 70
+#define jpeg_default_qtables	jDefQTables
+#endif
 #define jpeg_add_quant_table	jAddQuantTable
 #define jpeg_quality_scaling	jQualityScaling
 #define jpeg_simple_progression	jSimProgress
@@ -872,6 +921,9 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_start_compress	jStrtCompress
 #define jpeg_write_scanlines	jWrtScanlines
 #define jpeg_finish_compress	jFinCompress
+#if JPEG_LIB_VERSION >= 70
+#define jpeg_calc_jpeg_dimensions	jCjpegDimensions
+#endif
 #define jpeg_write_raw_data	jWrtRawData
 #define jpeg_write_marker	jWrtMarker
 #define jpeg_write_m_header	jWrtMHeader
@@ -888,6 +940,9 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_input_complete	jInComplete
 #define jpeg_new_colormap	jNewCMap
 #define jpeg_consume_input	jConsumeInput
+#if JPEG_LIB_VERSION >= 80
+#define jpeg_core_output_dimensions	jCoreDimensions
+#endif
 #define jpeg_calc_output_dimensions	jCalcDimensions
 #define jpeg_save_markers	jSaveMarkers
 #define jpeg_set_marker_processor	jSetMarker
@@ -901,9 +956,6 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_resync_to_restart	jResyncRestart
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
-#ifdef __cplusplus
-extern "C" {
-#endif
 
 /* Default error-management setup */
 EXTERN(struct jpeg_error_mgr *) jpeg_std_error
@@ -935,6 +987,16 @@ EXTERN(void) jpeg_destroy_decompress JPP((j_decompress_ptr cinfo));
 EXTERN(void) jpeg_stdio_dest JPP((j_compress_ptr cinfo, FILE * outfile));
 EXTERN(void) jpeg_stdio_src JPP((j_decompress_ptr cinfo, FILE * infile));
 
+#if JPEG_LIB_VERSION >= 80
+/* Data source and destination managers: memory buffers. */
+EXTERN(void) jpeg_mem_dest JPP((j_compress_ptr cinfo,
+			       unsigned char ** outbuffer,
+			       unsigned long * outsize));
+EXTERN(void) jpeg_mem_src JPP((j_decompress_ptr cinfo,
+			      unsigned char * inbuffer,
+			      unsigned long insize));
+#endif
+
 /* Default parameter setup for compression */
 EXTERN(void) jpeg_set_defaults JPP((j_compress_ptr cinfo));
 /* Compression parameter setup aids */
@@ -946,6 +1008,10 @@ EXTERN(void) jpeg_set_quality JPP((j_compress_ptr cinfo, int quality,
 EXTERN(void) jpeg_set_linear_quality JPP((j_compress_ptr cinfo,
 					  int scale_factor,
 					  boolean force_baseline));
+#if JPEG_LIB_VERSION >= 70
+EXTERN(void) jpeg_default_qtables JPP((j_compress_ptr cinfo,
+				       boolean force_baseline));
+#endif
 EXTERN(void) jpeg_add_quant_table JPP((j_compress_ptr cinfo, int which_tbl,
 				       const unsigned int *basic_table,
 				       int scale_factor,
@@ -965,12 +1031,17 @@ EXTERN(JDIMENSION) jpeg_write_scanlines JPP((j_compress_ptr cinfo,
 					     JDIMENSION num_lines));
 EXTERN(void) jpeg_finish_compress JPP((j_compress_ptr cinfo));
 
+#if JPEG_LIB_VERSION >= 70
+/* Precalculate JPEG dimensions for current compression parameters. */
+EXTERN(void) jpeg_calc_jpeg_dimensions JPP((j_compress_ptr cinfo));
+#endif
+
 /* Replaces jpeg_write_scanlines when writing raw downsampled data. */
 EXTERN(JDIMENSION) jpeg_write_raw_data JPP((j_compress_ptr cinfo,
 					    JSAMPIMAGE data,
 					    JDIMENSION num_lines));
 
-/* Write a special marker.  See libjpeg.doc concerning safe usage. */
+/* Write a special marker.  See libjpeg.txt concerning safe usage. */
 EXTERN(void) jpeg_write_marker
 	JPP((j_compress_ptr cinfo, int marker,
 	     const JOCTET * dataptr, unsigned int datalen));
@@ -1024,6 +1095,9 @@ EXTERN(int) jpeg_consume_input JPP((j_decompress_ptr cinfo));
 #define JPEG_SCAN_COMPLETED	4 /* Completed last iMCU row of a scan */
 
 /* Precalculate output dimensions for current decompression parameters. */
+#if JPEG_LIB_VERSION >= 80
+EXTERN(void) jpeg_core_output_dimensions JPP((j_decompress_ptr cinfo));
+#endif
 EXTERN(void) jpeg_calc_output_dimensions JPP((j_decompress_ptr cinfo));
 
 /* Control saving of COM and APPn markers into marker_list. */
@@ -1062,9 +1136,6 @@ EXTERN(void) jpeg_destroy JPP((j_common_ptr cinfo));
 EXTERN(boolean) jpeg_resync_to_restart JPP((j_decompress_ptr cinfo,
 					    int desired));
 
-#ifdef __cplusplus
-} /* extern "C" */
-#endif /* __cplusplus */
 
 /* These marker codes are exported since applications and data source modules
  * are likely to want to use them.
@@ -1121,4 +1192,10 @@ struct jpeg_color_quantizer { long dummy; };
 #include "jerror.h"		/* fetch error codes too */
 #endif
 
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+}	/* extern "C" */
+#endif /* !DONT_USE_EXTERN_C */
+#endif /* __cplusplus */
+
 #endif /* JPEGLIB_H */
diff --git a/jpeg/jquant1.c b/jpeg/jquant1.c
index b2f96aa15d25..362bb1eb2049 100644
--- a/jpeg/jquant1.c
+++ b/jpeg/jquant1.c
@@ -2,6 +2,7 @@
  * jquant1.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -193,7 +194,10 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
   int total_colors, iroot, i, j;
   boolean changed;
   long temp;
-  static const int RGB_order[3] = { RGB_GREEN, RGB_RED, RGB_BLUE };
+  int RGB_order[3] = { RGB_GREEN, RGB_RED, RGB_BLUE };
+  RGB_order[0] = rgb_green[cinfo->out_color_space];
+  RGB_order[1] = rgb_red[cinfo->out_color_space];
+  RGB_order[2] = rgb_blue[cinfo->out_color_space];
 
   /* We can allocate at least the nc'th root of max_colors per component. */
   /* Compute floor(nc'th root of max_colors). */
diff --git a/jpeg/jquant2.c b/jpeg/jquant2.c
index af601e334b24..da964f7d5bc2 100644
--- a/jpeg/jquant2.c
+++ b/jpeg/jquant2.c
@@ -2,6 +2,7 @@
  * jquant2.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -74,29 +75,10 @@
 #define G_SCALE 3		/* scale G distances by this much */
 #define B_SCALE 1		/* and B by this much */
 
-/* Relabel R/G/B as components 0/1/2, respecting the RGB ordering defined
- * in jmorecfg.h.  As the code stands, it will do the right thing for R,G,B
- * and B,G,R orders.  If you define some other weird order in jmorecfg.h,
- * you'll get compile errors until you extend this logic.  In that case
- * you'll probably want to tweak the histogram sizes too.
- */
-
-#if RGB_RED == 0
-#define C0_SCALE R_SCALE
-#endif
-#if RGB_BLUE == 0
-#define C0_SCALE B_SCALE
-#endif
-#if RGB_GREEN == 1
-#define C1_SCALE G_SCALE
-#endif
-#if RGB_RED == 2
-#define C2_SCALE R_SCALE
-#endif
-#if RGB_BLUE == 2
-#define C2_SCALE B_SCALE
-#endif
-
+static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};	/* distance weights in R,G,B order */
+#define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]]	/* run-time lookup (was a constant); needs `cinfo' in scope */
+#define C1_SCALE c_scales[rgb_green[cinfo->out_color_space]]	/* NOTE(review): assumes rgb_*[] give an index 0..2 -- confirm */
+#define C2_SCALE c_scales[rgb_blue[cinfo->out_color_space]]	/* NOTE(review): presumably still only right for RGB/BGR orders -- confirm */
 
 /*
  * First we have the histogram data structure and routines for creating it.
@@ -454,15 +436,16 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
     /* We want to break any ties in favor of green, then red, blue last.
      * This code does the right thing for R,G,B or B,G,R color orders only.
      */
-#if RGB_RED == 0
-    cmax = c1; n = 1;
-    if (c0 > cmax) { cmax = c0; n = 0; }
-    if (c2 > cmax) { n = 2; }
-#else
-    cmax = c1; n = 1;
-    if (c2 > cmax) { cmax = c2; n = 2; }
-    if (c0 > cmax) { n = 0; }
-#endif
+    if (rgb_red[cinfo->out_color_space] == 0) {
+      cmax = c1; n = 1;
+      if (c0 > cmax) { cmax = c0; n = 0; }
+      if (c2 > cmax) { n = 2; }
+    }
+    else {
+      cmax = c1; n = 1;
+      if (c2 > cmax) { cmax = c2; n = 2; }
+      if (c0 > cmax) { n = 0; }
+    }
     /* Choose split point along selected axis, and update box bounds.
      * Current algorithm: split at halfway point.
      * (Since the box has been shrunk to minimum volume,
diff --git a/jpeg/jsimd.h b/jpeg/jsimd.h
new file mode 100644
index 000000000000..b6637915372f
--- /dev/null
+++ b/jpeg/jsimd.h
@@ -0,0 +1,90 @@
+/*
+ * jsimd.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ */
+
+/* Short forms of external names, for linkers that need short symbols.
+ * Each macro renames one of the functions declared later in this header. */
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jsimd_can_rgb_ycc                 jSCanRgbYcc
+#define jsimd_can_ycc_rgb                 jSCanYccRgb
+#define jsimd_rgb_ycc_convert             jSRgbYccConv
+#define jsimd_ycc_rgb_convert             jSYccRgbConv
+#define jsimd_can_h2v2_downsample         jSCanH2V2Down
+#define jsimd_can_h2v1_downsample         jSCanH2V1Down
+#define jsimd_h2v2_downsample             jSH2V2Down
+#define jsimd_h2v1_downsample             jSH2V1Down
+#define jsimd_can_h2v2_upsample           jSCanH2V2Up
+#define jsimd_can_h2v1_upsample           jSCanH2V1Up
+#define jsimd_h2v2_upsample               jSH2V2Up
+#define jsimd_h2v1_upsample               jSH2V1Up
+#define jsimd_can_h2v2_fancy_upsample     jSCanH2V2FUp
+#define jsimd_can_h2v1_fancy_upsample     jSCanH2V1FUp
+#define jsimd_h2v2_fancy_upsample         jSH2V2FUp
+#define jsimd_h2v1_fancy_upsample         jSH2V1FUp
+#define jsimd_can_h2v2_merged_upsample    jSCanH2V2MUp
+#define jsimd_can_h2v1_merged_upsample    jSCanH2V1MUp
+#define jsimd_h2v2_merged_upsample        jSH2V2MUp
+#define jsimd_h2v1_merged_upsample        jSH2V1MUp
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+EXTERN(int) jsimd_can_rgb_ycc JPP((void));	/* use jsimd_rgb_ycc_convert()? presumably nonzero = yes */
+EXTERN(int) jsimd_can_ycc_rgb JPP((void));	/* use jsimd_ycc_rgb_convert()? presumably nonzero = yes */
+
+EXTERN(void) jsimd_rgb_ycc_convert	/* compression side: takes a j_compress_ptr */
+        JPP((j_compress_ptr cinfo,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_ycc_rgb_convert	/* decompression side: takes a j_decompress_ptr */
+        JPP((j_decompress_ptr cinfo,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
+EXTERN(int) jsimd_can_h2v2_downsample JPP((void));	/* use jsimd_h2v2_downsample()? presumably nonzero = yes */
+EXTERN(int) jsimd_can_h2v1_downsample JPP((void));	/* use jsimd_h2v1_downsample()? presumably nonzero = yes */
+
+EXTERN(void) jsimd_h2v2_downsample	/* compression side, one component (compptr) per call */
+        JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample	/* same parameter list as the h2v2 variant */
+        JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+EXTERN(int) jsimd_can_h2v2_upsample JPP((void));	/* use jsimd_h2v2_upsample()? presumably nonzero = yes */
+EXTERN(int) jsimd_can_h2v1_upsample JPP((void));	/* use jsimd_h2v1_upsample()? presumably nonzero = yes */
+
+EXTERN(void) jsimd_h2v2_upsample	/* decompression side; output passed as JSAMPARRAY * */
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample	/* same parameter list as the h2v2 variant */
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(int) jsimd_can_h2v2_fancy_upsample JPP((void));	/* use jsimd_h2v2_fancy_upsample()? presumably nonzero = yes */
+EXTERN(int) jsimd_can_h2v1_fancy_upsample JPP((void));	/* use jsimd_h2v1_fancy_upsample()? presumably nonzero = yes */
+
+EXTERN(void) jsimd_h2v2_fancy_upsample	/* same parameter list as jsimd_h2v2_upsample */
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample	/* same parameter list as jsimd_h2v1_upsample */
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(int) jsimd_can_h2v2_merged_upsample JPP((void));	/* use jsimd_h2v2_merged_upsample()? presumably nonzero = yes */
+EXTERN(int) jsimd_can_h2v1_merged_upsample JPP((void));	/* use jsimd_h2v1_merged_upsample()? presumably nonzero = yes */
+
+EXTERN(void) jsimd_h2v2_merged_upsample	/* takes a JSAMPIMAGE plus a row-group counter, no compptr */
+        JPP((j_decompress_ptr cinfo,
+             JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+             JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample	/* same parameter list as the h2v2 variant */
+        JPP((j_decompress_ptr cinfo,
+             JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+             JSAMPARRAY output_buf));
+
diff --git a/jpeg/jsimd_none.c b/jpeg/jsimd_none.c
new file mode 100644
index 000000000000..7ff30742cc99
--- /dev/null
+++ b/jpeg/jsimd_none.c
@@ -0,0 +1,300 @@
+/*
+ * jsimd_none.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains stubs for when there is no SIMD support available.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jsimd.h"
+#include "jdct.h"
+#include "jsimddct.h"
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_rgb_ycc_convert() is only a stub */
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_ycc_rgb_convert() is only a stub */
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{  /* no-op stub: arguments are ignored and output_buf is not written */
+}  /* presumably never reached while jsimd_can_rgb_ycc() returns 0 -- confirm */
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{  /* no-op stub: arguments are ignored and output_buf is not written */
+}  /* presumably never reached while jsimd_can_ycc_rgb() returns 0 -- confirm */
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_h2v2_downsample() is only a stub */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_h2v1_downsample() is only a stub */
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{  /* no-op stub: arguments are ignored and output_data is not written */
+}  /* presumably never reached while jsimd_can_h2v2_downsample() returns 0 -- confirm */
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{  /* no-op stub: arguments are ignored and output_data is not written */
+}  /* presumably never reached while jsimd_can_h2v1_downsample() returns 0 -- confirm */
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_h2v2_upsample() is only a stub */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_h2v1_upsample() is only a stub */
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{  /* no-op stub: arguments are ignored, *output_data_ptr is left as passed in */
+}  /* presumably never reached while jsimd_can_h2v2_upsample() returns 0 -- confirm */
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{  /* no-op stub: arguments are ignored, *output_data_ptr is left as passed in */
+}  /* presumably never reached while jsimd_can_h2v1_upsample() returns 0 -- confirm */
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_h2v2_fancy_upsample() is only a stub */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_h2v1_fancy_upsample() is only a stub */
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{  /* no-op stub: arguments are ignored, *output_data_ptr is left as passed in */
+}  /* presumably never reached while jsimd_can_h2v2_fancy_upsample() returns 0 -- confirm */
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{  /* no-op stub: arguments are ignored, *output_data_ptr is left as passed in */
+}  /* presumably never reached while jsimd_can_h2v1_fancy_upsample() returns 0 -- confirm */
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_h2v2_merged_upsample() is only a stub */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_h2v1_merged_upsample() is only a stub */
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{  /* no-op stub: arguments are ignored and output_buf is not written */
+}  /* presumably never reached while jsimd_can_h2v2_merged_upsample() returns 0 -- confirm */
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{  /* no-op stub: arguments are ignored and output_buf is not written */
+}  /* presumably never reached while jsimd_can_h2v1_merged_upsample() returns 0 -- confirm */
+
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_convsamp() is only a stub */
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_convsamp_float() is only a stub */
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{  /* no-op stub: arguments are ignored and workspace is not filled in */
+}  /* presumably never reached while jsimd_can_convsamp() returns 0 -- confirm */
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{  /* no-op stub: arguments are ignored and workspace is not filled in */
+}  /* presumably never reached while jsimd_can_convsamp_float() returns 0 -- confirm */
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_fdct_islow() is only a stub */
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_fdct_ifast() is only a stub */
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_fdct_float() is only a stub */
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{  /* no-op stub: data is neither read nor modified */
+}  /* presumably never reached while jsimd_can_fdct_islow() returns 0 -- confirm */
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{  /* no-op stub: data is neither read nor modified */
+}  /* presumably never reached while jsimd_can_fdct_ifast() returns 0 -- confirm */
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{  /* no-op stub: data is neither read nor modified */
+}  /* presumably never reached while jsimd_can_fdct_float() returns 0 -- confirm */
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_quantize() is only a stub */
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_quantize_float() is only a stub */
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{  /* no-op stub: arguments are ignored and coef_block is not written */
+}  /* presumably never reached while jsimd_can_quantize() returns 0 -- confirm */
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{  /* no-op stub: arguments are ignored and coef_block is not written */
+}  /* presumably never reached while jsimd_can_quantize_float() returns 0 -- confirm */
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_idct_2x2() is only a stub */
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_idct_4x4() is only a stub */
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* no-op stub: arguments are ignored and output_buf is not written */
+}  /* presumably never reached while jsimd_can_idct_2x2() returns 0 -- confirm */
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* no-op stub: arguments are ignored and output_buf is not written */
+}  /* presumably never reached while jsimd_can_idct_4x4() returns 0 -- confirm */
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_idct_islow() is only a stub */
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_idct_ifast() is only a stub */
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  return 0;  /* always 0: no SIMD support, jsimd_idct_float() is only a stub */
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* no-op stub: arguments are ignored and output_buf is not written */
+}  /* presumably never reached while jsimd_can_idct_islow() returns 0 -- confirm */
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* no-op stub: arguments are ignored and output_buf is not written */
+}  /* presumably never reached while jsimd_can_idct_ifast() returns 0 -- confirm */
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* no-op stub: arguments are ignored and output_buf is not written */
+}  /* presumably never reached while jsimd_can_idct_float() returns 0 -- confirm */
+
diff --git a/jpeg/jsimddct.h b/jpeg/jsimddct.h
new file mode 100644
index 000000000000..a1c74407ea33
--- /dev/null
+++ b/jpeg/jsimddct.h
@@ -0,0 +1,102 @@
+/*
+ * jsimddct.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ */
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES  /* aliases below apply only when the build defines this */
+#define jsimd_can_convsamp                jSCanConv
+#define jsimd_can_convsamp_float          jSCanConvF
+#define jsimd_convsamp                    jSConv
+#define jsimd_convsamp_float              jSConvF
+#define jsimd_can_fdct_islow              jSCanFDCTIS
+#define jsimd_can_fdct_ifast              jSCanFDCTIF
+#define jsimd_can_fdct_float              jSCanFDCTFl
+#define jsimd_fdct_islow                  jSFDCTIS
+#define jsimd_fdct_ifast                  jSFDCTIF
+#define jsimd_fdct_float                  jSFDCTFl
+#define jsimd_can_quantize                jSCanQuant
+#define jsimd_can_quantize_float          jSCanQuantF
+#define jsimd_quantize                    jSQuant
+#define jsimd_quantize_float              jSQuantF
+#define jsimd_can_idct_2x2                jSCanIDCT22
+#define jsimd_can_idct_4x4                jSCanIDCT44
+#define jsimd_idct_2x2                    jSIDCT22
+#define jsimd_idct_4x4                    jSIDCT44
+#define jsimd_can_idct_islow              jSCanIDCTIS
+#define jsimd_can_idct_ifast              jSCanIDCTIF
+#define jsimd_can_idct_float              jSCanIDCTFl
+#define jsimd_idct_islow                  jSIDCTIS
+#define jsimd_idct_ifast                  jSIDCTIF
+#define jsimd_idct_float                  jSIDCTFl
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+/* Each jsimd_can_*() below takes no arguments and returns int; all other routines return void. */
+EXTERN(int) jsimd_can_convsamp JPP((void));
+EXTERN(int) jsimd_can_convsamp_float JPP((void));
+
+EXTERN(void) jsimd_convsamp JPP((JSAMPARRAY sample_data,
+                                 JDIMENSION start_col,
+                                 DCTELEM * workspace));
+EXTERN(void) jsimd_convsamp_float JPP((JSAMPARRAY sample_data,
+                                       JDIMENSION start_col,
+                                       FAST_FLOAT * workspace));
+
+EXTERN(int) jsimd_can_fdct_islow JPP((void));
+EXTERN(int) jsimd_can_fdct_ifast JPP((void));
+EXTERN(int) jsimd_can_fdct_float JPP((void));
+
+EXTERN(void) jsimd_fdct_islow JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_float JPP((FAST_FLOAT * data));
+
+EXTERN(int) jsimd_can_quantize JPP((void));
+EXTERN(int) jsimd_can_quantize_float JPP((void));
+
+EXTERN(void) jsimd_quantize JPP((JCOEFPTR coef_block,
+                                 DCTELEM * divisors,
+                                 DCTELEM * workspace));
+EXTERN(void) jsimd_quantize_float JPP((JCOEFPTR coef_block,
+                                       FAST_FLOAT * divisors,
+                                       FAST_FLOAT * workspace));
+/* The five jsimd_idct_*() prototypes that follow all share one parameter list. */
+EXTERN(int) jsimd_can_idct_2x2 JPP((void));
+EXTERN(int) jsimd_can_idct_4x4 JPP((void));
+
+EXTERN(void) jsimd_idct_2x2 JPP((j_decompress_ptr cinfo,
+                                 jpeg_component_info * compptr,
+                                 JCOEFPTR coef_block,
+                                 JSAMPARRAY output_buf,
+                                 JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4 JPP((j_decompress_ptr cinfo,
+                                 jpeg_component_info * compptr,
+                                 JCOEFPTR coef_block,
+                                 JSAMPARRAY output_buf,
+                                 JDIMENSION output_col));
+
+EXTERN(int) jsimd_can_idct_islow JPP((void));
+EXTERN(int) jsimd_can_idct_ifast JPP((void));
+EXTERN(int) jsimd_can_idct_float JPP((void));
+
+EXTERN(void) jsimd_idct_islow JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
+EXTERN(void) jsimd_idct_float JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
+
diff --git a/jpeg/jutils.c b/jpeg/jutils.c
index d18a9555621b..98b54f5f107a 100644
--- a/jpeg/jutils.c
+++ b/jpeg/jutils.c
@@ -77,8 +77,8 @@ jdiv_round_up (long a, long b)
 }
 
 
-GLOBAL(long)
-jround_up (long a, long b)
+GLOBAL(size_t)
+jround_up (size_t a, size_t b)
 /* Compute a rounded up to next multiple of b, ie, ceil(a/b)*b */
 /* Assumes a >= 0, b > 0 */
 {
diff --git a/jpeg/jversion.h b/jpeg/jversion.h
index 6472c58d351a..119c481c3f26 100644
--- a/jpeg/jversion.h
+++ b/jpeg/jversion.h
@@ -1,7 +1,8 @@
 /*
  * jversion.h
  *
- * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Copyright (C) 1991-2010, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 2010, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -9,6 +10,28 @@
  */
 
 
+#if JPEG_LIB_VERSION >= 80  /* version/copyright strings when JPEG_LIB_VERSION is 80 or higher */
+
+#define JVERSION	"8b  16-May-2010"
+
+#define JCOPYRIGHT	"Copyright (C) 2010, Thomas G. Lane, Guido Vollbeding"
+
+#elif JPEG_LIB_VERSION >= 70  /* reached only for values 70..79 */
+
+#define JVERSION        "7  27-Jun-2009"
+
+#define JCOPYRIGHT      "Copyright (C) 2009, Thomas G. Lane, Guido Vollbeding"
+
+#else  /* below 70; an undefined JPEG_LIB_VERSION evaluates as 0 in #if and also lands here */
+
 #define JVERSION	"6b  27-Mar-1998"
 
 #define JCOPYRIGHT	"Copyright (C) 1998, Thomas G. Lane"
+
+#endif
+
+#define LJTCOPYRIGHT	"Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
+			"Copyright (C) 2004 Landmark Graphics Corporation\n" \
+			"Copyright (C) 2005-2007 Sun Microsystems, Inc.\n" \
+			"Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
+			"Copyright (C) 2009-2011 D. R. Commander"  /* adjacent literals concatenate into one newline-separated string */
diff --git a/jpeg/jwinfig.h b/jpeg/jwinfig.h
deleted file mode 100644
index a558735b77a2..000000000000
--- a/jpeg/jwinfig.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* jconfig.mc6 --- jconfig.h for Microsoft C on MS-DOS, version 6.00A & up. */
-/* see jconfig.doc for explanations */
-
-/* this is a hack */
-#define HAVE_BOOLEAN
-#ifndef __RPCNDR_H__
-typedef unsigned char boolean;
-#endif
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS	/* for small or medium memory model */
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#define USE_MSDOS_MEMANSI
-
-#define MAX_ALLOC_CHUNK 65520L	/* Maximum request to malloc() */
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE
-#define USE_SETMODE		/* Microsoft has setmode() */
-#define NEED_SIGNAL_CATCHER	/* Define this if you use jmemdos.c */
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jpeg/libjpeg.doc b/jpeg/libjpeg.doc
deleted file mode 100644
index 689b206c07fd..000000000000
--- a/jpeg/libjpeg.doc
+++ /dev/null
@@ -1,3006 +0,0 @@
-USING THE IJG JPEG LIBRARY
-
-Copyright (C) 1994-1998, Thomas G. Lane.
-This file is part of the Independent JPEG Group's software.
-For conditions of distribution and use, see the accompanying README file.
-
-
-This file describes how to use the IJG JPEG library within an application
-program.  Read it if you want to write a program that uses the library.
-
-The file example.c provides heavily commented skeleton code for calling the
-JPEG library.  Also see jpeglib.h (the include file to be used by application
-programs) for full details about data structures and function parameter lists.
-The library source code, of course, is the ultimate reference.
-
-Note that there have been *major* changes from the application interface
-presented by IJG version 4 and earlier versions.  The old design had several
-inherent limitations, and it had accumulated a lot of cruft as we added
-features while trying to minimize application-interface changes.  We have
-sacrificed backward compatibility in the version 5 rewrite, but we think the
-improvements justify this.
-
-
-TABLE OF CONTENTS
------------------
-
-Overview:
-	Functions provided by the library
-	Outline of typical usage
-Basic library usage:
-	Data formats
-	Compression details
-	Decompression details
-	Mechanics of usage: include files, linking, etc
-Advanced features:
-	Compression parameter selection
-	Decompression parameter selection
-	Special color spaces
-	Error handling
-	Compressed data handling (source and destination managers)
-	I/O suspension
-	Progressive JPEG support
-	Buffered-image mode
-	Abbreviated datastreams and multiple images
-	Special markers
-	Raw (downsampled) image data
-	Really raw data: DCT coefficients
-	Progress monitoring
-	Memory management
-	Memory usage
-	Library compile-time options
-	Portability considerations
-	Notes for MS-DOS implementors
-
-You should read at least the overview and basic usage sections before trying
-to program with the library.  The sections on advanced features can be read
-if and when you need them.
-
-
-OVERVIEW
-========
-
-Functions provided by the library
----------------------------------
-
-The IJG JPEG library provides C code to read and write JPEG-compressed image
-files.  The surrounding application program receives or supplies image data a
-scanline at a time, using a straightforward uncompressed image format.  All
-details of color conversion and other preprocessing/postprocessing can be
-handled by the library.
-
-The library includes a substantial amount of code that is not covered by the
-JPEG standard but is necessary for typical applications of JPEG.  These
-functions preprocess the image before JPEG compression or postprocess it after
-decompression.  They include colorspace conversion, downsampling/upsampling,
-and color quantization.  The application indirectly selects use of this code
-by specifying the format in which it wishes to supply or receive image data.
-For example, if colormapped output is requested, then the decompression
-library automatically invokes color quantization.
-
-A wide range of quality vs. speed tradeoffs are possible in JPEG processing,
-and even more so in decompression postprocessing.  The decompression library
-provides multiple implementations that cover most of the useful tradeoffs,
-ranging from very-high-quality down to fast-preview operation.  On the
-compression side we have generally not provided low-quality choices, since
-compression is normally less time-critical.  It should be understood that the
-low-quality modes may not meet the JPEG standard's accuracy requirements;
-nonetheless, they are useful for viewers.
-
-A word about functions *not* provided by the library.  We handle a subset of
-the ISO JPEG standard; most baseline, extended-sequential, and progressive
-JPEG processes are supported.  (Our subset includes all features now in common
-use.)  Unsupported ISO options include:
-	* Hierarchical storage
-	* Lossless JPEG
-	* Arithmetic entropy coding (unsupported for legal reasons)
-	* DNL marker
-	* Nonintegral subsampling ratios
-We support both 8- and 12-bit data precision, but this is a compile-time
-choice rather than a run-time choice; hence it is difficult to use both
-precisions in a single application.
-
-By itself, the library handles only interchange JPEG datastreams --- in
-particular the widely used JFIF file format.  The library can be used by
-surrounding code to process interchange or abbreviated JPEG datastreams that
-are embedded in more complex file formats.  (For example, this library is
-used by the free LIBTIFF library to support JPEG compression in TIFF.)
-
-
-Outline of typical usage
-------------------------
-
-The rough outline of a JPEG compression operation is:
-
-	Allocate and initialize a JPEG compression object
-	Specify the destination for the compressed data (eg, a file)
-	Set parameters for compression, including image size & colorspace
-	jpeg_start_compress(...);
-	while (scan lines remain to be written)
-		jpeg_write_scanlines(...);
-	jpeg_finish_compress(...);
-	Release the JPEG compression object
-
-A JPEG compression object holds parameters and working state for the JPEG
-library.  We make creation/destruction of the object separate from starting
-or finishing compression of an image; the same object can be re-used for a
-series of image compression operations.  This makes it easy to re-use the
-same parameter settings for a sequence of images.  Re-use of a JPEG object
-also has important implications for processing abbreviated JPEG datastreams,
-as discussed later.
-
-The image data to be compressed is supplied to jpeg_write_scanlines() from
-in-memory buffers.  If the application is doing file-to-file compression,
-reading image data from the source file is the application's responsibility.
-The library emits compressed data by calling a "data destination manager",
-which typically will write the data into a file; but the application can
-provide its own destination manager to do something else.
-
-Similarly, the rough outline of a JPEG decompression operation is:
-
-	Allocate and initialize a JPEG decompression object
-	Specify the source of the compressed data (eg, a file)
-	Call jpeg_read_header() to obtain image info
-	Set parameters for decompression
-	jpeg_start_decompress(...);
-	while (scan lines remain to be read)
-		jpeg_read_scanlines(...);
-	jpeg_finish_decompress(...);
-	Release the JPEG decompression object
-
-This is comparable to the compression outline except that reading the
-datastream header is a separate step.  This is helpful because information
-about the image's size, colorspace, etc is available when the application
-selects decompression parameters.  For example, the application can choose an
-output scaling ratio that will fit the image into the available screen size.
-
-The decompression library obtains compressed data by calling a data source
-manager, which typically will read the data from a file; but other behaviors
-can be obtained with a custom source manager.  Decompressed data is delivered
-into in-memory buffers passed to jpeg_read_scanlines().
-
-It is possible to abort an incomplete compression or decompression operation
-by calling jpeg_abort(); or, if you do not need to retain the JPEG object,
-simply release it by calling jpeg_destroy().
-
-JPEG compression and decompression objects are two separate struct types.
-However, they share some common fields, and certain routines such as
-jpeg_destroy() can work on either type of object.
-
-The JPEG library has no static variables: all state is in the compression
-or decompression object.  Therefore it is possible to process multiple
-compression and decompression operations concurrently, using multiple JPEG
-objects.
-
-Both compression and decompression can be done in an incremental memory-to-
-memory fashion, if suitable source/destination managers are used.  See the
-section on "I/O suspension" for more details.
-
-
-BASIC LIBRARY USAGE
-===================
-
-Data formats
-------------
-
-Before diving into procedural details, it is helpful to understand the
-image data format that the JPEG library expects or returns.
-
-The standard input image format is a rectangular array of pixels, with each
-pixel having the same number of "component" or "sample" values (color
-channels).  You must specify how many components there are and the colorspace
-interpretation of the components.  Most applications will use RGB data
-(three components per pixel) or grayscale data (one component per pixel).
-PLEASE NOTE THAT RGB DATA IS THREE SAMPLES PER PIXEL, GRAYSCALE ONLY ONE.
-A remarkable number of people manage to miss this, only to find that their
-programs don't work with grayscale JPEG files.
-
-There is no provision for colormapped input.  JPEG files are always full-color
-or full grayscale (or sometimes another colorspace such as CMYK).  You can
-feed in a colormapped image by expanding it to full-color format.  However
-JPEG often doesn't work very well with source data that has been colormapped,
-because of dithering noise.  This is discussed in more detail in the JPEG FAQ
-and the other references mentioned in the README file.
-
-Pixels are stored by scanlines, with each scanline running from left to
-right.  The component values for each pixel are adjacent in the row; for
-example, R,G,B,R,G,B,R,G,B,... for 24-bit RGB color.  Each scanline is an
-array of data type JSAMPLE --- which is typically "unsigned char", unless
-you've changed jmorecfg.h.  (You can also change the RGB pixel layout, say
-to B,G,R order, by modifying jmorecfg.h.  But see the restrictions listed in
-that file before doing so.)
-
-A 2-D array of pixels is formed by making a list of pointers to the starts of
-scanlines; so the scanlines need not be physically adjacent in memory.  Even
-if you process just one scanline at a time, you must make a one-element
-pointer array to conform to this structure.  Pointers to JSAMPLE rows are of
-type JSAMPROW, and the pointer to the pointer array is of type JSAMPARRAY.
-
-The library accepts or supplies one or more complete scanlines per call.
-It is not possible to process part of a row at a time.  Scanlines are always
-processed top-to-bottom.  You can process an entire image in one call if you
-have it all in memory, but usually it's simplest to process one scanline at
-a time.
-
-For best results, source data values should have the precision specified by
-BITS_IN_JSAMPLE (normally 8 bits).  For instance, if you choose to compress
-data that's only 6 bits/channel, you should left-justify each value in a
-byte before passing it to the compressor.  If you need to compress data
-that has more than 8 bits/channel, compile with BITS_IN_JSAMPLE = 12.
-(See "Library compile-time options", later.)
-
-
-The data format returned by the decompressor is the same in all details,
-except that colormapped output is supported.  (Again, a JPEG file is never
-colormapped.  But you can ask the decompressor to perform on-the-fly color
-quantization to deliver colormapped output.)  If you request colormapped
-output then the returned data array contains a single JSAMPLE per pixel;
-its value is an index into a color map.  The color map is represented as
-a 2-D JSAMPARRAY in which each row holds the values of one color component,
-that is, colormap[i][j] is the value of the i'th color component for pixel
-value (map index) j.  Note that since the colormap indexes are stored in
-JSAMPLEs, the maximum number of colors is limited by the size of JSAMPLE
-(ie, at most 256 colors for an 8-bit JPEG library).
-
-
-Compression details
--------------------
-
-Here we revisit the JPEG compression outline given in the overview.
-
-1. Allocate and initialize a JPEG compression object.
-
-A JPEG compression object is a "struct jpeg_compress_struct".  (It also has
-a bunch of subsidiary structures which are allocated via malloc(), but the
-application doesn't control those directly.)  This struct can be just a local
-variable in the calling routine, if a single routine is going to execute the
-whole JPEG compression sequence.  Otherwise it can be static or allocated
-from malloc().
-
-You will also need a structure representing a JPEG error handler.  The part
-of this that the library cares about is a "struct jpeg_error_mgr".  If you
-are providing your own error handler, you'll typically want to embed the
-jpeg_error_mgr struct in a larger structure; this is discussed later under
-"Error handling".  For now we'll assume you are just using the default error
-handler.  The default error handler will print JPEG error/warning messages
-on stderr, and it will call exit() if a fatal error occurs.
-
-You must initialize the error handler structure, store a pointer to it into
-the JPEG object's "err" field, and then call jpeg_create_compress() to
-initialize the rest of the JPEG object.
-
-Typical code for this step, if you are using the default error handler, is
-
-	struct jpeg_compress_struct cinfo;
-	struct jpeg_error_mgr jerr;
-	...
-	cinfo.err = jpeg_std_error(&jerr);
-	jpeg_create_compress(&cinfo);
-
-jpeg_create_compress allocates a small amount of memory, so it could fail
-if you are out of memory.  In that case it will exit via the error handler;
-that's why the error handler must be initialized first.
-
-
-2. Specify the destination for the compressed data (eg, a file).
-
-As previously mentioned, the JPEG library delivers compressed data to a
-"data destination" module.  The library includes one data destination
-module which knows how to write to a stdio stream.  You can use your own
-destination module if you want to do something else, as discussed later.
-
-If you use the standard destination module, you must open the target stdio
-stream beforehand.  Typical code for this step looks like:
-
-	FILE * outfile;
-	...
-	if ((outfile = fopen(filename, "wb")) == NULL) {
-	    fprintf(stderr, "can't open %s\n", filename);
-	    exit(1);
-	}
-	jpeg_stdio_dest(&cinfo, outfile);
-
-where the last line invokes the standard destination module.
-
-WARNING: it is critical that the binary compressed data be delivered to the
-output file unchanged.  On non-Unix systems the stdio library may perform
-newline translation or otherwise corrupt binary data.  To suppress this
-behavior, you may need to use a "b" option to fopen (as shown above), or use
-setmode() or another routine to put the stdio stream in binary mode.  See
-cjpeg.c and djpeg.c for code that has been found to work on many systems.
-
-You can select the data destination after setting other parameters (step 3),
-if that's more convenient.  You may not change the destination between
-calling jpeg_start_compress() and jpeg_finish_compress().
-
-
-3. Set parameters for compression, including image size & colorspace.
-
-You must supply information about the source image by setting the following
-fields in the JPEG object (cinfo structure):
-
-	image_width		Width of image, in pixels
-	image_height		Height of image, in pixels
-	input_components	Number of color channels (samples per pixel)
-	in_color_space		Color space of source image
-
-The image dimensions are, hopefully, obvious.  JPEG supports image dimensions
-of 1 to 64K pixels in either direction.  The input color space is typically
-RGB or grayscale, and input_components is 3 or 1 accordingly.  (See "Special
-color spaces", later, for more info.)  The in_color_space field must be
-assigned one of the J_COLOR_SPACE enum constants, typically JCS_RGB or
-JCS_GRAYSCALE.
-
-JPEG has a large number of compression parameters that determine how the
-image is encoded.  Most applications don't need or want to know about all
-these parameters.  You can set all the parameters to reasonable defaults by
-calling jpeg_set_defaults(); then, if there are particular values you want
-to change, you can do so after that.  The "Compression parameter selection"
-section tells about all the parameters.
-
-You must set in_color_space correctly before calling jpeg_set_defaults(),
-because the defaults depend on the source image colorspace.  However the
-other three source image parameters need not be valid until you call
-jpeg_start_compress().  There's no harm in calling jpeg_set_defaults() more
-than once, if that happens to be convenient.
-
-Typical code for a 24-bit RGB source image is
-
-	cinfo.image_width = Width; 	/* image width and height, in pixels */
-	cinfo.image_height = Height;
-	cinfo.input_components = 3;	/* # of color components per pixel */
-	cinfo.in_color_space = JCS_RGB; /* colorspace of input image */
-
-	jpeg_set_defaults(&cinfo);
-	/* Make optional parameter settings here */
-
-
-4. jpeg_start_compress(...);
-
-After you have established the data destination and set all the necessary
-source image info and other parameters, call jpeg_start_compress() to begin
-a compression cycle.  This will initialize internal state, allocate working
-storage, and emit the first few bytes of the JPEG datastream header.
-
-Typical code:
-
-	jpeg_start_compress(&cinfo, TRUE);
-
-The "TRUE" parameter ensures that a complete JPEG interchange datastream
-will be written.  This is appropriate in most cases.  If you think you might
-want to use an abbreviated datastream, read the section on abbreviated
-datastreams, below.
-
-Once you have called jpeg_start_compress(), you may not alter any JPEG
-parameters or other fields of the JPEG object until you have completed
-the compression cycle.
-
-
-5. while (scan lines remain to be written)
-	jpeg_write_scanlines(...);
-
-Now write all the required image data by calling jpeg_write_scanlines()
-one or more times.  You can pass one or more scanlines in each call, up
-to the total image height.  In most applications it is convenient to pass
-just one or a few scanlines at a time.  The expected format for the passed
-data is discussed under "Data formats", above.
-
-Image data should be written in top-to-bottom scanline order.  The JPEG spec
-contains some weasel wording about how top and bottom are application-defined
-terms (a curious interpretation of the English language...) but if you want
-your files to be compatible with everyone else's, you WILL use top-to-bottom
-order.  If the source data must be read in bottom-to-top order, you can use
-the JPEG library's virtual array mechanism to invert the data efficiently.
-Examples of this can be found in the sample application cjpeg.
-
-The library maintains a count of the number of scanlines written so far
-in the next_scanline field of the JPEG object.  Usually you can just use
-this variable as the loop counter, so that the loop test looks like
-"while (cinfo.next_scanline < cinfo.image_height)".
-
-Code for this step depends heavily on the way that you store the source data.
-example.c shows the following code for the case of a full-size 2-D source
-array containing 3-byte RGB pixels:
-
-	JSAMPROW row_pointer[1];	/* pointer to a single row */
-	int row_stride;			/* physical row width in buffer */
-
-	row_stride = image_width * 3;	/* JSAMPLEs per row in image_buffer */
-
-	while (cinfo.next_scanline < cinfo.image_height) {
-	    row_pointer[0] = & image_buffer[cinfo.next_scanline * row_stride];
-	    jpeg_write_scanlines(&cinfo, row_pointer, 1);
-	}
-
-jpeg_write_scanlines() returns the number of scanlines actually written.
-This will normally be equal to the number passed in, so you can usually
-ignore the return value.  It is different in just two cases:
-  * If you try to write more scanlines than the declared image height,
-    the additional scanlines are ignored.
-  * If you use a suspending data destination manager, output buffer overrun
-    will cause the compressor to return before accepting all the passed lines.
-    This feature is discussed under "I/O suspension", below.  The normal
-    stdio destination manager will NOT cause this to happen.
-In any case, the return value is the same as the change in the value of
-next_scanline.
-
-
-6. jpeg_finish_compress(...);
-
-After all the image data has been written, call jpeg_finish_compress() to
-complete the compression cycle.  This step is ESSENTIAL to ensure that the
-last bufferload of data is written to the data destination.
-jpeg_finish_compress() also releases working memory associated with the JPEG
-object.
-
-Typical code:
-
-	jpeg_finish_compress(&cinfo);
-
-If using the stdio destination manager, don't forget to close the output
-stdio stream (if necessary) afterwards.
-
-If you have requested a multi-pass operating mode, such as Huffman code
-optimization, jpeg_finish_compress() will perform the additional passes using
-data buffered by the first pass.  In this case jpeg_finish_compress() may take
-quite a while to complete.  With the default compression parameters, this will
-not happen.
-
-It is an error to call jpeg_finish_compress() before writing the necessary
-total number of scanlines.  If you wish to abort compression, call
-jpeg_abort() as discussed below.
-
-After completing a compression cycle, you may dispose of the JPEG object
-as discussed next, or you may use it to compress another image.  In that case
-return to step 2, 3, or 4 as appropriate.  If you do not change the
-destination manager, the new datastream will be written to the same target.
-If you do not change any JPEG parameters, the new datastream will be written
-with the same parameters as before.  Note that you can change the input image
-dimensions freely between cycles, but if you change the input colorspace, you
-should call jpeg_set_defaults() to adjust for the new colorspace; and then
-you'll need to repeat all of step 3.
-
-
-7. Release the JPEG compression object.
-
-When you are done with a JPEG compression object, destroy it by calling
-jpeg_destroy_compress().  This will free all subsidiary memory (regardless of
-the previous state of the object).  Or you can call jpeg_destroy(), which
-works for either compression or decompression objects --- this may be more
-convenient if you are sharing code between compression and decompression
-cases.  (Actually, these routines are equivalent except for the declared type
-of the passed pointer.  To avoid gripes from ANSI C compilers, jpeg_destroy()
-should be passed a j_common_ptr.)
-
-If you allocated the jpeg_compress_struct structure from malloc(), freeing
-it is your responsibility --- jpeg_destroy() won't.  Ditto for the error
-handler structure.
-
-Typical code:
-
-	jpeg_destroy_compress(&cinfo);
-
-
-8. Aborting.
-
-If you decide to abort a compression cycle before finishing, you can clean up
-in either of two ways:
-
-* If you don't need the JPEG object any more, just call
-  jpeg_destroy_compress() or jpeg_destroy() to release memory.  This is
-  legitimate at any point after calling jpeg_create_compress() --- in fact,
-  it's safe even if jpeg_create_compress() fails.
-
-* If you want to re-use the JPEG object, call jpeg_abort_compress(), or call
-  jpeg_abort() which works on both compression and decompression objects.
-  This will return the object to an idle state, releasing any working memory.
-  jpeg_abort() is allowed at any time after successful object creation.
-
-Note that cleaning up the data destination, if required, is your
-responsibility; neither of these routines will call term_destination().
-(See "Compressed data handling", below, for more about that.)
-
-jpeg_destroy() and jpeg_abort() are the only safe calls to make on a JPEG
-object that has reported an error by calling error_exit (see "Error handling"
-for more info).  The internal state of such an object is likely to be out of
-whack.  Either of these two routines will return the object to a known state.
-
-
-Decompression details
----------------------
-
-Here we revisit the JPEG decompression outline given in the overview.
-
-1. Allocate and initialize a JPEG decompression object.
-
-This is just like initialization for compression, as discussed above,
-except that the object is a "struct jpeg_decompress_struct" and you
-call jpeg_create_decompress().  Error handling is exactly the same.
-
-Typical code:
-
-	struct jpeg_decompress_struct cinfo;
-	struct jpeg_error_mgr jerr;
-	...
-	cinfo.err = jpeg_std_error(&jerr);
-	jpeg_create_decompress(&cinfo);
-
-(Both here and in the IJG code, we usually use variable name "cinfo" for
-both compression and decompression objects.)
-
-
-2. Specify the source of the compressed data (eg, a file).
-
-As previously mentioned, the JPEG library reads compressed data from a "data
-source" module.  The library includes one data source module which knows how
-to read from a stdio stream.  You can use your own source module if you want
-to do something else, as discussed later.
-
-If you use the standard source module, you must open the source stdio stream
-beforehand.  Typical code for this step looks like:
-
-	FILE * infile;
-	...
-	if ((infile = fopen(filename, "rb")) == NULL) {
-	    fprintf(stderr, "can't open %s\n", filename);
-	    exit(1);
-	}
-	jpeg_stdio_src(&cinfo, infile);
-
-where the last line invokes the standard source module.
-
-WARNING: it is critical that the binary compressed data be read unchanged.
-On non-Unix systems the stdio library may perform newline translation or
-otherwise corrupt binary data.  To suppress this behavior, you may need to use
-a "b" option to fopen (as shown above), or use setmode() or another routine to
-put the stdio stream in binary mode.  See cjpeg.c and djpeg.c for code that
-has been found to work on many systems.
-
-You may not change the data source between calling jpeg_read_header() and
-jpeg_finish_decompress().  If you wish to read a series of JPEG images from
-a single source file, you should repeat the jpeg_read_header() to
-jpeg_finish_decompress() sequence without reinitializing either the JPEG
-object or the data source module; this prevents buffered input data from
-being discarded.
-
-
-3. Call jpeg_read_header() to obtain image info.
-
-Typical code for this step is just
-
-	jpeg_read_header(&cinfo, TRUE);
-
-This will read the source datastream header markers, up to the beginning
-of the compressed data proper.  On return, the image dimensions and other
-info have been stored in the JPEG object.  The application may wish to
-consult this information before selecting decompression parameters.
-
-More complex code is necessary if
-  * A suspending data source is used --- in that case jpeg_read_header()
-    may return before it has read all the header data.  See "I/O suspension",
-    below.  The normal stdio source manager will NOT cause this to happen.
-  * Abbreviated JPEG files are to be processed --- see the section on
-    abbreviated datastreams.  Standard applications that deal only in
-    interchange JPEG files need not be concerned with this case either.
-
-It is permissible to stop at this point if you just wanted to find out the
-image dimensions and other header info for a JPEG file.  In that case,
-call jpeg_destroy() when you are done with the JPEG object, or call
-jpeg_abort() to return it to an idle state before selecting a new data
-source and reading another header.
-
-
-4. Set parameters for decompression.
-
-jpeg_read_header() sets appropriate default decompression parameters based on
-the properties of the image (in particular, its colorspace).  However, you
-may well want to alter these defaults before beginning the decompression.
-For example, the default is to produce full color output from a color file.
-If you want colormapped output you must ask for it.  Other options allow the
-returned image to be scaled and allow various speed/quality tradeoffs to be
-selected.  "Decompression parameter selection", below, gives details.
-
-If the defaults are appropriate, nothing need be done at this step.
-
-Note that all default values are set by each call to jpeg_read_header().
-If you reuse a decompression object, you cannot expect your parameter
-settings to be preserved across cycles, as you can for compression.
-You must set desired parameter values each time.
-
-
-5. jpeg_start_decompress(...);
-
-Once the parameter values are satisfactory, call jpeg_start_decompress() to
-begin decompression.  This will initialize internal state, allocate working
-memory, and prepare for returning data.
-
-Typical code is just
-
-	jpeg_start_decompress(&cinfo);
-
-If you have requested a multi-pass operating mode, such as 2-pass color
-quantization, jpeg_start_decompress() will do everything needed before data
-output can begin.  In this case jpeg_start_decompress() may take quite a while
-to complete.  With a single-scan (non progressive) JPEG file and default
-decompression parameters, this will not happen; jpeg_start_decompress() will
-return quickly.
-
-After this call, the final output image dimensions, including any requested
-scaling, are available in the JPEG object; so is the selected colormap, if
-colormapped output has been requested.  Useful fields include
-
-	output_width		image width and height, as scaled
-	output_height
-	out_color_components	# of color components in out_color_space
-	output_components	# of color components returned per pixel
-	colormap		the selected colormap, if any
-	actual_number_of_colors		number of entries in colormap
-
-output_components is 1 (a colormap index) when quantizing colors; otherwise it
-equals out_color_components.  It is the number of JSAMPLE values that will be
-emitted per pixel in the output arrays.
-
-Typically you will need to allocate data buffers to hold the incoming image.
-You will need output_width * output_components JSAMPLEs per scanline in your
-output buffer, and a total of output_height scanlines will be returned.
-
-Note: if you are using the JPEG library's internal memory manager to allocate
-data buffers (as djpeg does), then the manager's protocol requires that you
-request large buffers *before* calling jpeg_start_decompress().  This is a
-little tricky since the output_XXX fields are not normally valid then.  You
-can make them valid by calling jpeg_calc_output_dimensions() after setting the
-relevant parameters (scaling, output color space, and quantization flag).
-
-
-6. while (scan lines remain to be read)
-	jpeg_read_scanlines(...);
-
-Now you can read the decompressed image data by calling jpeg_read_scanlines()
-one or more times.  At each call, you pass in the maximum number of scanlines
-to be read (ie, the height of your working buffer); jpeg_read_scanlines()
-will return up to that many lines.  The return value is the number of lines
-actually read.  The format of the returned data is discussed under "Data
-formats", above.  Don't forget that grayscale and color JPEGs will return
-different data formats!
-
-Image data is returned in top-to-bottom scanline order.  If you must write
-out the image in bottom-to-top order, you can use the JPEG library's virtual
-array mechanism to invert the data efficiently.  Examples of this can be
-found in the sample application djpeg.
-
-The library maintains a count of the number of scanlines returned so far
-in the output_scanline field of the JPEG object.  Usually you can just use
-this variable as the loop counter, so that the loop test looks like
-"while (cinfo.output_scanline < cinfo.output_height)".  (Note that the test
-should NOT be against image_height, unless you never use scaling.  The
-image_height field is the height of the original unscaled image.)
-The return value always equals the change in the value of output_scanline.
-
-If you don't use a suspending data source, it is safe to assume that
-jpeg_read_scanlines() reads at least one scanline per call, until the
-bottom of the image has been reached.
-
-If you use a buffer larger than one scanline, it is NOT safe to assume that
-jpeg_read_scanlines() fills it.  (The current implementation returns only a
-few scanlines per call, no matter how large a buffer you pass.)  So you must
-always provide a loop that calls jpeg_read_scanlines() repeatedly until the
-whole image has been read.
-
-
-7. jpeg_finish_decompress(...);
-
-After all the image data has been read, call jpeg_finish_decompress() to
-complete the decompression cycle.  This causes working memory associated
-with the JPEG object to be released.
-
-Typical code:
-
-	jpeg_finish_decompress(&cinfo);
-
-If using the stdio source manager, don't forget to close the source stdio
-stream if necessary.
-
-It is an error to call jpeg_finish_decompress() before reading the correct
-total number of scanlines.  If you wish to abort decompression, call
-jpeg_abort() as discussed below.
-
-After completing a decompression cycle, you may dispose of the JPEG object as
-discussed next, or you may use it to decompress another image.  In that case
-return to step 2 or 3 as appropriate.  If you do not change the source
-manager, the next image will be read from the same source.
-
-
-8. Release the JPEG decompression object.
-
-When you are done with a JPEG decompression object, destroy it by calling
-jpeg_destroy_decompress() or jpeg_destroy().  The previous discussion of
-destroying compression objects applies here too.
-
-Typical code:
-
-	jpeg_destroy_decompress(&cinfo);
-
-
-9. Aborting.
-
-You can abort a decompression cycle by calling jpeg_destroy_decompress() or
-jpeg_destroy() if you don't need the JPEG object any more, or
-jpeg_abort_decompress() or jpeg_abort() if you want to reuse the object.
-The previous discussion of aborting compression cycles applies here too.
-
-
-Mechanics of usage: include files, linking, etc
------------------------------------------------
-
-Applications using the JPEG library should include the header file jpeglib.h
-to obtain declarations of data types and routines.  Before including
-jpeglib.h, include system headers that define at least the typedefs FILE and
-size_t.  On ANSI-conforming systems, including <stdio.h> is sufficient; on
-older Unix systems, you may need <sys/types.h> to define size_t.
-
-If the application needs to refer to individual JPEG library error codes, also
-include jerror.h to define those symbols.
-
-jpeglib.h indirectly includes the files jconfig.h and jmorecfg.h.  If you are
-installing the JPEG header files in a system directory, you will want to
-install all four files: jpeglib.h, jerror.h, jconfig.h, jmorecfg.h.
-
-The most convenient way to include the JPEG code into your executable program
-is to prepare a library file ("libjpeg.a", or a corresponding name on non-Unix
-machines) and reference it at your link step.  If you use only half of the
-library (only compression or only decompression), only that much code will be
-included from the library, unless your linker is hopelessly brain-damaged.
-The supplied makefiles build libjpeg.a automatically (see install.doc).
-
-While you can build the JPEG library as a shared library if the whim strikes
-you, we don't really recommend it.  The trouble with shared libraries is that
-at some point you'll probably try to substitute a new version of the library
-without recompiling the calling applications.  That generally doesn't work
-because the parameter struct declarations usually change with each new
-version.  In other words, the library's API is *not* guaranteed binary
-compatible across versions; we only try to ensure source-code compatibility.
-(In hindsight, it might have been smarter to hide the parameter structs from
-applications and introduce a ton of access functions instead.  Too late now,
-however.)
-
-On some systems your application may need to set up a signal handler to ensure
-that temporary files are deleted if the program is interrupted.  This is most
-critical if you are on MS-DOS and use the jmemdos.c memory manager back end;
-it will try to grab extended memory for temp files, and that space will NOT be
-freed automatically.  See cjpeg.c or djpeg.c for an example signal handler.
-
-It may be worth pointing out that the core JPEG library does not actually
-require the stdio library: only the default source/destination managers and
-error handler need it.  You can use the library in a stdio-less environment
-if you replace those modules and use jmemnobs.c (or another memory manager of
-your own devising).  More info about the minimum system library requirements
-may be found in jinclude.h.
-
-
-ADVANCED FEATURES
-=================
-
-Compression parameter selection
--------------------------------
-
-This section describes all the optional parameters you can set for JPEG
-compression, as well as the "helper" routines provided to assist in this
-task.  Proper setting of some parameters requires detailed understanding
-of the JPEG standard; if you don't know what a parameter is for, it's best
-not to mess with it!  See REFERENCES in the README file for pointers to
-more info about JPEG.
-
-It's a good idea to call jpeg_set_defaults() first, even if you plan to set
-all the parameters; that way your code is more likely to work with future JPEG
-libraries that have additional parameters.  For the same reason, we recommend
-you use a helper routine where one is provided, in preference to twiddling
-cinfo fields directly.
-
-The helper routines are:
-
-jpeg_set_defaults (j_compress_ptr cinfo)
-	This routine sets all JPEG parameters to reasonable defaults, using
-	only the input image's color space (field in_color_space, which must
-	already be set in cinfo).  Many applications will only need to use
-	this routine and perhaps jpeg_set_quality().
-
-jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
-	Sets the JPEG file's colorspace (field jpeg_color_space) as specified,
-	and sets other color-space-dependent parameters appropriately.  See
-	"Special color spaces", below, before using this.  A large number of
-	parameters, including all per-component parameters, are set by this
-	routine; if you want to twiddle individual parameters you should call
-	jpeg_set_colorspace() before rather than after.
-
-jpeg_default_colorspace (j_compress_ptr cinfo)
-	Selects an appropriate JPEG colorspace based on cinfo->in_color_space,
-	and calls jpeg_set_colorspace().  This is actually a subroutine of
-	jpeg_set_defaults().  It's broken out in case you want to change
-	just the colorspace-dependent JPEG parameters.
-
-jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
-	Constructs JPEG quantization tables appropriate for the indicated
-	quality setting.  The quality value is expressed on the 0..100 scale
-	recommended by IJG (cjpeg's "-quality" switch uses this routine).
-	Note that the exact mapping from quality values to tables may change
-	in future IJG releases as more is learned about DCT quantization.
-	If the force_baseline parameter is TRUE, then the quantization table
-	entries are constrained to the range 1..255 for full JPEG baseline
-	compatibility.  In the current implementation, this only makes a
-	difference for quality settings below 25, and it effectively prevents
-	very small/low quality files from being generated.  The IJG decoder
-	is capable of reading the non-baseline files generated at low quality
-	settings when force_baseline is FALSE, but other decoders may not be.
-
-jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
-			 boolean force_baseline)
-	Same as jpeg_set_quality() except that the generated tables are the
-	sample tables given in the JPEG spec section K.1, multiplied by the
-	specified scale factor (which is expressed as a percentage; thus
-	scale_factor = 100 reproduces the spec's tables).  Note that larger
-	scale factors give lower quality.  This entry point is useful for
-	conforming to the Adobe PostScript DCT conventions, but we do not
-	recommend linear scaling as a user-visible quality scale otherwise.
-	force_baseline again constrains the computed table entries to 1..255.
-
-int jpeg_quality_scaling (int quality)
-	Converts a value on the IJG-recommended quality scale to a linear
-	scaling percentage.  Note that this routine may change or go away
-	in future releases --- IJG may choose to adopt a scaling method that
-	can't be expressed as a simple scalar multiplier, in which case the
-	premise of this routine collapses.  Caveat user.
-
-jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
-		      const unsigned int *basic_table,
-		      int scale_factor, boolean force_baseline)
-	Allows an arbitrary quantization table to be created.  which_tbl
-	indicates which table slot to fill.  basic_table points to an array
-	of 64 unsigned ints given in normal array order.  These values are
-	multiplied by scale_factor/100 and then clamped to the range 1..65535
-	(or to 1..255 if force_baseline is TRUE).
-	CAUTION: prior to library version 6a, jpeg_add_quant_table expected
-	the basic table to be given in JPEG zigzag order.  If you need to
-	write code that works with either older or newer versions of this
-	routine, you must check the library version number.  Something like
-	"#if JPEG_LIB_VERSION >= 61" is the right test.
-
-jpeg_simple_progression (j_compress_ptr cinfo)
-	Generates a default scan script for writing a progressive-JPEG file.
-	This is the recommended method of creating a progressive file,
-	unless you want to make a custom scan sequence.  You must ensure that
-	the JPEG color space is set correctly before calling this routine.
-
-
-Compression parameters (cinfo fields) include:
-
-J_DCT_METHOD dct_method
-	Selects the algorithm used for the DCT step.  Choices are:
-		JDCT_ISLOW: slow but accurate integer algorithm
-		JDCT_IFAST: faster, less accurate integer method
-		JDCT_FLOAT: floating-point method
-		JDCT_DEFAULT: default method (normally JDCT_ISLOW)
-		JDCT_FASTEST: fastest method (normally JDCT_IFAST)
-	The FLOAT method is very slightly more accurate than the ISLOW method,
-	but may give different results on different machines due to varying
-	roundoff behavior.  The integer methods should give the same results
-	on all machines.  On machines with sufficiently fast FP hardware, the
-	floating-point method may also be the fastest.  The IFAST method is
-	considerably less accurate than the other two; its use is not
-	recommended if high quality is a concern.  JDCT_DEFAULT and
-	JDCT_FASTEST are macros configurable by each installation.
-
-J_COLOR_SPACE jpeg_color_space
-int num_components
-	The JPEG color space and corresponding number of components; see
-	"Special color spaces", below, for more info.  We recommend using
-	jpeg_set_colorspace() if you want to change these.
-
-boolean optimize_coding
-	TRUE causes the compressor to compute optimal Huffman coding tables
-	for the image.  This requires an extra pass over the data and
-	therefore costs a good deal of space and time.  The default is
-	FALSE, which tells the compressor to use the supplied or default
-	Huffman tables.  In most cases optimal tables save only a few percent
-	of file size compared to the default tables.  Note that when this is
-	TRUE, you need not supply Huffman tables at all, and any you do
-	supply will be overwritten.
-
-unsigned int restart_interval
-int restart_in_rows
-	To emit restart markers in the JPEG file, set one of these nonzero.
-	Set restart_interval to specify the exact interval in MCU blocks.
-	Set restart_in_rows to specify the interval in MCU rows.  (If
-	restart_in_rows is not 0, then restart_interval is set after the
-	image width in MCUs is computed.)  Defaults are zero (no restarts).
-	One restart marker per MCU row is often a good choice.
-	NOTE: the overhead of restart markers is higher in grayscale JPEG
-	files than in color files, and MUCH higher in progressive JPEGs.
-	If you use restarts, you may want to use larger intervals in those
-	cases.
-
-const jpeg_scan_info * scan_info
-int num_scans
-	By default, scan_info is NULL; this causes the compressor to write a
-	single-scan sequential JPEG file.  If not NULL, scan_info points to
-	an array of scan definition records of length num_scans.  The
-	compressor will then write a JPEG file having one scan for each scan
-	definition record.  This is used to generate noninterleaved or
-	progressive JPEG files.  The library checks that the scan array
-	defines a valid JPEG scan sequence.  (jpeg_simple_progression creates
-	a suitable scan definition array for progressive JPEG.)  This is
-	discussed further under "Progressive JPEG support".
-
-int smoothing_factor
-	If non-zero, the input image is smoothed; the value should be 1 for
-	minimal smoothing to 100 for maximum smoothing.  Consult jcsample.c
-	for details of the smoothing algorithm.  The default is zero.
-
-boolean write_JFIF_header
-	If TRUE, a JFIF APP0 marker is emitted.  jpeg_set_defaults() and
-	jpeg_set_colorspace() set this TRUE if a JFIF-legal JPEG color space
-	(ie, YCbCr or grayscale) is selected, otherwise FALSE.
-
-UINT8 JFIF_major_version
-UINT8 JFIF_minor_version
-	The version number to be written into the JFIF marker.
-	jpeg_set_defaults() initializes the version to 1.01 (major=minor=1).
-	You should set it to 1.02 (major=1, minor=2) if you plan to write
-	any JFIF 1.02 extension markers.
-
-UINT8 density_unit
-UINT16 X_density
-UINT16 Y_density
-	The resolution information to be written into the JFIF marker;
-	not used otherwise.  density_unit may be 0 for unknown,
-	1 for dots/inch, or 2 for dots/cm.  The default values are 0,1,1
-	indicating square pixels of unknown size.
-
-boolean write_Adobe_marker
-	If TRUE, an Adobe APP14 marker is emitted.  jpeg_set_defaults() and
-	jpeg_set_colorspace() set this TRUE if JPEG color space RGB, CMYK,
-	or YCCK is selected, otherwise FALSE.  It is generally a bad idea
-	to set both write_JFIF_header and write_Adobe_marker.  In fact,
-	you probably shouldn't change the default settings at all --- the
-	default behavior ensures that the JPEG file's color space can be
-	recognized by the decoder.
-
-JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS]
-	Pointers to coefficient quantization tables, one per table slot,
-	or NULL if no table is defined for a slot.  Usually these should
-	be set via one of the above helper routines; jpeg_add_quant_table()
-	is general enough to define any quantization table.  The other
-	routines will set up table slot 0 for luminance quality and table
-	slot 1 for chrominance.
-
-JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS]
-JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS]
-	Pointers to Huffman coding tables, one per table slot, or NULL if
-	no table is defined for a slot.  Slots 0 and 1 are filled with the
-	JPEG sample tables by jpeg_set_defaults().  If you need to allocate
-	more table structures, jpeg_alloc_huff_table() may be used.
-	Note that optimal Huffman tables can be computed for an image
-	by setting optimize_coding, as discussed above; there's seldom
-	any need to mess with providing your own Huffman tables.
-
-There are some additional cinfo fields which are not documented here
-because you currently can't change them; for example, you can't set
-arith_code TRUE because arithmetic coding is unsupported.
-
-
-Per-component parameters are stored in the struct cinfo.comp_info[i] for
-component number i.  Note that components here refer to components of the
-JPEG color space, *not* the source image color space.  A suitably large
-comp_info[] array is allocated by jpeg_set_defaults(); if you choose not
-to use that routine, it's up to you to allocate the array.
-
-int component_id
-	The one-byte identifier code to be recorded in the JPEG file for
-	this component.  For the standard color spaces, we recommend you
-	leave the default values alone.
-
-int h_samp_factor
-int v_samp_factor
-	Horizontal and vertical sampling factors for the component; must
-	be 1..4 according to the JPEG standard.  Note that larger sampling
-	factors indicate a higher-resolution component; many people find
-	this behavior quite unintuitive.  The default values are 2,2 for
-	luminance components and 1,1 for chrominance components, except
-	for grayscale where 1,1 is used.
-
-int quant_tbl_no
-	Quantization table number for component.  The default value is
-	0 for luminance components and 1 for chrominance components.
-
-int dc_tbl_no
-int ac_tbl_no
-	DC and AC entropy coding table numbers.  The default values are
-	0 for luminance components and 1 for chrominance components.
-
-int component_index
-	Must equal the component's index in comp_info[].  (Beginning in
-	release v6, the compressor library will fill this in automatically;
-	you don't have to.)
-
-
-Decompression parameter selection
----------------------------------
-
-Decompression parameter selection is somewhat simpler than compression
-parameter selection, since all of the JPEG internal parameters are
-recorded in the source file and need not be supplied by the application.
-(Unless you are working with abbreviated files, in which case see
-"Abbreviated datastreams", below.)  Decompression parameters control
-the postprocessing done on the image to deliver it in a format suitable
-for the application's use.  Many of the parameters control speed/quality
-tradeoffs, in which faster decompression may be obtained at the price of
-a poorer-quality image.  The defaults select the highest quality (slowest)
-processing.
-
-The following fields in the JPEG object are set by jpeg_read_header() and
-may be useful to the application in choosing decompression parameters:
-
-JDIMENSION image_width			Width and height of image
-JDIMENSION image_height
-int num_components			Number of color components
-J_COLOR_SPACE jpeg_color_space		Colorspace of image
-boolean saw_JFIF_marker			TRUE if a JFIF APP0 marker was seen
-  UINT8 JFIF_major_version		Version information from JFIF marker
-  UINT8 JFIF_minor_version
-  UINT8 density_unit			Resolution data from JFIF marker
-  UINT16 X_density
-  UINT16 Y_density
-boolean saw_Adobe_marker		TRUE if an Adobe APP14 marker was seen
-  UINT8 Adobe_transform			Color transform code from Adobe marker
-
-The JPEG color space, unfortunately, is something of a guess since the JPEG
-standard proper does not provide a way to record it.  In practice most files
-adhere to the JFIF or Adobe conventions, and the decoder will recognize these
-correctly.  See "Special color spaces", below, for more info.
-
-
-The decompression parameters that determine the basic properties of the
-returned image are:
-
-J_COLOR_SPACE out_color_space
-	Output color space.  jpeg_read_header() sets an appropriate default
-	based on jpeg_color_space; typically it will be RGB or grayscale.
-	The application can change this field to request output in a different
-	colorspace.  For example, set it to JCS_GRAYSCALE to get grayscale
-	output from a color file.  (This is useful for previewing: grayscale
-	output is faster than full color since the color components need not
-	be processed.)  Note that not all possible color space transforms are
-	currently implemented; you may need to extend jdcolor.c if you want an
-	unusual conversion.
-
-unsigned int scale_num, scale_denom
-	Scale the image by the fraction scale_num/scale_denom.  Default is
-	1/1, or no scaling.  Currently, the only supported scaling ratios
-	are 1/1, 1/2, 1/4, and 1/8.  (The library design allows for arbitrary
-	scaling ratios but this is not likely to be implemented any time soon.)
-	Smaller scaling ratios permit significantly faster decoding since
-	fewer pixels need be processed and a simpler IDCT method can be used.
-
-boolean quantize_colors
-	If set TRUE, colormapped output will be delivered.  Default is FALSE,
-	meaning that full-color output will be delivered.
-
-The next three parameters are relevant only if quantize_colors is TRUE.
-
-int desired_number_of_colors
-	Maximum number of colors to use in generating a library-supplied color
-	map (the actual number of colors is returned in a different field).
-	Default 256.  Ignored when the application supplies its own color map.
-
-boolean two_pass_quantize
-	If TRUE, an extra pass over the image is made to select a custom color
-	map for the image.  This usually looks a lot better than the one-size-
-	fits-all colormap that is used otherwise.  Default is TRUE.  Ignored
-	when the application supplies its own color map.
-
-J_DITHER_MODE dither_mode
-	Selects color dithering method.  Supported values are:
-		JDITHER_NONE	no dithering: fast, very low quality
-		JDITHER_ORDERED	ordered dither: moderate speed and quality
-		JDITHER_FS	Floyd-Steinberg dither: slow, high quality
-	Default is JDITHER_FS.  (At present, ordered dither is implemented
-	only in the single-pass, standard-colormap case.  If you ask for
-	ordered dither when two_pass_quantize is TRUE or when you supply
-	an external color map, you'll get F-S dithering.)
-
-When quantize_colors is TRUE, the target color map is described by the next
-two fields.  colormap is set to NULL by jpeg_read_header().  The application
-can supply a color map by setting colormap non-NULL and setting
-actual_number_of_colors to the map size.  Otherwise, jpeg_start_decompress()
-selects a suitable color map and sets these two fields itself.
-[Implementation restriction: at present, an externally supplied colormap is
-only accepted for 3-component output color spaces.]
-
-JSAMPARRAY colormap
-	The color map, represented as a 2-D pixel array of out_color_components
-	rows and actual_number_of_colors columns.  Ignored if not quantizing.
-	CAUTION: if the JPEG library creates its own colormap, the storage
-	pointed to by this field is released by jpeg_finish_decompress().
-	Copy the colormap somewhere else first, if you want to save it.
-
-int actual_number_of_colors
-	The number of colors in the color map.
-
-Additional decompression parameters that the application may set include:
-
-J_DCT_METHOD dct_method
-	Selects the algorithm used for the DCT step.  Choices are the same
-	as described above for compression.
-
-boolean do_fancy_upsampling
-	If TRUE, do careful upsampling of chroma components.  If FALSE,
-	a faster but sloppier method is used.  Default is TRUE.  The visual
-	impact of the sloppier method is often very small.
-
-boolean do_block_smoothing
-	If TRUE, interblock smoothing is applied in early stages of decoding
-	progressive JPEG files; if FALSE, not.  Default is TRUE.  Early
-	progression stages look "fuzzy" with smoothing, "blocky" without.
-	In any case, block smoothing ceases to be applied after the first few
-	AC coefficients are known to full accuracy, so it is relevant only
-	when using buffered-image mode for progressive images.
-
-boolean enable_1pass_quant
-boolean enable_external_quant
-boolean enable_2pass_quant
-	These are significant only in buffered-image mode, which is
-	described in its own section below.
-
-
-The output image dimensions are given by the following fields.  These are
-computed from the source image dimensions and the decompression parameters
-by jpeg_start_decompress().  You can also call jpeg_calc_output_dimensions()
-to obtain the values that will result from the current parameter settings.
-This can be useful if you are trying to pick a scaling ratio that will get
-close to a desired target size.  It's also important if you are using the
-JPEG library's memory manager to allocate output buffer space, because you
-are supposed to request such buffers *before* jpeg_start_decompress().
-
-JDIMENSION output_width		Actual dimensions of output image.
-JDIMENSION output_height
-int out_color_components	Number of color components in out_color_space.
-int output_components		Number of color components returned.
-int rec_outbuf_height		Recommended height of scanline buffer.
-
-When quantizing colors, output_components is 1, indicating a single color map
-index per pixel.  Otherwise it equals out_color_components.  The output arrays
-are required to be output_width * output_components JSAMPLEs wide.
-
-rec_outbuf_height is the recommended minimum height (in scanlines) of the
-buffer passed to jpeg_read_scanlines().  If the buffer is smaller, the
-library will still work, but time will be wasted due to unnecessary data
-copying.  In high-quality modes, rec_outbuf_height is always 1, but some
-faster, lower-quality modes set it to larger values (typically 2 to 4).
-If you are going to ask for a high-speed processing mode, you may as well
-go to the trouble of honoring rec_outbuf_height so as to avoid data copying.
-(An output buffer larger than rec_outbuf_height lines is OK, but won't
-provide any material speed improvement over that height.)
-
-
-Special color spaces
---------------------
-
-The JPEG standard itself is "color blind" and doesn't specify any particular
-color space.  It is customary to convert color data to a luminance/chrominance
-color space before compressing, since this permits greater compression.  The
-existing de-facto JPEG file format standards specify YCbCr or grayscale data
-(JFIF), or grayscale, RGB, YCbCr, CMYK, or YCCK (Adobe).  For special
-applications such as multispectral images, other color spaces can be used,
-but it must be understood that such files will be unportable.
-
-The JPEG library can handle the most common colorspace conversions (namely
-RGB <=> YCbCr and CMYK <=> YCCK).  It can also deal with data of an unknown
-color space, passing it through without conversion.  If you deal extensively
-with an unusual color space, you can easily extend the library to understand
-additional color spaces and perform appropriate conversions.
-
-For compression, the source data's color space is specified by field
-in_color_space.  This is transformed to the JPEG file's color space given
-by jpeg_color_space.  jpeg_set_defaults() chooses a reasonable JPEG color
-space depending on in_color_space, but you can override this by calling
-jpeg_set_colorspace().  Of course you must select a supported transformation.
-jccolor.c currently supports the following transformations:
-	RGB => YCbCr
-	RGB => GRAYSCALE
-	YCbCr => GRAYSCALE
-	CMYK => YCCK
-plus the null transforms: GRAYSCALE => GRAYSCALE, RGB => RGB,
-YCbCr => YCbCr, CMYK => CMYK, YCCK => YCCK, and UNKNOWN => UNKNOWN.
-
-The de-facto file format standards (JFIF and Adobe) specify APPn markers that
-indicate the color space of the JPEG file.  It is important to ensure that
-these are written correctly, or omitted if the JPEG file's color space is not
-one of the ones supported by the de-facto standards.  jpeg_set_colorspace()
-will set the compression parameters to include or omit the APPn markers
-properly, so long as it is told the truth about the JPEG color space.
-For example, if you are writing some random 3-component color space without
-conversion, don't try to fake out the library by setting in_color_space and
-jpeg_color_space to JCS_YCbCr; use JCS_UNKNOWN.  You may want to write an
-APPn marker of your own devising to identify the colorspace --- see "Special
-markers", below.
-
-When told that the color space is UNKNOWN, the library will default to using
-luminance-quality compression parameters for all color components.  You may
-well want to change these parameters.  See the source code for
-jpeg_set_colorspace(), in jcparam.c, for details.
-
-For decompression, the JPEG file's color space is given in jpeg_color_space,
-and this is transformed to the output color space out_color_space.
-jpeg_read_header's setting of jpeg_color_space can be relied on if the file
-conforms to JFIF or Adobe conventions, but otherwise it is no better than a
-guess.  If you know the JPEG file's color space for certain, you can override
-jpeg_read_header's guess by setting jpeg_color_space.  jpeg_read_header also
-selects a default output color space based on (its guess of) jpeg_color_space;
-set out_color_space to override this.  Again, you must select a supported
-transformation.  jdcolor.c currently supports
-	YCbCr => GRAYSCALE
-	YCbCr => RGB
-	GRAYSCALE => RGB
-	YCCK => CMYK
-as well as the null transforms.  (Since GRAYSCALE=>RGB is provided, an
-application can force grayscale JPEGs to look like color JPEGs if it only
-wants to handle one case.)
-
-The two-pass color quantizer, jquant2.c, is specialized to handle RGB data
-(it weights distances appropriately for RGB colors).  You'll need to modify
-the code if you want to use it for non-RGB output color spaces.  Note that
-jquant2.c is used to map to an application-supplied colormap as well as for
-the normal two-pass colormap selection process.
-
-CAUTION: it appears that Adobe Photoshop writes inverted data in CMYK JPEG
-files: 0 represents 100% ink coverage, rather than 0% ink as you'd expect.
-This is arguably a bug in Photoshop, but if you need to work with Photoshop
-CMYK files, you will have to deal with it in your application.  We cannot
-"fix" this in the library by inverting the data during the CMYK<=>YCCK
-transform, because that would break other applications, notably Ghostscript.
-Photoshop versions prior to 3.0 write EPS files containing JPEG-encoded CMYK
-data in the same inverted-YCCK representation used in bare JPEG files, but
-the surrounding PostScript code performs an inversion using the PS image
-operator.  I am told that Photoshop 3.0 will write uninverted YCCK in
-EPS/JPEG files, and will omit the PS-level inversion.  (But the data
-polarity used in bare JPEG files will not change in 3.0.)  In either case,
-the JPEG library must not invert the data itself, or else Ghostscript would
-read these EPS files incorrectly.
-
-
-Error handling
---------------
-
-When the default error handler is used, any error detected inside the JPEG
-routines will cause a message to be printed on stderr, followed by exit().
-You can supply your own error handling routines to override this behavior
-and to control the treatment of nonfatal warnings and trace/debug messages.
-The file example.c illustrates the most common case, which is to have the
-application regain control after an error rather than exiting.
-
-The JPEG library never writes any message directly; it always goes through
-the error handling routines.  Three classes of messages are recognized:
-  * Fatal errors: the library cannot continue.
-  * Warnings: the library can continue, but the data is corrupt, and a
-    damaged output image is likely to result.
-  * Trace/informational messages.  These come with a trace level indicating
-    the importance of the message; you can control the verbosity of the
-    program by adjusting the maximum trace level that will be displayed.
-
-You may, if you wish, simply replace the entire JPEG error handling module
-(jerror.c) with your own code.  However, you can avoid code duplication by
-only replacing some of the routines depending on the behavior you need.
-This is accomplished by calling jpeg_std_error() as usual, but then overriding
-some of the method pointers in the jpeg_error_mgr struct, as illustrated by
-example.c.
-
-All of the error handling routines will receive a pointer to the JPEG object
-(a j_common_ptr which points to either a jpeg_compress_struct or a
-jpeg_decompress_struct; if you need to tell which, test the is_decompressor
-field).  This struct includes a pointer to the error manager struct in its
-"err" field.  Frequently, custom error handler routines will need to access
-additional data which is not known to the JPEG library or the standard error
-handler.  The most convenient way to do this is to embed either the JPEG
-object or the jpeg_error_mgr struct in a larger structure that contains
-additional fields; then casting the passed pointer provides access to the
-additional fields.  Again, see example.c for one way to do it.  (Beginning
-with IJG version 6b, there is also a void pointer "client_data" in each
-JPEG object, which the application can also use to find related data.
-The library does not touch client_data at all.)
-
-The individual methods that you might wish to override are:
-
-error_exit (j_common_ptr cinfo)
-	Receives control for a fatal error.  Information sufficient to
-	generate the error message has been stored in cinfo->err; call
-	output_message to display it.  Control must NOT return to the caller;
-	generally this routine will exit() or longjmp() somewhere.
-	Typically you would override this routine to get rid of the exit()
-	default behavior.  Note that if you continue processing, you should
-	clean up the JPEG object with jpeg_abort() or jpeg_destroy().
-
-output_message (j_common_ptr cinfo)
-	Actual output of any JPEG message.  Override this to send messages
-	somewhere other than stderr.  Note that this method does not know
-	how to generate a message, only where to send it.
-
-format_message (j_common_ptr cinfo, char * buffer)
-	Constructs a readable error message string based on the error info
-	stored in cinfo->err.  This method is called by output_message.  Few
-	applications should need to override this method.  One possible
-	reason for doing so is to implement dynamic switching of error message
-	language.
-
-emit_message (j_common_ptr cinfo, int msg_level)
-	Decide whether or not to emit a warning or trace message; if so,
-	calls output_message.  The main reason for overriding this method
-	would be to abort on warnings.  msg_level is -1 for warnings,
-	0 and up for trace messages.
-
-Only error_exit() and emit_message() are called from the rest of the JPEG
-library; the other two are internal to the error handler.
-
-The actual message texts are stored in an array of strings which is pointed to
-by the field err->jpeg_message_table.  The messages are numbered from 0 to
-err->last_jpeg_message, and it is these code numbers that are used in the
-JPEG library code.  You could replace the message texts (for instance, with
-messages in French or German) by changing the message table pointer.  See
-jerror.h for the default texts.  CAUTION: this table will almost certainly
-change or grow from one library version to the next.
-
-It may be useful for an application to add its own message texts that are
-handled by the same mechanism.  The error handler supports a second "add-on"
-message table for this purpose.  To define an addon table, set the pointer
-err->addon_message_table and the message numbers err->first_addon_message and
-err->last_addon_message.  If you number the addon messages beginning at 1000
-or so, you won't have to worry about conflicts with the library's built-in
-messages.  See the sample applications cjpeg/djpeg for an example of using
-addon messages (the addon messages are defined in cderror.h).
-
-Actual invocation of the error handler is done via macros defined in jerror.h:
-	ERREXITn(...)	for fatal errors
-	WARNMSn(...)	for corrupt-data warnings
-	TRACEMSn(...)	for trace and informational messages.
-These macros store the message code and any additional parameters into the
-error handler struct, then invoke the error_exit() or emit_message() method.
-The variants of each macro are for varying numbers of additional parameters.
-The additional parameters are inserted into the generated message using
-standard printf() format codes.
-
-See jerror.h and jerror.c for further details.
-
-
-Compressed data handling (source and destination managers)
-----------------------------------------------------------
-
-The JPEG compression library sends its compressed data to a "destination
-manager" module.  The default destination manager just writes the data to a
-stdio stream, but you can provide your own manager to do something else.
-Similarly, the decompression library calls a "source manager" to obtain the
-compressed data; you can provide your own source manager if you want the data
-to come from somewhere other than a stdio stream.
-
-In both cases, compressed data is processed a bufferload at a time: the
-destination or source manager provides a work buffer, and the library invokes
-the manager only when the buffer is filled or emptied.  (You could define a
-one-character buffer to force the manager to be invoked for each byte, but
-that would be rather inefficient.)  The buffer's size and location are
-controlled by the manager, not by the library.  For example, if you desired to
-decompress a JPEG datastream that was all in memory, you could just make the
-buffer pointer and length point to the original data in memory.  Then the
-buffer-reload procedure would be invoked only if the decompressor ran off the
-end of the datastream, which would indicate an erroneous datastream.
-
-The work buffer is defined as an array of datatype JOCTET, which is generally
-"char" or "unsigned char".  On a machine where char is not exactly 8 bits
-wide, you must define JOCTET as a wider data type and then modify the data
-source and destination modules to transcribe the work arrays into 8-bit units
-on external storage.
-
-A data destination manager struct contains a pointer and count defining the
-next byte to write in the work buffer and the remaining free space:
-
-	JOCTET * next_output_byte;  /* => next byte to write in buffer */
-	size_t free_in_buffer;      /* # of byte spaces remaining in buffer */
-
-The library increments the pointer and decrements the count until the buffer
-is filled.  The manager's empty_output_buffer method must reset the pointer
-and count.  The manager is expected to remember the buffer's starting address
-and total size in private fields not visible to the library.
-
-A data destination manager provides three methods:
-
-init_destination (j_compress_ptr cinfo)
-	Initialize destination.  This is called by jpeg_start_compress()
-	before any data is actually written.  It must initialize
-	next_output_byte and free_in_buffer.  free_in_buffer must be
-	initialized to a positive value.
-
-empty_output_buffer (j_compress_ptr cinfo)
-	This is called whenever the buffer has filled (free_in_buffer
-	reaches zero).  In typical applications, it should write out the
-	*entire* buffer (use the saved start address and buffer length;
-	ignore the current state of next_output_byte and free_in_buffer).
-	Then reset the pointer & count to the start of the buffer, and
-	return TRUE indicating that the buffer has been dumped.
-	free_in_buffer must be set to a positive value when TRUE is
-	returned.  A FALSE return should only be used when I/O suspension is
-	desired (this operating mode is discussed in the next section).
-
-term_destination (j_compress_ptr cinfo)
-	Terminate destination --- called by jpeg_finish_compress() after all
-	data has been written.  In most applications, this must flush any
-	data remaining in the buffer.  Use either next_output_byte or
-	free_in_buffer to determine how much data is in the buffer.
-
-term_destination() is NOT called by jpeg_abort() or jpeg_destroy().  If you
-want the destination manager to be cleaned up during an abort, you must do it
-yourself.
-
-You will also need code to create a jpeg_destination_mgr struct, fill in its
-method pointers, and insert a pointer to the struct into the "dest" field of
-the JPEG compression object.  This can be done in-line in your setup code if
-you like, but it's probably cleaner to provide a separate routine similar to
-the jpeg_stdio_dest() routine of the supplied destination manager.
-
-Decompression source managers follow a parallel design, but with some
-additional frammishes.  The source manager struct contains a pointer and count
-defining the next byte to read from the work buffer and the number of bytes
-remaining:
-
-	const JOCTET * next_input_byte; /* => next byte to read from buffer */
-	size_t bytes_in_buffer;         /* # of bytes remaining in buffer */
-
-The library increments the pointer and decrements the count until the buffer
-is emptied.  The manager's fill_input_buffer method must reset the pointer and
-count.  In most applications, the manager must remember the buffer's starting
-address and total size in private fields not visible to the library.
-
-A data source manager provides five methods:
-
-init_source (j_decompress_ptr cinfo)
-	Initialize source.  This is called by jpeg_read_header() before any
-	data is actually read.  Unlike init_destination(), it may leave
-	bytes_in_buffer set to 0 (in which case a fill_input_buffer() call
-	will occur immediately).
-
-fill_input_buffer (j_decompress_ptr cinfo)
-	This is called whenever bytes_in_buffer has reached zero and more
-	data is wanted.  In typical applications, it should read fresh data
-	into the buffer (ignoring the current state of next_input_byte and
-	bytes_in_buffer), reset the pointer & count to the start of the
-	buffer, and return TRUE indicating that the buffer has been reloaded.
-	It is not necessary to fill the buffer entirely, only to obtain at
-	least one more byte.  bytes_in_buffer MUST be set to a positive value
-	if TRUE is returned.  A FALSE return should only be used when I/O
-	suspension is desired (this mode is discussed in the next section).
-
-skip_input_data (j_decompress_ptr cinfo, long num_bytes)
-	Skip num_bytes worth of data.  The buffer pointer and count should
-	be advanced over num_bytes input bytes, refilling the buffer as
-	needed.  This is used to skip over a potentially large amount of
-	uninteresting data (such as an APPn marker).  In some applications
-	it may be possible to optimize away the reading of the skipped data,
-	but it's not clear that being smart is worth much trouble; large
-	skips are uncommon.  bytes_in_buffer may be zero on return.
-	A zero or negative skip count should be treated as a no-op.
-
-resync_to_restart (j_decompress_ptr cinfo, int desired)
-	This routine is called only when the decompressor has failed to find
-	a restart (RSTn) marker where one is expected.  Its mission is to
-	find a suitable point for resuming decompression.  For most
-	applications, we recommend that you just use the default resync
-	procedure, jpeg_resync_to_restart().  However, if you are able to back
-	up in the input data stream, or if you have a-priori knowledge about
-	the likely location of restart markers, you may be able to do better.
-	Read the read_restart_marker() and jpeg_resync_to_restart() routines
-	in jdmarker.c if you think you'd like to implement your own resync
-	procedure.
-
-term_source (j_decompress_ptr cinfo)
-	Terminate source --- called by jpeg_finish_decompress() after all
-	data has been read.  Often a no-op.
-
-For both fill_input_buffer() and skip_input_data(), there is no such thing
-as an EOF return.  If the end of the file has been reached, the routine has
-a choice of exiting via ERREXIT() or inserting fake data into the buffer.
-In most cases, generating a warning message and inserting a fake EOI marker
-is the best course of action --- this will allow the decompressor to output
-however much of the image is there.  In pathological cases, the decompressor
-may swallow the EOI and again demand data ... just keep feeding it fake EOIs.
-jdatasrc.c illustrates the recommended error recovery behavior.
-
-term_source() is NOT called by jpeg_abort() or jpeg_destroy().  If you want
-the source manager to be cleaned up during an abort, you must do it yourself.
-
-You will also need code to create a jpeg_source_mgr struct, fill in its method
-pointers, and insert a pointer to the struct into the "src" field of the JPEG
-decompression object.  This can be done in-line in your setup code if you
-like, but it's probably cleaner to provide a separate routine similar to the
-jpeg_stdio_src() routine of the supplied source manager.
-
-For more information, consult the stdio source and destination managers
-in jdatasrc.c and jdatadst.c.
-
-
-I/O suspension
---------------
-
-Some applications need to use the JPEG library as an incremental memory-to-
-memory filter: when the compressed data buffer is filled or emptied, they want
-control to return to the outer loop, rather than expecting that the buffer can
-be emptied or reloaded within the data source/destination manager subroutine.
-The library supports this need by providing an "I/O suspension" mode, which we
-describe in this section.
-
-The I/O suspension mode is not a panacea: nothing is guaranteed about the
-maximum amount of time spent in any one call to the library, so it will not
-eliminate response-time problems in single-threaded applications.  If you
-need guaranteed response time, we suggest you "bite the bullet" and implement
-a real multi-tasking capability.
-
-To use I/O suspension, cooperation is needed between the calling application
-and the data source or destination manager; you will always need a custom
-source/destination manager.  (Please read the previous section if you haven't
-already.)  The basic idea is that the empty_output_buffer() or
-fill_input_buffer() routine is a no-op, merely returning FALSE to indicate
-that it has done nothing.  Upon seeing this, the JPEG library suspends
-operation and returns to its caller.  The surrounding application is
-responsible for emptying or refilling the work buffer before calling the
-JPEG library again.
-
-Compression suspension:
-
-For compression suspension, use an empty_output_buffer() routine that returns
-FALSE; typically it will not do anything else.  This will cause the
-compressor to return to the caller of jpeg_write_scanlines(), with the return
-value indicating that not all the supplied scanlines have been accepted.
-The application must make more room in the output buffer, adjust the output
-buffer pointer/count appropriately, and then call jpeg_write_scanlines()
-again, pointing to the first unconsumed scanline.
-
-When forced to suspend, the compressor will backtrack to a convenient stopping
-point (usually the start of the current MCU); it will regenerate some output
-data when restarted.  Therefore, although empty_output_buffer() is only
-called when the buffer is filled, you should NOT write out the entire buffer
-after a suspension.  Write only the data up to the current position of
-next_output_byte/free_in_buffer.  The data beyond that point will be
-regenerated after resumption.
-
-Because of the backtracking behavior, a good-size output buffer is essential
-for efficiency; you don't want the compressor to suspend often.  (In fact, an
-overly small buffer could lead to infinite looping, if a single MCU required
-more data than would fit in the buffer.)  We recommend a buffer of at least
-several Kbytes.  You may want to insert explicit code to ensure that you don't
-call jpeg_write_scanlines() unless there is a reasonable amount of space in
-the output buffer; in other words, flush the buffer before trying to compress
-more data.
-
-The compressor does not allow suspension while it is trying to write JPEG
-markers at the beginning and end of the file.  This means that:
-  * At the beginning of a compression operation, there must be enough free
-    space in the output buffer to hold the header markers (typically 600 or
-    so bytes).  The recommended buffer size is bigger than this anyway, so
-    this is not a problem as long as you start with an empty buffer.  However,
-    this restriction might catch you if you insert large special markers, such
-    as a JFIF thumbnail image, without flushing the buffer afterwards.
-  * When you call jpeg_finish_compress(), there must be enough space in the
-    output buffer to emit any buffered data and the final EOI marker.  In the
-    current implementation, half a dozen bytes should suffice for this, but
-    for safety's sake we recommend ensuring that at least 100 bytes are free
-    before calling jpeg_finish_compress().
-
-A more significant restriction is that jpeg_finish_compress() cannot suspend.
-This means you cannot use suspension with multi-pass operating modes, namely
-Huffman code optimization and multiple-scan output.  Those modes write the
-whole file during jpeg_finish_compress(), which will certainly result in
-buffer overrun.  (Note that this restriction applies only to compression,
-not decompression.  The decompressor supports input suspension in all of its
-operating modes.)
-
-Decompression suspension:
-
-For decompression suspension, use a fill_input_buffer() routine that simply
-returns FALSE (except perhaps during error recovery, as discussed below).
-This will cause the decompressor to return to its caller with an indication
-that suspension has occurred.  This can happen at four places:
-  * jpeg_read_header(): will return JPEG_SUSPENDED.
-  * jpeg_start_decompress(): will return FALSE, rather than its usual TRUE.
-  * jpeg_read_scanlines(): will return the number of scanlines already
-	completed (possibly 0).
-  * jpeg_finish_decompress(): will return FALSE, rather than its usual TRUE.
-The surrounding application must recognize these cases, load more data into
-the input buffer, and repeat the call.  In the case of jpeg_read_scanlines(),
-increment the passed pointers past any scanlines successfully read.
-
-Just as with compression, the decompressor will typically backtrack to a
-convenient restart point before suspending.  When fill_input_buffer() is
-called, next_input_byte/bytes_in_buffer point to the current restart point,
-which is where the decompressor will backtrack to if FALSE is returned.
-The data beyond that position must NOT be discarded if you suspend; it needs
-to be re-read upon resumption.  In most implementations, you'll need to shift
-this data down to the start of your work buffer and then load more data after
-it.  Again, this behavior means that a several-Kbyte work buffer is essential
-for decent performance; furthermore, you should load a reasonable amount of
-new data before resuming decompression.  (If you loaded, say, only one new
-byte each time around, you could waste a LOT of cycles.)
-
-The skip_input_data() source manager routine requires special care in a
-suspension scenario.  This routine is NOT granted the ability to suspend the
-decompressor; it can decrement bytes_in_buffer to zero, but no more.  If the
-requested skip distance exceeds the amount of data currently in the input
-buffer, then skip_input_data() must set bytes_in_buffer to zero and record the
-additional skip distance somewhere else.  The decompressor will immediately
-call fill_input_buffer(), which should return FALSE, which will cause a
-suspension return.  The surrounding application must then arrange to discard
-the recorded number of bytes before it resumes loading the input buffer.
-(Yes, this design is rather baroque, but it avoids complexity in the far more
-common case where a non-suspending source manager is used.)
-
-If the input data has been exhausted, we recommend that you emit a warning
-and insert dummy EOI markers just as a non-suspending data source manager
-would do.  This can be handled either in the surrounding application logic or
-within fill_input_buffer(); the latter is probably more efficient.  If
-fill_input_buffer() knows that no more data is available, it can set the
-pointer/count to point to a dummy EOI marker and then return TRUE just as
-though it had read more data in a non-suspending situation.
-
-The decompressor does not attempt to suspend within standard JPEG markers;
-instead it will backtrack to the start of the marker and reprocess the whole
-marker next time.  Hence the input buffer must be large enough to hold the
-longest standard marker in the file.  Standard JPEG markers should normally
-not exceed a few hundred bytes each (DHT tables are typically the longest).
-We recommend at least a 2K buffer for performance reasons, which is much
-larger than any correct marker is likely to be.  For robustness against
-damaged marker length counts, you may wish to insert a test in your
-application for the case that the input buffer is completely full and yet
-the decoder has suspended without consuming any data --- otherwise, if this
-situation did occur, it would lead to an endless loop.  (The library can't
-provide this test since it has no idea whether "the buffer is full", or
-even whether there is a fixed-size input buffer.)
-
-The input buffer would need to be 64K to allow for arbitrary COM or APPn
-markers, but these are handled specially: they are either saved into allocated
-memory, or skipped over by calling skip_input_data().  In the former case,
-suspension is handled correctly, and in the latter case, the problem of
-buffer overrun is placed on skip_input_data's shoulders, as explained above.
-Note that if you provide your own marker handling routine for large markers,
-you should consider how to deal with buffer overflow.
-
-Multiple-buffer management:
-
-In some applications it is desirable to store the compressed data in a linked
-list of buffer areas, so as to avoid data copying.  This can be handled by
-having empty_output_buffer() or fill_input_buffer() set the pointer and count
-to reference the next available buffer; FALSE is returned only if no more
-buffers are available.  Although seemingly straightforward, there is a
-pitfall in this approach: the backtrack that occurs when FALSE is returned
-could back up into an earlier buffer.  For example, when fill_input_buffer()
-is called, the current pointer & count indicate the backtrack restart point.
-Since fill_input_buffer() will set the pointer and count to refer to a new
-buffer, the restart position must be saved somewhere else.  Suppose a second
-call to fill_input_buffer() occurs in the same library call, and no
-additional input data is available, so fill_input_buffer must return FALSE.
-If the JPEG library has not moved the pointer/count forward in the current
-buffer, then *the correct restart point is the saved position in the prior
-buffer*.  Prior buffers may be discarded only after the library establishes
-a restart point within a later buffer.  Similar remarks apply for output into
-a chain of buffers.
-
-The library will never attempt to backtrack over a skip_input_data() call,
-so any skipped data can be permanently discarded.  You still have to deal
-with the case of skipping not-yet-received data, however.
-
-It's much simpler to use only a single buffer; when fill_input_buffer() is
-called, move any unconsumed data (beyond the current pointer/count) down to
-the beginning of this buffer and then load new data into the remaining buffer
-space.  This approach requires a little more data copying but is far easier
-to get right.
-
-
-Progressive JPEG support
-------------------------
-
-Progressive JPEG rearranges the stored data into a series of scans of
-increasing quality.  In situations where a JPEG file is transmitted across a
-slow communications link, a decoder can generate a low-quality image very
-quickly from the first scan, then gradually improve the displayed quality as
-more scans are received.  The final image after all scans are complete is
-identical to that of a regular (sequential) JPEG file of the same quality
-setting.  Progressive JPEG files are often slightly smaller than equivalent
-sequential JPEG files, but the possibility of incremental display is the main
-reason for using progressive JPEG.
-
-The IJG encoder library generates progressive JPEG files when given a
-suitable "scan script" defining how to divide the data into scans.
-Creation of progressive JPEG files is otherwise transparent to the encoder.
-Progressive JPEG files can also be read transparently by the decoder library.
-If the decoding application simply uses the library as defined above, it
-will receive a final decoded image without any indication that the file was
-progressive.  Of course, this approach does not allow incremental display.
-To perform incremental display, an application needs to use the decoder
-library's "buffered-image" mode, in which it receives a decoded image
-multiple times.
-
-Each displayed scan requires about as much work to decode as a full JPEG
-image of the same size, so the decoder must be fairly fast in relation to the
-data transmission rate in order to make incremental display useful.  However,
-it is possible to skip displaying the image and simply add the incoming bits
-to the decoder's coefficient buffer.  This is fast because only Huffman
-decoding need be done, not IDCT, upsampling, colorspace conversion, etc.
-The IJG decoder library allows the application to switch dynamically between
-displaying the image and simply absorbing the incoming bits.  A properly
-coded application can automatically adapt the number of display passes to
-suit the time available as the image is received.  Also, a final
-higher-quality display cycle can be performed from the buffered data after
-the end of the file is reached.
-
-Progressive compression:
-
-To create a progressive JPEG file (or a multiple-scan sequential JPEG file),
-set the scan_info cinfo field to point to an array of scan descriptors, and
-perform compression as usual.  Instead of constructing your own scan list,
-you can call the jpeg_simple_progression() helper routine to create a
-recommended progression sequence; this method should be used by all
-applications that don't want to get involved in the nitty-gritty of
-progressive scan sequence design.  (If you want to provide user control of
-scan sequences, you may wish to borrow the scan script reading code found
-in rdswitch.c, so that you can read scan script files just like cjpeg's.)
-When scan_info is not NULL, the compression library will store DCT'd data
-into a buffer array as jpeg_write_scanlines() is called, and will emit all
-the requested scans during jpeg_finish_compress().  This implies that
-multiple-scan output cannot be created with a suspending data destination
-manager, since jpeg_finish_compress() does not support suspension.  We
-should also note that the compressor currently forces Huffman optimization
-mode when creating a progressive JPEG file, because the default Huffman
-tables are unsuitable for progressive files.
-
-Progressive decompression:
-
-When buffered-image mode is not used, the decoder library will read all of
-a multi-scan file during jpeg_start_decompress(), so that it can provide a
-final decoded image.  (Here "multi-scan" means either progressive or
-multi-scan sequential.)  This makes multi-scan files transparent to the
-decoding application.  However, existing applications that used suspending
-input with version 5 of the IJG library will need to be modified to check
-for a suspension return from jpeg_start_decompress().
-
-To perform incremental display, an application must use the library's
-buffered-image mode.  This is described in the next section.
-
-
-Buffered-image mode
--------------------
-
-In buffered-image mode, the library stores the partially decoded image in a
-coefficient buffer, from which it can be read out as many times as desired.
-This mode is typically used for incremental display of progressive JPEG files,
-but it can be used with any JPEG file.  Each scan of a progressive JPEG file
-adds more data (more detail) to the buffered image.  The application can
-display in lockstep with the source file (one display pass per input scan),
-or it can allow input processing to outrun display processing.  By making
-input and display processing run independently, it is possible for the
-application to adapt progressive display to a wide range of data transmission
-rates.
-
-The basic control flow for buffered-image decoding is
-
-	jpeg_create_decompress()
-	set data source
-	jpeg_read_header()
-	set overall decompression parameters
-	cinfo.buffered_image = TRUE;	/* select buffered-image mode */
-	jpeg_start_decompress()
-	for (each output pass) {
-	    adjust output decompression parameters if required
-	    jpeg_start_output()		/* start a new output pass */
-	    for (all scanlines in image) {
-	        jpeg_read_scanlines()
-	        display scanlines
-	    }
-	    jpeg_finish_output()	/* terminate output pass */
-	}
-	jpeg_finish_decompress()
-	jpeg_destroy_decompress()
-
-This differs from ordinary unbuffered decoding in that there is an additional
-level of looping.  The application can choose how many output passes to make
-and how to display each pass.
-
-The simplest approach to displaying progressive images is to do one display
-pass for each scan appearing in the input file.  In this case the outer loop
-condition is typically
-	while (! jpeg_input_complete(&cinfo))
-and the start-output call should read
-	jpeg_start_output(&cinfo, cinfo.input_scan_number);
-The second parameter to jpeg_start_output() indicates which scan of the input
-file is to be displayed; the scans are numbered starting at 1 for this
-purpose.  (You can use a loop counter starting at 1 if you like, but using
-the library's input scan counter is easier.)  The library automatically reads
-data as necessary to complete each requested scan, and jpeg_finish_output()
-advances to the next scan or end-of-image marker (hence input_scan_number
-will be incremented by the time control arrives back at jpeg_start_output()).
-With this technique, data is read from the input file only as needed, and
-input and output processing run in lockstep.
-
-After reading the final scan and reaching the end of the input file, the
-buffered image remains available; it can be read additional times by
-repeating the jpeg_start_output()/jpeg_read_scanlines()/jpeg_finish_output()
-sequence.  For example, a useful technique is to use fast one-pass color
-quantization for display passes made while the image is arriving, followed by
-a final display pass using two-pass quantization for highest quality.  This
-is done by changing the library parameters before the final output pass.
-Changing parameters between passes is discussed in detail below.
-
-In general the last scan of a progressive file cannot be recognized as such
-until after it is read, so a post-input display pass is the best approach if
-you want special processing in the final pass.
-
-When done with the image, be sure to call jpeg_finish_decompress() to release
-the buffered image (or just use jpeg_destroy_decompress()).
-
-If input data arrives faster than it can be displayed, the application can
-cause the library to decode input data in advance of what's needed to produce
-output.  This is done by calling the routine jpeg_consume_input().
-The return value is one of the following:
-	JPEG_REACHED_SOS:    reached an SOS marker (the start of a new scan)
-	JPEG_REACHED_EOI:    reached the EOI marker (end of image)
-	JPEG_ROW_COMPLETED:  completed reading one MCU row of compressed data
-	JPEG_SCAN_COMPLETED: completed reading last MCU row of current scan
-	JPEG_SUSPENDED:      suspended before completing any of the above
-(JPEG_SUSPENDED can occur only if a suspending data source is used.)  This
-routine can be called at any time after initializing the JPEG object.  It
-reads some additional data and returns when one of the indicated significant
-events occurs.  (If called after the EOI marker is reached, it will
-immediately return JPEG_REACHED_EOI without attempting to read more data.)
-
-The library's output processing will automatically call jpeg_consume_input()
-whenever the output processing overtakes the input; thus, simple lockstep
-display requires no direct calls to jpeg_consume_input().  But by adding
-calls to jpeg_consume_input(), you can absorb data in advance of what is
-being displayed.  This has two benefits:
-  * You can limit buildup of unprocessed data in your input buffer.
-  * You can eliminate extra display passes by paying attention to the
-    state of the library's input processing.
-
-The first of these benefits only requires interspersing calls to
-jpeg_consume_input() with your display operations and any other processing
-you may be doing.  To avoid wasting cycles due to backtracking, it's best to
-call jpeg_consume_input() only after a hundred or so new bytes have arrived.
-This is discussed further under "I/O suspension", above.  (Note: the JPEG
-library currently is not thread-safe.  You must not call jpeg_consume_input()
-from one thread of control if a different library routine is working on the
-same JPEG object in another thread.)
-
-When input arrives fast enough that more than one new scan is available
-before you start a new output pass, you may as well skip the output pass
-corresponding to the completed scan.  This occurs for free if you pass
-cinfo.input_scan_number as the target scan number to jpeg_start_output().
-The input_scan_number field is simply the index of the scan currently being
-consumed by the input processor.  You can ensure that this is up-to-date by
-emptying the input buffer just before calling jpeg_start_output(): call
-jpeg_consume_input() repeatedly until it returns JPEG_SUSPENDED or
-JPEG_REACHED_EOI.
-
-The target scan number passed to jpeg_start_output() is saved in the
-cinfo.output_scan_number field.  The library's output processing calls
-jpeg_consume_input() whenever the current input scan number and row within
-that scan is less than or equal to the current output scan number and row.
-Thus, input processing can "get ahead" of the output processing but is not
-allowed to "fall behind".  You can achieve several different effects by
-manipulating this interlock rule.  For example, if you pass a target scan
-number greater than the current input scan number, the output processor will
-wait until that scan starts to arrive before producing any output.  (To avoid
-an infinite loop, the target scan number is automatically reset to the last
-scan number when the end of image is reached.  Thus, if you specify a large
-target scan number, the library will just absorb the entire input file and
-then perform an output pass.  This is effectively the same as what
-jpeg_start_decompress() does when you don't select buffered-image mode.)
-When you pass a target scan number equal to the current input scan number,
-the image is displayed no faster than the current input scan arrives.  The
-final possibility is to pass a target scan number less than the current input
-scan number; this disables the input/output interlock and causes the output
-processor to simply display whatever it finds in the image buffer, without
-waiting for input.  (However, the library will not accept a target scan
-number less than one, so you can't avoid waiting for the first scan.)
-
-When data is arriving faster than the output display processing can advance
-through the image, jpeg_consume_input() will store data into the buffered
-image beyond the point at which the output processing is reading data out
-again.  If the input arrives fast enough, it may "wrap around" the buffer to
-the point where the input is more than one whole scan ahead of the output.
-If the output processing simply proceeds through its display pass without
-paying attention to the input, the effect seen on-screen is that the lower
-part of the image is one or more scans better in quality than the upper part.
-Then, when the next output scan is started, you have a choice of what target
-scan number to use.  The recommended choice is to use the current input scan
-number at that time, which implies that you've skipped the output scans
-corresponding to the input scans that were completed while you processed the
-previous output scan.  In this way, the decoder automatically adapts its
-speed to the arriving data, by skipping output scans as necessary to keep up
-with the arriving data.
-
-When using this strategy, you'll want to be sure that you perform a final
-output pass after receiving all the data; otherwise your last display may not
-be full quality across the whole screen.  So the right outer loop logic is
-something like this:
-	do {
-	    absorb any waiting input by calling jpeg_consume_input()
-	    final_pass = jpeg_input_complete(&cinfo);
-	    adjust output decompression parameters if required
-	    jpeg_start_output(&cinfo, cinfo.input_scan_number);
-	    ...
-	    jpeg_finish_output()
-	} while (! final_pass);
-rather than quitting as soon as jpeg_input_complete() returns TRUE.  This
-arrangement makes it simple to use higher-quality decoding parameters
-for the final pass.  But if you don't want to use special parameters for
-the final pass, the right loop logic is like this:
-	for (;;) {
-	    absorb any waiting input by calling jpeg_consume_input()
-	    jpeg_start_output(&cinfo, cinfo.input_scan_number);
-	    ...
-	    jpeg_finish_output()
-	    if (jpeg_input_complete(&cinfo) &&
-	        cinfo.input_scan_number == cinfo.output_scan_number)
-	      break;
-	}
-In this case you don't need to know in advance whether an output pass is to
-be the last one, so it's not necessary to have reached EOF before starting
-the final output pass; rather, what you want to test is whether the output
-pass was performed in sync with the final input scan.  This form of the loop
-will avoid an extra output pass whenever the decoder is able (or nearly able)
-to keep up with the incoming data.
-
-When the data transmission speed is high, you might begin a display pass,
-then find that much or all of the file has arrived before you can complete
-the pass.  (You can detect this by noting the JPEG_REACHED_EOI return code
-from jpeg_consume_input(), or equivalently by testing jpeg_input_complete().)
-In this situation you may wish to abort the current display pass and start a
-new one using the newly arrived information.  To do so, just call
-jpeg_finish_output() and then start a new pass with jpeg_start_output().
-
-A variant strategy is to abort and restart display if more than one complete
-scan arrives during an output pass; this can be detected by noting
-JPEG_REACHED_SOS returns and/or examining cinfo.input_scan_number.  This
-idea should be employed with caution, however, since the display process
-might never get to the bottom of the image before being aborted, resulting
-in the lower part of the screen being several passes worse than the upper.
-In most cases it's probably best to abort an output pass only if the whole
-file has arrived and you want to begin the final output pass immediately.
-
-When receiving data across a communication link, we recommend always using
-the current input scan number for the output target scan number; if a
-higher-quality final pass is to be done, it should be started (aborting any
-incomplete output pass) as soon as the end of file is received.  However,
-many other strategies are possible.  For example, the application can examine
-the parameters of the current input scan and decide whether to display it or
-not.  If the scan contains only chroma data, one might choose not to use it
-as the target scan, expecting that the scan will be small and will arrive
-quickly.  To skip to the next scan, call jpeg_consume_input() until it
-returns JPEG_REACHED_SOS or JPEG_REACHED_EOI.  Or just use the next higher
-number as the target scan for jpeg_start_output(); but that method doesn't
-let you inspect the next scan's parameters before deciding to display it.
-
-
-In buffered-image mode, jpeg_start_decompress() never performs input and
-thus never suspends.  An application that uses input suspension with
-buffered-image mode must be prepared for suspension returns from these
-routines:
-* jpeg_start_output() performs input only if you request 2-pass quantization
-  and the target scan isn't fully read yet.  (This is discussed below.)
-* jpeg_read_scanlines(), as always, returns the number of scanlines that it
-  was able to produce before suspending.
-* jpeg_finish_output() will read any markers following the target scan,
-  up to the end of the file or the SOS marker that begins another scan.
-  (But it reads no input if jpeg_consume_input() has already reached the
-  end of the file or an SOS marker beyond the target output scan.)
-* jpeg_finish_decompress() will read until the end of file, and thus can
-  suspend if the end hasn't already been reached (as can be tested by
-  calling jpeg_input_complete()).
-jpeg_start_output(), jpeg_finish_output(), and jpeg_finish_decompress()
-all return TRUE if they completed their tasks, FALSE if they had to suspend.
-In the event of a FALSE return, the application must load more input data
-and repeat the call.  Applications that use non-suspending data sources need
-not check the return values of these three routines.
-
-
-It is possible to change decoding parameters between output passes in the
-buffered-image mode.  The decoder library currently supports only very
-limited changes of parameters.  ONLY THE FOLLOWING parameter changes are
-allowed after jpeg_start_decompress() is called:
-* dct_method can be changed before each call to jpeg_start_output().
-  For example, one could use a fast DCT method for early scans, changing
-  to a higher quality method for the final scan.
-* dither_mode can be changed before each call to jpeg_start_output();
-  of course this has no impact if not using color quantization.  Typically
-  one would use ordered dither for initial passes, then switch to
-  Floyd-Steinberg dither for the final pass.  Caution: changing dither mode
-  can cause more memory to be allocated by the library.  Although the amount
-  of memory involved is not large (a scanline or so), it may cause the
-  initial max_memory_to_use specification to be exceeded, which in the worst
-  case would result in an out-of-memory failure.
-* do_block_smoothing can be changed before each call to jpeg_start_output().
-  This setting is relevant only when decoding a progressive JPEG image.
-  During the first DC-only scan, block smoothing provides a very "fuzzy" look
-  instead of the very "blocky" look seen without it; which is better seems a
-  matter of personal taste.  But block smoothing is nearly always a win
-  during later stages, especially when decoding a successive-approximation
-  image: smoothing helps to hide the slight blockiness that otherwise shows
-  up on smooth gradients until the lowest coefficient bits are sent.
-* Color quantization mode can be changed under the rules described below.
-  You *cannot* change between full-color and quantized output (because that
-  would alter the required I/O buffer sizes), but you can change which
-  quantization method is used.
-
-When generating color-quantized output, changing quantization method is a
-very useful way of switching between high-speed and high-quality display.
-The library allows you to change among its three quantization methods:
-1. Single-pass quantization to a fixed color cube.
-   Selected by cinfo.two_pass_quantize = FALSE and cinfo.colormap = NULL.
-2. Single-pass quantization to an application-supplied colormap.
-   Selected by setting cinfo.colormap to point to the colormap (the value of
-   two_pass_quantize is ignored); also set cinfo.actual_number_of_colors.
-3. Two-pass quantization to a colormap chosen specifically for the image.
-   Selected by cinfo.two_pass_quantize = TRUE and cinfo.colormap = NULL.
-   (This is the default setting selected by jpeg_read_header, but it is
-   probably NOT what you want for the first pass of progressive display!)
-These methods offer successively better quality and lesser speed.  However,
-only the first method is available for quantizing in non-RGB color spaces.
-
-IMPORTANT: because the different quantizer methods have very different
-working-storage requirements, the library requires you to indicate which
-one(s) you intend to use before you call jpeg_start_decompress().  (If we did
-not require this, the max_memory_to_use setting would be a complete fiction.)
-You do this by setting one or more of these three cinfo fields to TRUE:
-	enable_1pass_quant		Fixed color cube colormap
-	enable_external_quant		Externally-supplied colormap
-	enable_2pass_quant		Two-pass custom colormap
-All three are initialized FALSE by jpeg_read_header().  But
-jpeg_start_decompress() automatically sets TRUE the one selected by the
-current two_pass_quantize and colormap settings, so you only need to set the
-enable flags for any other quantization methods you plan to change to later.
-
-After setting the enable flags correctly at jpeg_start_decompress() time, you
-can change to any enabled quantization method by setting two_pass_quantize
-and colormap properly just before calling jpeg_start_output().  The following
-special rules apply:
-1. You must explicitly set cinfo.colormap to NULL when switching to 1-pass
-   or 2-pass mode from a different mode, or when you want the 2-pass
-   quantizer to be re-run to generate a new colormap.
-2. To switch to an external colormap, or to change to a different external
-   colormap than was used on the prior pass, you must call
-   jpeg_new_colormap() after setting cinfo.colormap.
-NOTE: if you want to use the same colormap as was used in the prior pass,
-you should not do either of these things.  This will save some nontrivial
-switchover costs.
-(These requirements exist because cinfo.colormap will always be non-NULL
-after completing a prior output pass, since both the 1-pass and 2-pass
-quantizers set it to point to their output colormaps.  Thus you have to
-do one of these two things to notify the library that something has changed.
-Yup, it's a bit klugy, but it's necessary to do it this way for backwards
-compatibility.)
-
-Note that in buffered-image mode, the library generates any requested colormap
-during jpeg_start_output(), not during jpeg_start_decompress().
-
-When using two-pass quantization, jpeg_start_output() makes a pass over the
-buffered image to determine the optimum color map; it therefore may take a
-significant amount of time, whereas ordinarily it does little work.  The
-progress monitor hook is called during this pass, if defined.  It is also
-important to realize that if the specified target scan number is greater than
-or equal to the current input scan number, jpeg_start_output() will attempt
-to consume input as it makes this pass.  If you use a suspending data source,
-you need to check for a FALSE return from jpeg_start_output() under these
-conditions.  The combination of 2-pass quantization and a not-yet-fully-read
-target scan is the only case in which jpeg_start_output() will consume input.
-
-
-Application authors who support buffered-image mode may be tempted to use it
-for all JPEG images, even single-scan ones.  This will work, but it is
-inefficient: there is no need to create an image-sized coefficient buffer for
-single-scan images.  Requesting buffered-image mode for such an image wastes
-memory.  Worse, it can cost time on large images, since the buffered data has
-to be swapped out or written to a temporary file.  If you are concerned about
-maximum performance on baseline JPEG files, you should use buffered-image
-mode only when the incoming file actually has multiple scans.  This can be
-tested by calling jpeg_has_multiple_scans(), which will return a correct
-result at any time after jpeg_read_header() completes.
-
-It is also worth noting that when you use jpeg_consume_input() to let input
-processing get ahead of output processing, the resulting pattern of access to
-the coefficient buffer is quite nonsequential.  It's best to use the memory
-manager jmemnobs.c if you can (i.e., if you have enough real or virtual main
-memory).  If not, at least make sure that max_memory_to_use is set as high as
-possible.  If the JPEG memory manager has to use a temporary file, you will
-probably see a lot of disk traffic and poor performance.  (This could be
-improved with additional work on the memory manager, but we haven't gotten
-around to it yet.)
-
-In some applications it may be convenient to use jpeg_consume_input() for all
-input processing, including reading the initial markers; that is, you may
-wish to call jpeg_consume_input() instead of jpeg_read_header() during
-startup.  This works, but note that you must check for JPEG_REACHED_SOS and
-JPEG_REACHED_EOI return codes as the equivalent of jpeg_read_header's codes.
-Once the first SOS marker has been reached, you must call
-jpeg_start_decompress() before jpeg_consume_input() will consume more input;
-it'll just keep returning JPEG_REACHED_SOS until you do.  If you read a
-tables-only file this way, jpeg_consume_input() will return JPEG_REACHED_EOI
-without ever returning JPEG_REACHED_SOS; be sure to check for this case.
-If this happens, the decompressor will not read any more input until you call
-jpeg_abort() to reset it.  It is OK to call jpeg_consume_input() even when not
-using buffered-image mode, but in that case it's basically a no-op after the
-initial markers have been read: it will just return JPEG_SUSPENDED.
-
-
-Abbreviated datastreams and multiple images
--------------------------------------------
-
-A JPEG compression or decompression object can be reused to process multiple
-images.  This saves a small amount of time per image by eliminating the
-"create" and "destroy" operations, but that isn't the real purpose of the
-feature.  Rather, reuse of an object provides support for abbreviated JPEG
-datastreams.  Object reuse can also simplify processing a series of images in
-a single input or output file.  This section explains these features.
-
-A JPEG file normally contains several hundred bytes worth of quantization
-and Huffman tables.  In a situation where many images will be stored or
-transmitted with identical tables, this may represent an annoying overhead.
-The JPEG standard therefore permits tables to be omitted.  The standard
-defines three classes of JPEG datastreams:
-  * "Interchange" datastreams contain an image and all tables needed to decode
-     the image.  These are the usual kind of JPEG file.
-  * "Abbreviated image" datastreams contain an image, but are missing some or
-    all of the tables needed to decode that image.
-  * "Abbreviated table specification" (henceforth "tables-only") datastreams
-    contain only table specifications.
-To decode an abbreviated image, it is necessary to load the missing table(s)
-into the decoder beforehand.  This can be accomplished by reading a separate
-tables-only file.  A variant scheme uses a series of images in which the first
-image is an interchange (complete) datastream, while subsequent ones are
-abbreviated and rely on the tables loaded by the first image.  It is assumed
-that once the decoder has read a table, it will remember that table until a
-new definition for the same table number is encountered.
-
-It is the application designer's responsibility to figure out how to associate
-the correct tables with an abbreviated image.  While abbreviated datastreams
-can be useful in a closed environment, their use is strongly discouraged in
-any situation where data exchange with other applications might be needed.
-Caveat designer.
-
-The JPEG library provides support for reading and writing any combination of
-tables-only datastreams and abbreviated images.  In both compression and
-decompression objects, a quantization or Huffman table will be retained for
-the lifetime of the object, unless it is overwritten by a new table definition.
-
-
-To create abbreviated image datastreams, it is only necessary to tell the
-compressor not to emit some or all of the tables it is using.  Each
-quantization and Huffman table struct contains a boolean field "sent_table",
-which normally is initialized to FALSE.  For each table used by the image, the
-header-writing process emits the table and sets sent_table = TRUE unless it is
-already TRUE.  (In normal usage, this prevents outputting the same table
-definition multiple times, as would otherwise occur because the chroma
-components typically share tables.)  Thus, setting this field to TRUE before
-calling jpeg_start_compress() will prevent the table from being written at
-all.
-
-If you want to create a "pure" abbreviated image file containing no tables,
-just call "jpeg_suppress_tables(&cinfo, TRUE)" after constructing all the
-tables.  If you want to emit some but not all tables, you'll need to set the
-individual sent_table fields directly.
-
-To create an abbreviated image, you must also call jpeg_start_compress()
-with a second parameter of FALSE, not TRUE.  Otherwise jpeg_start_compress()
-will force all the sent_table fields to FALSE.  (This is a safety feature to
-prevent abbreviated images from being created accidentally.)
-
-To create a tables-only file, perform the same parameter setup that you
-normally would, but instead of calling jpeg_start_compress() and so on, call
-jpeg_write_tables(&cinfo).  This will write an abbreviated datastream
-containing only SOI, DQT and/or DHT markers, and EOI.  All the quantization
-and Huffman tables that are currently defined in the compression object will
-be emitted unless their sent_table flag is already TRUE, and then all the
-sent_table flags will be set TRUE.
-
-A sure-fire way to create matching tables-only and abbreviated image files
-is to proceed as follows:
-
-	create JPEG compression object
-	set JPEG parameters
-	set destination to tables-only file
-	jpeg_write_tables(&cinfo);
-	set destination to image file
-	jpeg_start_compress(&cinfo, FALSE);
-	write data...
-	jpeg_finish_compress(&cinfo);
-
-Since the JPEG parameters are not altered between writing the table file and
-the abbreviated image file, the same tables are sure to be used.  Of course,
-you can repeat the jpeg_start_compress() ... jpeg_finish_compress() sequence
-many times to produce many abbreviated image files matching the table file.
-
-You cannot suppress output of the computed Huffman tables when Huffman
-optimization is selected.  (If you could, there'd be no way to decode the
-image...)  Generally, you don't want to set optimize_coding = TRUE when
-you are trying to produce abbreviated files.
-
-In some cases you might want to compress an image using tables which are
-not stored in the application, but are defined in an interchange or
-tables-only file readable by the application.  This can be done by setting up
-a JPEG decompression object to read the specification file, then copying the
-tables into your compression object.  See jpeg_copy_critical_parameters()
-for an example of copying quantization tables.
-
-
-To read abbreviated image files, you simply need to load the proper tables
-into the decompression object before trying to read the abbreviated image.
-If the proper tables are stored in the application program, you can just
-allocate the table structs and fill in their contents directly.  For example,
-to load a fixed quantization table into table slot "n":
-
-    if (cinfo.quant_tbl_ptrs[n] == NULL)
-      cinfo.quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) &cinfo);
-    quant_ptr = cinfo.quant_tbl_ptrs[n];	/* quant_ptr is JQUANT_TBL* */
-    for (i = 0; i < 64; i++) {
-      /* Qtable[] is desired quantization table, in natural array order */
-      quant_ptr->quantval[i] = Qtable[i];
-    }
-
-Code to load a fixed Huffman table is typically (for AC table "n"):
-
-    if (cinfo.ac_huff_tbl_ptrs[n] == NULL)
-      cinfo.ac_huff_tbl_ptrs[n] = jpeg_alloc_huff_table((j_common_ptr) &cinfo);
-    huff_ptr = cinfo.ac_huff_tbl_ptrs[n];	/* huff_ptr is JHUFF_TBL* */
-    for (i = 1; i <= 16; i++) {
-      /* counts[i] is number of Huffman codes of length i bits, i=1..16 */
-      huff_ptr->bits[i] = counts[i];
-    }
-    for (i = 0; i < 256; i++) {
-      /* symbols[] is the list of Huffman symbols, in code-length order */
-      huff_ptr->huffval[i] = symbols[i];
-    }
-
-(Note that trying to set cinfo.quant_tbl_ptrs[n] to point directly at a
-constant JQUANT_TBL object is not safe.  If the incoming file happened to
-contain a quantization table definition, your master table would get
-overwritten!  Instead allocate a working table copy and copy the master table
-into it, as illustrated above.  Ditto for Huffman tables, of course.)
-
-You might want to read the tables from a tables-only file, rather than
-hard-wiring them into your application.  The jpeg_read_header() call is
-sufficient to read a tables-only file.  You must pass a second parameter of
-FALSE to indicate that you do not require an image to be present.  Thus, the
-typical scenario is
-
-	create JPEG decompression object
-	set source to tables-only file
-	jpeg_read_header(&cinfo, FALSE);
-	set source to abbreviated image file
-	jpeg_read_header(&cinfo, TRUE);
-	set decompression parameters
-	jpeg_start_decompress(&cinfo);
-	read data...
-	jpeg_finish_decompress(&cinfo);
-
-In some cases, you may want to read a file without knowing whether it contains
-an image or just tables.  In that case, pass FALSE and check the return value
-from jpeg_read_header(): it will be JPEG_HEADER_OK if an image was found,
-JPEG_HEADER_TABLES_ONLY if only tables were found.  (A third return value,
-JPEG_SUSPENDED, is possible when using a suspending data source manager.)
-Note that jpeg_read_header() will not complain if you read an abbreviated
-image for which you haven't loaded the missing tables; the missing-table check
-occurs later, in jpeg_start_decompress().
-
-
-It is possible to read a series of images from a single source file by
-repeating the jpeg_read_header() ... jpeg_finish_decompress() sequence,
-without releasing/recreating the JPEG object or the data source module.
-(If you did reinitialize, any partial bufferload left in the data source
-buffer at the end of one image would be discarded, causing you to lose the
-start of the next image.)  When you use this method, stored tables are
-automatically carried forward, so some of the images can be abbreviated images
-that depend on tables from earlier images.
-
-If you intend to write a series of images into a single destination file,
-you might want to make a specialized data destination module that doesn't
-flush the output buffer at term_destination() time.  This would speed things
-up by some trifling amount.  Of course, you'd need to remember to flush the
-buffer after the last image.  You can make the later images be abbreviated
-ones by passing FALSE to jpeg_start_compress().
-
-
-Special markers
----------------
-
-Some applications may need to insert or extract special data in the JPEG
-datastream.  The JPEG standard provides marker types "COM" (comment) and
-"APP0" through "APP15" (application) to hold application-specific data.
-Unfortunately, the use of these markers is not specified by the standard.
-COM markers are fairly widely used to hold user-supplied text.  The JFIF file
-format spec uses APP0 markers with specified initial strings to hold certain
-data.  Adobe applications use APP14 markers beginning with the string "Adobe"
-for miscellaneous data.  Other APPn markers are rarely seen, but might
-contain almost anything.
-
-If you wish to store user-supplied text, we recommend you use COM markers
-and place readable 7-bit ASCII text in them.  Newline conventions are not
-standardized --- expect to find LF (Unix style), CR/LF (DOS style), or CR
-(Mac style).  A robust COM reader should be able to cope with random binary
-garbage, including nulls, since some applications generate COM markers
-containing non-ASCII junk.  (But yours should not be one of them.)
-
-For program-supplied data, use an APPn marker, and be sure to begin it with an
-identifying string so that you can tell whether the marker is actually yours.
-It's probably best to avoid using APP0 or APP14 for any private markers.
-(NOTE: the upcoming SPIFF standard will use APP8 markers; we recommend you
-not use APP8 markers for any private purposes, either.)
-
-Keep in mind that at most 65533 bytes can be put into one marker, but you
-can have as many markers as you like.
-
-By default, the IJG compression library will write a JFIF APP0 marker if the
-selected JPEG colorspace is grayscale or YCbCr, or an Adobe APP14 marker if
-the selected colorspace is RGB, CMYK, or YCCK.  You can disable this, but
-we don't recommend it.  The decompression library will recognize JFIF and
-Adobe markers and will set the JPEG colorspace properly when one is found.
-
-
-You can write special markers immediately following the datastream header by
-calling jpeg_write_marker() after jpeg_start_compress() and before the first
-call to jpeg_write_scanlines().  When you do this, the markers appear after
-the SOI and the JFIF APP0 and Adobe APP14 markers (if written), but before
-all else.  Specify the marker type parameter as "JPEG_COM" for COM or
-"JPEG_APP0 + n" for APPn.  (Actually, jpeg_write_marker will let you write
-any marker type, but we don't recommend writing any other kinds of marker.)
-For example, to write a user comment string pointed to by comment_text:
-	jpeg_write_marker(cinfo, JPEG_COM, comment_text, strlen(comment_text));
-
-If it's not convenient to store all the marker data in memory at once,
-you can instead call jpeg_write_m_header() followed by multiple calls to
-jpeg_write_m_byte().  If you do it this way, it's your responsibility to
-call jpeg_write_m_byte() exactly the number of times given in the length
-parameter to jpeg_write_m_header().  (This method lets you empty the
-output buffer partway through a marker, which might be important when
-using a suspending data destination module.  In any case, if you are using
-a suspending destination, you should flush its buffer after inserting
-any special markers.  See "I/O suspension".)
-
-Or, if you prefer to synthesize the marker byte sequence yourself,
-you can just cram it straight into the data destination module.
-
-If you are writing JFIF 1.02 extension markers (thumbnail images), don't
-forget to set cinfo.JFIF_minor_version = 2 so that the encoder will write the
-correct JFIF version number in the JFIF header marker.  The library's default
-is to write version 1.01, but that's wrong if you insert any 1.02 extension
-markers.  (We could probably get away with just defaulting to 1.02, but there
-used to be broken decoders that would complain about unknown minor version
-numbers.  To reduce compatibility risks it's safest not to write 1.02 unless
-you are actually using 1.02 extensions.)
-
-
-When reading, two methods of handling special markers are available:
-1. You can ask the library to save the contents of COM and/or APPn markers
-into memory, and then examine them at your leisure afterwards.
-2. You can supply your own routine to process COM and/or APPn markers
-on-the-fly as they are read.
-The first method is simpler to use, especially if you are using a suspending
-data source; writing a marker processor that copes with input suspension is
-not easy (consider what happens if the marker is longer than your available
-input buffer).  However, the second method conserves memory since the marker
-data need not be kept around after it's been processed.
-
-For either method, you'd normally set up marker handling after creating a
-decompression object and before calling jpeg_read_header(), because the
-markers of interest will typically be near the head of the file and so will
-be scanned by jpeg_read_header.  Once you've established a marker handling
-method, it will be used for the life of that decompression object
-(potentially many datastreams), unless you change it.  Marker handling is
-determined separately for COM markers and for each APPn marker code.
-
-
-To save the contents of special markers in memory, call
-	jpeg_save_markers(cinfo, marker_code, length_limit)
-where marker_code is the marker type to save, JPEG_COM or JPEG_APP0+n.
-(To arrange to save all the special marker types, you need to call this
-routine 17 times, for COM and APP0-APP15.)  If the incoming marker is longer
-than length_limit data bytes, only length_limit bytes will be saved; this
-parameter allows you to avoid chewing up memory when you only need to see the
-first few bytes of a potentially large marker.  If you want to save all the
-data, set length_limit to 0xFFFF; that is enough since marker lengths are only
-16 bits.  As a special case, setting length_limit to 0 prevents that marker
-type from being saved at all.  (That is the default behavior, in fact.)
-
-After jpeg_read_header() completes, you can examine the special markers by
-following the cinfo->marker_list pointer chain.  All the special markers in
-the file appear in this list, in order of their occurrence in the file (but
-omitting any markers of types you didn't ask for).  Both the original data
-length and the saved data length are recorded for each list entry; the latter
-will not exceed length_limit for the particular marker type.  Note that these
-lengths exclude the marker length word, whereas the stored representation
-within the JPEG file includes it.  (Hence the maximum data length is really
-only 65533.)
-
-It is possible that additional special markers appear in the file beyond the
-SOS marker at which jpeg_read_header stops; if so, the marker list will be
-extended during reading of the rest of the file.  This is not expected to be
-common, however.  If you are short on memory you may want to reset the length
-limit to zero for all marker types after finishing jpeg_read_header, to
-ensure that the max_memory_to_use setting cannot be exceeded due to addition
-of later markers.
-
-The marker list remains stored until you call jpeg_finish_decompress or
-jpeg_abort, at which point the memory is freed and the list is set to empty.
-(jpeg_destroy also releases the storage, of course.)
-
-Note that the library is internally interested in APP0 and APP14 markers;
-if you try to set a small nonzero length limit on these types, the library
-will silently force the length up to the minimum it wants.  (But you can set
-a zero length limit to prevent them from being saved at all.)  Also, in a
-16-bit environment, the maximum length limit may be constrained to less than
-65533 by malloc() limitations.  It is therefore best not to assume that the
-effective length limit is exactly what you set it to be.
-
-
-If you want to supply your own marker-reading routine, you do it by calling
-jpeg_set_marker_processor().  A marker processor routine must have the
-signature
-	boolean jpeg_marker_parser_method (j_decompress_ptr cinfo)
-Although the marker code is not explicitly passed, the routine can find it
-in cinfo->unread_marker.  At the time of call, the marker proper has been
-read from the data source module.  The processor routine is responsible for
-reading the marker length word and the remaining parameter bytes, if any.
-Return TRUE to indicate success.  (FALSE should be returned only if you are
-using a suspending data source and it tells you to suspend.  See the standard
-marker processors in jdmarker.c for appropriate coding methods if you need to
-use a suspending data source.)
-
-If you override the default APP0 or APP14 processors, it is up to you to
-recognize JFIF and Adobe markers if you want colorspace recognition to occur
-properly.  We recommend copying and extending the default processors if you
-want to do that.  (A better idea is to save these marker types for later
-examination by calling jpeg_save_markers(); that method doesn't interfere
-with the library's own processing of these markers.)
-
-jpeg_set_marker_processor() and jpeg_save_markers() are mutually exclusive
---- if you call one it overrides any previous call to the other, for the
-particular marker type specified.
-
-A simple example of an external COM processor can be found in djpeg.c.
-Also, see jpegtran.c for an example of using jpeg_save_markers.
-
-
-Raw (downsampled) image data
-----------------------------
-
-Some applications need to supply already-downsampled image data to the JPEG
-compressor, or to receive raw downsampled data from the decompressor.  The
-library supports this requirement by allowing the application to write or
-read raw data, bypassing the normal preprocessing or postprocessing steps.
-The interface is different from the standard one and is somewhat harder to
-use.  If your interest is merely in bypassing color conversion, we recommend
-that you use the standard interface and simply set jpeg_color_space =
-in_color_space (or jpeg_color_space = out_color_space for decompression).
-The mechanism described in this section is necessary only to supply or
-receive downsampled image data, in which not all components have the same
-dimensions.
-
-
-To compress raw data, you must supply the data in the colorspace to be used
-in the JPEG file (please read the earlier section on Special color spaces)
-and downsampled to the sampling factors specified in the JPEG parameters.
-You must supply the data in the format used internally by the JPEG library,
-namely a JSAMPIMAGE array.  This is an array of pointers to two-dimensional
-arrays, each of type JSAMPARRAY.  Each 2-D array holds the values for one
-color component.  This structure is necessary since the components are of
-different sizes.  If the image dimensions are not a multiple of the MCU size,
-you must also pad the data correctly (usually, this is done by replicating
-the last column and/or row).  The data must be padded to a multiple of a DCT
-block in each component: that is, each downsampled row must contain a
-multiple of 8 valid samples, and there must be a multiple of 8 sample rows
-for each component.  (For applications such as conversion of digital TV
-images, the standard image size is usually a multiple of the DCT block size,
-so that no padding need actually be done.)
-
-The procedure for compression of raw data is basically the same as normal
-compression, except that you call jpeg_write_raw_data() in place of
-jpeg_write_scanlines().  Before calling jpeg_start_compress(), you must do
-the following:
-  * Set cinfo->raw_data_in to TRUE.  (It is set FALSE by jpeg_set_defaults().)
-    This notifies the library that you will be supplying raw data.
-  * Ensure jpeg_color_space is correct --- an explicit jpeg_set_colorspace()
-    call is a good idea.  Note that since color conversion is bypassed,
-    in_color_space is ignored, except that jpeg_set_defaults() uses it to
-    choose the default jpeg_color_space setting.
-  * Ensure the sampling factors, cinfo->comp_info[i].h_samp_factor and
-    cinfo->comp_info[i].v_samp_factor, are correct.  Since these indicate the
-    dimensions of the data you are supplying, it's wise to set them
-    explicitly, rather than assuming the library's defaults are what you want.
-
-To pass raw data to the library, call jpeg_write_raw_data() in place of
-jpeg_write_scanlines().  The two routines work similarly except that
-jpeg_write_raw_data takes a JSAMPIMAGE data array rather than JSAMPARRAY.
-The scanlines count passed to and returned from jpeg_write_raw_data is
-measured in terms of the component with the largest v_samp_factor.
-
-jpeg_write_raw_data() processes one MCU row per call, which is to say
-v_samp_factor*DCTSIZE sample rows of each component.  The passed num_lines
-value must be at least max_v_samp_factor*DCTSIZE, and the return value will
-be exactly that amount (or possibly some multiple of that amount, in future
-library versions).  This is true even on the last call at the bottom of the
-image; don't forget to pad your data as necessary.
-
-The required dimensions of the supplied data can be computed for each
-component as
-	cinfo->comp_info[i].width_in_blocks*DCTSIZE  samples per row
-	cinfo->comp_info[i].height_in_blocks*DCTSIZE rows in image
-after jpeg_start_compress() has initialized those fields.  If the valid data
-is smaller than this, it must be padded appropriately.  For some sampling
-factors and image sizes, additional dummy DCT blocks are inserted to make
-the image a multiple of the MCU dimensions.  The library creates such dummy
-blocks itself; it does not read them from your supplied data.  Therefore you
-need never pad by more than DCTSIZE samples.  An example may help here.
-Assume 2h2v downsampling of YCbCr data, that is
-	cinfo->comp_info[0].h_samp_factor = 2		for Y
-	cinfo->comp_info[0].v_samp_factor = 2
-	cinfo->comp_info[1].h_samp_factor = 1		for Cb
-	cinfo->comp_info[1].v_samp_factor = 1
-	cinfo->comp_info[2].h_samp_factor = 1		for Cr
-	cinfo->comp_info[2].v_samp_factor = 1
-and suppose that the nominal image dimensions (cinfo->image_width and
-cinfo->image_height) are 101x101 pixels.  Then jpeg_start_compress() will
-compute downsampled_width = 101 and width_in_blocks = 13 for Y,
-downsampled_width = 51 and width_in_blocks = 7 for Cb and Cr (and the same
-for the height fields).  You must pad the Y data to at least 13*8 = 104
-columns and rows, the Cb/Cr data to at least 7*8 = 56 columns and rows.  The
-MCU height is max_v_samp_factor = 2 DCT rows so you must pass at least 16
-scanlines on each call to jpeg_write_raw_data(), which is to say 16 actual
-sample rows of Y and 8 each of Cb and Cr.  A total of 7 MCU rows are needed,
-so you must pass a total of 7*16 = 112 "scanlines".  The last DCT block row
-of Y data is dummy, so it doesn't matter what you pass for it in the data
-arrays, but the scanlines count must total up to 112 so that all of the Cb
-and Cr data gets passed.
-
-Output suspension is supported with raw-data compression: if the data
-destination module suspends, jpeg_write_raw_data() will return 0.
-In this case the same data rows must be passed again on the next call.
-
-
-Decompression with raw data output implies bypassing all postprocessing:
-you cannot ask for rescaling or color quantization, for instance.  More
-seriously, you must deal with the color space and sampling factors present in
-the incoming file.  If your application only handles, say, 2h1v YCbCr data,
-you must check for and fail on other color spaces or other sampling factors.
-The library will not convert to a different color space for you.
-
-To obtain raw data output, set cinfo->raw_data_out = TRUE before
-jpeg_start_decompress() (it is set FALSE by jpeg_read_header()).  Be sure to
-verify that the color space and sampling factors are ones you can handle.
-Then call jpeg_read_raw_data() in place of jpeg_read_scanlines().  The
-decompression process is otherwise the same as usual.
-
-jpeg_read_raw_data() returns one MCU row per call, and thus you must pass a
-buffer of at least max_v_samp_factor*DCTSIZE scanlines (scanline counting is
-the same as for raw-data compression).  The buffer you pass must be large
-enough to hold the actual data plus padding to DCT-block boundaries.  As with
-compression, any entirely dummy DCT blocks are not processed so you need not
-allocate space for them, but the total scanline count includes them.  The
-above example of computing buffer dimensions for raw-data compression is
-equally valid for decompression.
-
-Input suspension is supported with raw-data decompression: if the data source
-module suspends, jpeg_read_raw_data() will return 0.  You can also use
-buffered-image mode to read raw data in multiple passes.
-
-
-Really raw data: DCT coefficients
----------------------------------
-
-It is possible to read or write the contents of a JPEG file as raw DCT
-coefficients.  This facility is mainly intended for use in lossless
-transcoding between different JPEG file formats.  Other possible applications
-include lossless cropping of a JPEG image, lossless reassembly of a
-multi-strip or multi-tile TIFF/JPEG file into a single JPEG datastream, etc.
-
-To read the contents of a JPEG file as DCT coefficients, open the file and do
-jpeg_read_header() as usual.  But instead of calling jpeg_start_decompress()
-and jpeg_read_scanlines(), call jpeg_read_coefficients().  This will read the
-entire image into a set of virtual coefficient-block arrays, one array per
-component.  The return value is a pointer to an array of virtual-array
-descriptors.  Each virtual array can be accessed directly using the JPEG
-memory manager's access_virt_barray method (see Memory management, below,
-and also read structure.doc's discussion of virtual array handling).  Or,
-for simple transcoding to a different JPEG file format, the array list can
-just be handed directly to jpeg_write_coefficients().
-
-Each block in the block arrays contains quantized coefficient values in
-normal array order (not JPEG zigzag order).  The block arrays contain only
-DCT blocks containing real data; any entirely-dummy blocks added to fill out
-interleaved MCUs at the right or bottom edges of the image are discarded
-during reading and are not stored in the block arrays.  (The size of each
-block array can be determined from the width_in_blocks and height_in_blocks
-fields of the component's comp_info entry.)  This is also the data format
-expected by jpeg_write_coefficients().
-
-When you are done using the virtual arrays, call jpeg_finish_decompress()
-to release the array storage and return the decompression object to an idle
-state; or just call jpeg_destroy() if you don't need to reuse the object.
-
-If you use a suspending data source, jpeg_read_coefficients() will return
-NULL if it is forced to suspend; a non-NULL return value indicates successful
-completion.  You need not test for a NULL return value when using a
-non-suspending data source.
-
-It is also possible to call jpeg_read_coefficients() to obtain access to the
-decoder's coefficient arrays during a normal decode cycle in buffered-image
-mode.  This frammish might be useful for progressively displaying an incoming
-image and then re-encoding it without loss.  To do this, decode in buffered-
-image mode as discussed previously, then call jpeg_read_coefficients() after
-the last jpeg_finish_output() call.  The arrays will be available for your use
-until you call jpeg_finish_decompress().
-
-
-To write the contents of a JPEG file as DCT coefficients, you must provide
-the DCT coefficients stored in virtual block arrays.  You can either pass
-block arrays read from an input JPEG file by jpeg_read_coefficients(), or
-allocate virtual arrays from the JPEG compression object and fill them
-yourself.  In either case, jpeg_write_coefficients() is substituted for
-jpeg_start_compress() and jpeg_write_scanlines().  Thus the sequence is
-  * Create compression object
-  * Set all compression parameters as necessary
-  * Request virtual arrays if needed
-  * jpeg_write_coefficients()
-  * jpeg_finish_compress()
-  * Destroy or re-use compression object
-jpeg_write_coefficients() is passed a pointer to an array of virtual block
-array descriptors; the number of arrays is equal to cinfo.num_components.
-
-The virtual arrays need only have been requested, not realized, before
-jpeg_write_coefficients() is called.  A side-effect of
-jpeg_write_coefficients() is to realize any virtual arrays that have been
-requested from the compression object's memory manager.  Thus, when obtaining
-the virtual arrays from the compression object, you should fill the arrays
-after calling jpeg_write_coefficients().  The data is actually written out
-when you call jpeg_finish_compress(); jpeg_write_coefficients() only writes
-the file header.
-
-When writing raw DCT coefficients, it is crucial that the JPEG quantization
-tables and sampling factors match the way the data was encoded, or the
-resulting file will be invalid.  For transcoding from an existing JPEG file,
-we recommend using jpeg_copy_critical_parameters().  This routine initializes
-all the compression parameters to default values (like jpeg_set_defaults()),
-then copies the critical information from a source decompression object.
-The decompression object should have just been used to read the entire
-JPEG input file --- that is, it should be awaiting jpeg_finish_decompress().
-
-jpeg_write_coefficients() marks all tables stored in the compression object
-as needing to be written to the output file (thus, it acts like
-jpeg_start_compress(cinfo, TRUE)).  This is for safety's sake, to avoid
-emitting abbreviated JPEG files by accident.  If you really want to emit an
-abbreviated JPEG file, call jpeg_suppress_tables(), or set the tables'
-individual sent_table flags, between calling jpeg_write_coefficients() and
-jpeg_finish_compress().
-
-
-Progress monitoring
--------------------
-
-Some applications may need to regain control from the JPEG library every so
-often.  The typical use of this feature is to produce a percent-done bar or
-other progress display.  (For a simple example, see cjpeg.c or djpeg.c.)
-Although you do get control back frequently during the data-transferring pass
-(the jpeg_read_scanlines or jpeg_write_scanlines loop), any additional passes
-will occur inside jpeg_finish_compress or jpeg_start_decompress; those
-routines may take a long time to execute, and you don't get control back
-until they are done.
-
-You can define a progress-monitor routine which will be called periodically
-by the library.  No guarantees are made about how often this call will occur,
-so we don't recommend you use it for mouse tracking or anything like that.
-At present, a call will occur once per MCU row, scanline, or sample row
-group, whichever unit is convenient for the current processing mode; so the
-wider the image, the longer the time between calls.  During the data
-transferring pass, only one call occurs per call of jpeg_read_scanlines or
-jpeg_write_scanlines, so don't pass a large number of scanlines at once if
-you want fine resolution in the progress count.  (If you really need to use
-the callback mechanism for time-critical tasks like mouse tracking, you could
-insert additional calls inside some of the library's inner loops.)
-
-To establish a progress-monitor callback, create a struct jpeg_progress_mgr,
-fill in its progress_monitor field with a pointer to your callback routine,
-and set cinfo->progress to point to the struct.  The callback will be called
-whenever cinfo->progress is non-NULL.  (This pointer is set to NULL by
-jpeg_create_compress or jpeg_create_decompress; the library will not change
-it thereafter.  So if you allocate dynamic storage for the progress struct,
-make sure it will live as long as the JPEG object does.  Allocating from the
-JPEG memory manager with lifetime JPOOL_PERMANENT will work nicely.)  You
-can use the same callback routine for both compression and decompression.
-
-The jpeg_progress_mgr struct contains four fields which are set by the library:
-	long pass_counter;	/* work units completed in this pass */
-	long pass_limit;	/* total number of work units in this pass */
-	int completed_passes;	/* passes completed so far */
-	int total_passes;	/* total number of passes expected */
-During any one pass, pass_counter increases from 0 up to (not including)
-pass_limit; the step size is usually but not necessarily 1.  The pass_limit
-value may change from one pass to another.  The expected total number of
-passes is in total_passes, and the number of passes already completed is in
-completed_passes.  Thus the fraction of work completed may be estimated as
-		completed_passes + (pass_counter/pass_limit)
-		--------------------------------------------
-				total_passes
-ignoring the fact that the passes may not be equal amounts of work.
-
-When decompressing, pass_limit can even change within a pass, because it
-depends on the number of scans in the JPEG file, which isn't always known in
-advance.  The computed fraction-of-work-done may jump suddenly (if the library
-discovers it has overestimated the number of scans) or even decrease (in the
-opposite case).  It is not wise to put great faith in the work estimate.
-
-When using the decompressor's buffered-image mode, the progress monitor work
-estimate is likely to be completely unhelpful, because the library has no way
-to know how many output passes will be demanded of it.  Currently, the library
-sets total_passes based on the assumption that there will be one more output
-pass if the input file end hasn't yet been read (jpeg_input_complete() isn't
-TRUE), but no more output passes if the file end has been reached when the
-output pass is started.  This means that total_passes will rise as additional
-output passes are requested.  If you have a way of determining the input file
-size, estimating progress based on the fraction of the file that's been read
-will probably be more useful than using the library's value.
-
-
-Memory management
------------------
-
-This section covers some key facts about the JPEG library's built-in memory
-manager.  For more info, please read structure.doc's section about the memory
-manager, and consult the source code if necessary.
-
-All memory and temporary file allocation within the library is done via the
-memory manager.  If necessary, you can replace the "back end" of the memory
-manager to control allocation yourself (for example, if you don't want the
-library to use malloc() and free() for some reason).
-
-Some data is allocated "permanently" and will not be freed until the JPEG
-object is destroyed.  Most data is allocated "per image" and is freed by
-jpeg_finish_compress, jpeg_finish_decompress, or jpeg_abort.  You can call the
-memory manager yourself to allocate structures that will automatically be
-freed at these times.  Typical code for this is
-  ptr = (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, size);
-Use JPOOL_PERMANENT to get storage that lasts as long as the JPEG object.
-Use alloc_large instead of alloc_small for anything bigger than a few Kbytes.
-There are also alloc_sarray and alloc_barray routines that automatically
-build 2-D sample or block arrays.
-
-The library's minimum space requirements to process an image depend on the
-image's width, but not on its height, because the library ordinarily works
-with "strip" buffers that are as wide as the image but just a few rows high.
-Some operating modes (eg, two-pass color quantization) require full-image
-buffers.  Such buffers are treated as "virtual arrays": only the current strip
-need be in memory, and the rest can be swapped out to a temporary file.
-
-If you use the simplest memory manager back end (jmemnobs.c), then no
-temporary files are used; virtual arrays are simply malloc()'d.  Images bigger
-than memory can be processed only if your system supports virtual memory.
-The other memory manager back ends support temporary files of various flavors
-and thus work in machines without virtual memory.  They may also be useful on
-Unix machines if you need to process images that exceed available swap space.
-
-When using temporary files, the library will make the in-memory buffers for
-its virtual arrays just big enough to stay within a "maximum memory" setting.
-Your application can set this limit by setting cinfo->mem->max_memory_to_use
-after creating the JPEG object.  (Of course, there is still a minimum size for
-the buffers, so the max-memory setting is effective only if it is bigger than
-the minimum space needed.)  If you allocate any large structures yourself, you
-must allocate them before jpeg_start_compress() or jpeg_start_decompress() in
-order to have them counted against the max memory limit.  Also keep in mind
-that space allocated with alloc_small() is ignored, on the assumption that
-it's too small to be worth worrying about; so a reasonable safety margin
-should be left when setting max_memory_to_use.
-
-If you use the jmemname.c or jmemdos.c memory manager back end, it is
-important to clean up the JPEG object properly to ensure that the temporary
-files get deleted.  (This is especially crucial with jmemdos.c, where the
-"temporary files" may be extended-memory segments; if they are not freed,
-DOS will require a reboot to recover the memory.)  Thus, with these memory
-managers, it's a good idea to provide a signal handler that will trap any
-early exit from your program.  The handler should call either jpeg_abort()
-or jpeg_destroy() for any active JPEG objects.  A handler is not needed with
-jmemnobs.c, and shouldn't be necessary with jmemansi.c or jmemmac.c either,
-since the C library is supposed to take care of deleting files made with
-tmpfile().
-
-
-Memory usage
-------------
-
-Working memory requirements while performing compression or decompression
-depend on image dimensions, image characteristics (such as colorspace and
-JPEG process), and operating mode (application-selected options).
-
-As of v6b, the decompressor requires:
- 1. About 24K in more-or-less-fixed-size data.  This varies a bit depending
-    on operating mode and image characteristics (particularly color vs.
-    grayscale), but it doesn't depend on image dimensions.
- 2. Strip buffers (of size proportional to the image width) for IDCT and
-    upsampling results.  The worst case for commonly used sampling factors
-    is about 34 bytes * width in pixels for a color image.  A grayscale image
-    only needs about 8 bytes per pixel column.
- 3. A full-image DCT coefficient buffer is needed to decode a multi-scan JPEG
-    file (including progressive JPEGs), or whenever you select buffered-image
-    mode.  This takes 2 bytes/coefficient.  At typical 2x2 sampling, that's
-    3 bytes per pixel for a color image.  Worst case (1x1 sampling) requires
-    6 bytes/pixel.  For grayscale, figure 2 bytes/pixel.
- 4. To perform 2-pass color quantization, the decompressor also needs a
-    128K color lookup table and a full-image pixel buffer (3 bytes/pixel).
-This does not count any memory allocated by the application, such as a
-buffer to hold the final output image.
-
-The above figures are valid for 8-bit JPEG data precision and a machine with
-32-bit ints.  For 12-bit JPEG data, double the size of the strip buffers and
-quantization pixel buffer.  The "fixed-size" data will be somewhat smaller
-with 16-bit ints, larger with 64-bit ints.  Also, CMYK or other unusual
-color spaces will require different amounts of space.
-
-The full-image coefficient and pixel buffers, if needed at all, do not
-have to be fully RAM resident; you can have the library use temporary
-files instead when the total memory usage would exceed a limit you set.
-(But if your OS supports virtual memory, it's probably better to just use
-jmemnobs and let the OS do the swapping.)
-
-The compressor's memory requirements are similar, except that it has no need
-for color quantization.  Also, it needs a full-image DCT coefficient buffer
-if Huffman-table optimization is asked for, even if progressive mode is not
-requested.
-
-If you need more detailed information about memory usage in a particular
-situation, you can enable the MEM_STATS code in jmemmgr.c.
-
-
-Library compile-time options
-----------------------------
-
-A number of compile-time options are available by modifying jmorecfg.h.
-
-The JPEG standard provides for both the baseline 8-bit DCT process and
-a 12-bit DCT process.  The IJG code supports 12-bit lossy JPEG if you define
-BITS_IN_JSAMPLE as 12 rather than 8.  Note that this causes JSAMPLE to be
-larger than a char, so it affects the surrounding application's image data.
-The sample applications cjpeg and djpeg can support 12-bit mode only for PPM
-and GIF file formats; you must disable the other file formats to compile a
-12-bit cjpeg or djpeg.  (install.doc has more information about that.)
-At present, a 12-bit library can handle *only* 12-bit images, not both
-precisions.  (If you need to include both 8- and 12-bit libraries in a single
-application, you could probably do it by defining NEED_SHORT_EXTERNAL_NAMES
-for just one of the copies.  You'd have to access the 8-bit and 12-bit copies
-from separate application source files.  This is untested ... if you try it,
-we'd like to hear whether it works!)
-
-Note that a 12-bit library always compresses in Huffman optimization mode,
-in order to generate valid Huffman tables.  This is necessary because our
-default Huffman tables only cover 8-bit data.  If you need to output 12-bit
-files in one pass, you'll have to supply suitable default Huffman tables.
-You may also want to supply your own DCT quantization tables; the existing
-quality-scaling code has been developed for 8-bit use, and probably doesn't
-generate especially good tables for 12-bit.
-
-The maximum number of components (color channels) in the image is determined
-by MAX_COMPONENTS.  The JPEG standard allows up to 255 components, but we
-expect that few applications will need more than four or so.
-
-On machines with unusual data type sizes, you may be able to improve
-performance or reduce memory space by tweaking the various typedefs in
-jmorecfg.h.  In particular, on some RISC CPUs, access to arrays of "short"s
-is quite slow; consider trading memory for speed by making JCOEF, INT16, and
-UINT16 be "int" or "unsigned int".  UINT8 is also a candidate to become int.
-You probably don't want to make JSAMPLE be int unless you have lots of memory
-to burn.
-
-You can reduce the size of the library by compiling out various optional
-functions.  To do this, undefine xxx_SUPPORTED symbols as necessary.
-
-You can also save a few K by not having text error messages in the library;
-the standard error message table occupies about 5Kb.  This is particularly
-reasonable for embedded applications where there's no good way to display 
-a message anyway.  To do this, remove the creation of the message table
-(jpeg_std_message_table[]) from jerror.c, and alter format_message to do
-something reasonable without it.  You could output the numeric value of the
-message code number, for example.  If you do this, you can also save a couple
-more K by modifying the TRACEMSn() macros in jerror.h to expand to nothing;
-you don't need trace capability anyway, right?
-
-
-Portability considerations
---------------------------
-
-The JPEG library has been written to be extremely portable; the sample
-applications cjpeg and djpeg are slightly less so.  This section summarizes
-the design goals in this area.  (If you encounter any bugs that cause the
-library to be less portable than is claimed here, we'd appreciate hearing
-about them.)
-
-The code works fine on ANSI C, C++, and pre-ANSI C compilers, using any of
-the popular system include file setups, and some not-so-popular ones too.
-See install.doc for configuration procedures.
-
-The code is not dependent on the exact sizes of the C data types.  As
-distributed, we make the assumptions that
-	char	is at least 8 bits wide
-	short	is at least 16 bits wide
-	int	is at least 16 bits wide
-	long	is at least 32 bits wide
-(These are the minimum requirements of the ANSI C standard.)  Wider types will
-work fine, although memory may be used inefficiently if char is much larger
-than 8 bits or short is much bigger than 16 bits.  The code should work
-equally well with 16- or 32-bit ints.
-
-In a system where these assumptions are not met, you may be able to make the
-code work by modifying the typedefs in jmorecfg.h.  However, you will probably
-have difficulty if int is less than 16 bits wide, since references to plain
-int abound in the code.
-
-char can be either signed or unsigned, although the code runs faster if an
-unsigned char type is available.  If char is wider than 8 bits, you will need
-to redefine JOCTET and/or provide custom data source/destination managers so
-that JOCTET represents exactly 8 bits of data on external storage.
-
-The JPEG library proper does not assume ASCII representation of characters.
-But some of the image file I/O modules in cjpeg/djpeg do have ASCII
-dependencies in file-header manipulation; so does cjpeg's select_file_type()
-routine.
-
-The JPEG library does not rely heavily on the C library.  In particular, C
-stdio is used only by the data source/destination modules and the error
-handler, all of which are application-replaceable.  (cjpeg/djpeg are more
-heavily dependent on stdio.)  malloc and free are called only from the memory
-manager "back end" module, so you can use a different memory allocator by
-replacing that one file.
-
-The code generally assumes that C names must be unique in the first 15
-characters.  However, global function names can be made unique in the
-first 6 characters by defining NEED_SHORT_EXTERNAL_NAMES.
-
-More info about porting the code may be gleaned by reading jconfig.doc,
-jmorecfg.h, and jinclude.h.
-
-
-Notes for MS-DOS implementors
------------------------------
-
-The IJG code is designed to work efficiently in 80x86 "small" or "medium"
-memory models (i.e., data pointers are 16 bits unless explicitly declared
-"far"; code pointers can be either size).  You may be able to use small
-model to compile cjpeg or djpeg by itself, but you will probably have to use
-medium model for any larger application.  This won't make much difference in
-performance.  You *will* take a noticeable performance hit if you use a
-large-data memory model (perhaps 10%-25%), and you should avoid "huge" model
-if at all possible.
-
-The JPEG library typically needs 2Kb-3Kb of stack space.  It will also
-malloc about 20K-30K of near heap space while executing (and lots of far
-heap, but that doesn't count in this calculation).  This figure will vary
-depending on selected operating mode, and to a lesser extent on image size.
-There is also about 5Kb-6Kb of constant data which will be allocated in the
-near data segment (about 4Kb of this is the error message table).
-Thus you have perhaps 20K available for other modules' static data and near
-heap space before you need to go to a larger memory model.  The C library's
-static data will account for several K of this, but that still leaves a good
-deal for your needs.  (If you are tight on space, you could reduce the sizes
-of the I/O buffers allocated by jdatasrc.c and jdatadst.c, say from 4K to
-1K.  Another possibility is to move the error message table to far memory;
-this should be doable with only localized hacking on jerror.c.)
-
-About 2K of the near heap space is "permanent" memory that will not be
-released until you destroy the JPEG object.  This is only an issue if you
-save a JPEG object between compression or decompression operations.
-
-Far data space may also be a tight resource when you are dealing with large
-images.  The most memory-intensive case is decompression with two-pass color
-quantization, or single-pass quantization to an externally supplied color
-map.  This requires a 128Kb color lookup table plus strip buffers amounting
-to about 40 bytes per column for typical sampling ratios (eg, about 25600
-bytes for a 640-pixel-wide image).  You may not be able to process wide
-images if you have large data structures of your own.
-
-Of course, all of these concerns vanish if you use a 32-bit flat-memory-model
-compiler, such as DJGPP or Watcom C.  We highly recommend flat model if you
-can use it; the JPEG library is significantly faster in flat model.
diff --git a/jpeg/makefile.gen b/jpeg/makefile.gen
deleted file mode 100644
index 6692479e045a..000000000000
--- a/jpeg/makefile.gen
+++ /dev/null
@@ -1,274 +0,0 @@
-# Generated automatically from makefile.cfg by configure.
-# Makefile for Independent JPEG Group's software
-
-# makefile.cfg is edited by configure to produce a custom Makefile.
-
-# Read installation instructions before saying "make" !!
-
-# For compiling with source and object files in different directories.
-srcdir = $(VPATH)
-
-# Where to install the programs and man pages.
-prefix = /usr/local
-exec_prefix = ${prefix}
-bindir = $(exec_prefix)/bin
-libdir = $(exec_prefix)/lib
-includedir = $(prefix)/include
-binprefix =
-manprefix =
-manext = 1
-mandir = $(prefix)/man/man$(manext)
-
-# The name of your C compiler:
-CC= cc
-
-# You may need to adjust these cc options:
-CFLAGS= -O3  -I$(srcdir)
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-# However, any special defines for ansi2knr.c may be included here:
-ANSI2KNRFLAGS= 
-
-# Link-time cc options:
-LDFLAGS= 
-
-# To link any special libraries, add the necessary -l commands here.
-LDLIBS= 
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.  For Unix this is usually jmemnobs.o, but you may want
-# to use jmemansi.o or jmemname.o if you have limited swap space.
-SYSDEPMEM= jmemnobs.o
-
-# miscellaneous OS-dependent stuff
-SHELL= /bin/sh
-# linker
-LN= $(CC)
-# file deletion command
-RM= rm -f
-# file rename command
-MV= mv
-# library (.a) file creation command
-AR= ar rc
-# second step in .a creation (use "touch" if not needed)
-AR2= ranlib
-# installation program
-INSTALL= cp
-INSTALL_PROGRAM= ${INSTALL}
-INSTALL_DATA= ${INSTALL}
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
-        jquant2.c jutils.c jmemmgr.c jmemansi.c jmemname.c jmemnobs.c \
-        jmemdos.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c cdjpeg.c rdcolmap.c rdswitch.c \
-        rdjpgcom.c wrjpgcom.c rdppm.c wrppm.c rdgif.c wrgif.c rdtarga.c \
-        wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
-        makefile.mc6 makefile.dj makefile.wat makcjpeg.st makdjpeg.st \
-        makljpeg.st maktjpeg.st makefile.manx makefile.sas makefile.mms \
-        makefile.vms makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.manx jconfig.sas jconfig.st jconfig.bcc \
-        jconfig.mc6 jconfig.dj jconfig.wat jconfig.vms
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.gif testimg.jpg testprog.jpg \
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
-        $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
-        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
-# decompression library object files
-DLIBOBJECTS= jdapimin.o jdapistd.o jdatasrc.o jdmaster.o \
-        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
-# These objectfiles are included in libjpeg.a
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
-        cdjpeg.o
-DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
-        cdjpeg.o
-TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o
-
-all: libjpeg.a
-
-realall:  libjpeg.a cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-
-# This rule causes ansi2knr to be invoked.
-# .c.o:
-# 	./ansi2knr $(srcdir)/$*.c T$*.c
-# 	$(CC) $(CFLAGS) -c T$*.c
-# 	$(RM) T$*.c $*.o
-# 	$(MV) T$*.o $*.o
-
-ansi2knr: ansi2knr.c
-	$(CC) $(CFLAGS) $(ANSI2KNRFLAGS) -o ansi2knr ansi2knr.c
-
-# Decompression-only library
-libjpeg.a:  $(DLIBOBJECTS) $(COMOBJECTS)
-	$(RM) libjpeg.a
-	$(AR) libjpeg.a  $(DLIBOBJECTS) $(COMOBJECTS)
-	$(AR2) libjpeg.a
-
-cjpeg: $(COBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o cjpeg $(COBJECTS) libjpeg.a $(LDLIBS)
-
-djpeg: $(DOBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o djpeg $(DOBJECTS) libjpeg.a $(LDLIBS)
-
-jpegtran: $(TROBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o jpegtran $(TROBJECTS) libjpeg.a $(LDLIBS)
-
-rdjpgcom: rdjpgcom.o
-	$(LN) $(LDFLAGS) -o rdjpgcom rdjpgcom.o $(LDLIBS)
-
-wrjpgcom: wrjpgcom.o
-	$(LN) $(LDFLAGS) -o wrjpgcom wrjpgcom.o $(LDLIBS)
-
-jconfig.h: jconfig.doc
-	echo You must prepare a system-dependent jconfig.h file.
-	echo Please read the installation directions in install.doc.
-	exit 1
-
-install: cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-	$(INSTALL_PROGRAM) cjpeg $(bindir)/$(binprefix)cjpeg
-	$(INSTALL_PROGRAM) djpeg $(bindir)/$(binprefix)djpeg
-	$(INSTALL_PROGRAM) jpegtran $(bindir)/$(binprefix)jpegtran
-	$(INSTALL_PROGRAM) rdjpgcom $(bindir)/$(binprefix)rdjpgcom
-	$(INSTALL_PROGRAM) wrjpgcom $(bindir)/$(binprefix)wrjpgcom
-	$(INSTALL_DATA) $(srcdir)/cjpeg.1 $(mandir)/$(manprefix)cjpeg.$(manext)
-	$(INSTALL_DATA) $(srcdir)/djpeg.1 $(mandir)/$(manprefix)djpeg.$(manext)
-	$(INSTALL_DATA) $(srcdir)/jpegtran.1 $(mandir)/$(manprefix)jpegtran.$(manext)
-	$(INSTALL_DATA) $(srcdir)/rdjpgcom.1 $(mandir)/$(manprefix)rdjpgcom.$(manext)
-	$(INSTALL_DATA) $(srcdir)/wrjpgcom.1 $(mandir)/$(manprefix)wrjpgcom.$(manext)
-
-install-lib: libjpeg.a install-headers
-	$(INSTALL_DATA) libjpeg.a $(libdir)/$(binprefix)libjpeg.a
-
-install-headers: jconfig.h
-	$(INSTALL_DATA) jconfig.h $(includedir)/jconfig.h
-	$(INSTALL_DATA) $(srcdir)/jpeglib.h $(includedir)/jpeglib.h
-	$(INSTALL_DATA) $(srcdir)/jmorecfg.h $(includedir)/jmorecfg.h
-	$(INSTALL_DATA) $(srcdir)/jerror.h $(includedir)/jerror.h
-
-clean:
-	$(RM) *.o cjpeg djpeg jpegtran libjpeg.a rdjpgcom wrjpgcom
-	$(RM) ansi2knr core testout* config.log config.status
-
-distribute:
-	$(RM) jpegsrc.tar*
-	tar cvf jpegsrc.tar $(DISTFILES)
-	compress -v jpegsrc.tar
-
-test: cjpeg djpeg jpegtran
-	$(RM) testout*
-	./djpeg -dct int -ppm -outfile testout.ppm  $(srcdir)/testorig.jpg
-	./djpeg -dct int -gif -outfile testout.gif  $(srcdir)/testorig.jpg
-	./cjpeg -dct int -outfile testout.jpg  $(srcdir)/testimg.ppm
-	./djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)/testprog.jpg
-	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)/testimg.ppm
-	./jpegtran -outfile testoutt.jpg $(srcdir)/testprog.jpg
-	cmp $(srcdir)/testimg.ppm testout.ppm
-	cmp $(srcdir)/testimg.gif testout.gif
-	cmp $(srcdir)/testimg.jpg testout.jpg
-	cmp $(srcdir)/testimg.ppm testoutp.ppm
-	cmp $(srcdir)/testimgp.jpg testoutp.jpg
-	cmp $(srcdir)/testorig.jpg testoutt.jpg
-
-check: test
-
-# GNU Make likes to know which target names are not really files to be made:
-.PHONY: all install install-lib install-headers clean distribute test check
-
-
-jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
-rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/jpeg/netscape_mods.doc b/jpeg/netscape_mods.doc
deleted file mode 100644
index df40579a6000..000000000000
--- a/jpeg/netscape_mods.doc
+++ /dev/null
@@ -1,52 +0,0 @@
-***** BEGIN LICENSE BLOCK *****
-Version: MPL 1.1/GPL 2.0/LGPL 2.1
-
-The contents of this file are subject to the Mozilla Public License Version 
-1.1 (the "License"); you may not use this file except in compliance with 
-the License. You may obtain a copy of the License at 
-http://www.mozilla.org/MPL/
-
-Software distributed under the License is distributed on an "AS IS" basis,
-WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
-for the specific language governing rights and limitations under the
-License.
-
-The Original Code is mozilla.org code.
-
-The Initial Developer of the Original Code is
-Netscape Communications Corporation
-Portions created by the Initial Developer are Copyright (C) 1998
-the Initial Developer. All Rights Reserved.
-
-Contributor(s):
-
-Alternatively, the contents of this file may be used under the terms of
-either the GNU General Public License Version 2 or later (the "GPL"), or
-the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
-in which case the provisions of the GPL or the LGPL are applicable instead
-of those above. If you wish to allow use of your version of this file only
-under the terms of either the GPL or the LGPL, and not to allow others to
-use your version of this file under the terms of the MPL, indicate your
-decision by deleting the provisions above and replace them with the notice
-and other provisions required by the GPL or the LGPL. If you do not delete
-the provisions above, a recipient may use your version of this file under
-the terms of any one of the MPL, the GPL or the LGPL.
-
-***** END LICENSE BLOCK *****
-
-This directory contains a subset of the IJG JPEG library.  Among other
-omissions, most of the original IJG documentation has been deleted.
-You can find the full IJG distribution at the archive sites mentioned in
-the README file.  Please note that the IJG code does not fall under the
-Netscape NPL, but is freely distributable under its own copyright terms
-(see README).
-
-Several files have been modified to allow incorporation of the IJG code
-into the Netscape environment.  As of this writing, jconfig.h, jmorecfg.h,
-and jerror.c contain Netscape-specific changes.  In addition, we have created
-our own makefiles following Netscape conventions, rather than using any of
-those provided by IJG.
-
-There are some other changes herein, such as MMX-specific optimizations,
-which should eventually make their way back into the standard IJG
-distribution.
diff --git a/jpeg/Makefile.in b/jpeg/simd/Makefile.in
similarity index 56%
copy from jpeg/Makefile.in
copy to jpeg/simd/Makefile.in
index 74ee17785299..04f1d47071ea 100644
--- a/jpeg/Makefile.in
+++ b/jpeg/simd/Makefile.in
@@ -15,11 +15,12 @@
 # The Original Code is mozilla.org code.
 #
 # The Initial Developer of the Original Code is
-# Netscape Communications Corporation.
-# Portions created by the Initial Developer are Copyright (C) 1998
+# Mozilla Corporation
+# Portions created by the Initial Developer are Copyright (C) 2010
 # the Initial Developer. All Rights Reserved.
 #
 # Contributor(s):
+#  Justin Lebar <justin.lebar@gmail.com>
 #
 # Alternatively, the contents of this file may be used under the terms of
 # either the GNU General Public License Version 2 or later (the "GPL"), or
@@ -35,90 +36,13 @@
 #
 # ***** END LICENSE BLOCK *****
 
-DEPTH		= ..
+DEPTH		= ../..
 topsrcdir	= @top_srcdir@
 srcdir		= @srcdir@
 VPATH		= @srcdir@
 
 include $(DEPTH)/config/autoconf.mk
 
-MODULE		= jpeg
-LIBRARY_NAME	= mozjpeg
-
-ifeq ($(OS_ARCH),WINNT)
-LIBRARY_NAME	= jpeg32$(VERSION_NUMBER)
-ifneq ($(OS_TEST),x86_64)
-# FIXME: bug 413019
-ifndef GNU_CC
-OS_COMPILE_CFLAGS += -GL-
-endif
-endif
-endif
-
-GRE_MODULE	= 1
-
-CSRCS		= \
-		jdapimin.c \
-		jdapistd.c \
-		jdatasrc.c \
-		jdatadst.c \
-		jdmaster.c \
-		jdinput.c \
-		jdmarker.c \
-		jdhuff.c \
-		jdphuff.c \
-		jdmainct.c \
-		jdcoefct.c \
-		jdpostct.c \
-		jddctmgr.c \
-		jidctfst.c \
-		jidctflt.c \
-		jidctint.c \
-		jdsample.c \
-		jdcolor.c \
-		jquant1.c \
-		jquant2.c \
-		jdmerge.c \
-		jcomapi.c \
-		jutils.c \
-		jerror.c \
-		jmemmgr.c \
-		jmemnobs.c \
-		jfdctflt.c \
-		jfdctfst.c \
-		jfdctint.c \
-		$(NULL)
-
-EXPORTS		= \
-		jconfig.h \
-		jerror.h \
-		jinclude.h \
-		jmorecfg.h \
-		jpeglib.h \
-		jpegint.h \
-		jwinfig.h \
-		jos2fig.h \
-		$(NULL)
-
-# These files enable support for writing JPEGs
-CSRCS		+= \
-		jcapimin.c \
-		jcparam.c \
-		jcapistd.c \
-		jcmarker.c \
-		jcinit.c \
-		jcmainct.c \
-		jchuff.c \
-		jcsample.c \
-		jcmaster.c \
-		jccoefct.c \
-		jccolor.c \
-		jcphuff.c \
-		jcdctmgr.c \
-		jcprepct.c \
-		$(NULL)
-
-# need static lib for some of the libimg componentry to link properly
-FORCE_STATIC_LIB = 1
+# Intentionally empty makefile: it exists only so that this directory gets created in the objdir.
 
 include $(topsrcdir)/config/rules.mk
diff --git a/jpeg/simd/jcclrmmx.asm b/jpeg/simd/jcclrmmx.asm
new file mode 100644
index 000000000000..b6b89121b5fe
--- /dev/null
+++ b/jpeg/simd/jcclrmmx.asm
@@ -0,0 +1,479 @@
+;
+; jcclrmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
+;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                           JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION img_width
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		8
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = esp after the push = address of the saved ebp (args start at eax+8)
+	sub	esp, byte 4			; step below the saved ebp so the link stored below cannot overwrite it
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [aligned esp] = unaligned frame address; reloaded by "pop esp" in the epilogue
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; esp = &wk[0]: reserves the WK_NUM mmword scratch slots below ebp
+	pushpic	eax		; make room for the GOT address (the gotptr slot below wk[0])
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [img_width(eax)]	; num_cols
+	test	ecx,ecx
+	jz	near .return			; img_width == 0: nothing to convert
+
+	push	ecx				; park num_cols while ecx is borrowed for output_row
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]	; edi = output_buf[0] (receives Y below)
+	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]	; ebx = output_buf[1] (receives Cb below)
+	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]	; edx = output_buf[2] (receives Cr below)
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]		; edi = &output_buf[0][output_row]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]		; ebx = &output_buf[1][output_row]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]		; edx = &output_buf[2][output_row]
+
+	pop	ecx				; ecx = num_cols again
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]	; eax = num_rows; the argument base in eax is not needed after this
+	test	eax,eax
+	jle	near .return			; signed test: return when num_rows <= 0
+	alignx	16,7
+.rowloop:
+	pushpic	eax				; save eax (row counter): eax is reloaded with the GOT address below
+	push	edx				; save the four row-pointer cursors; they are restored at the end of the row
+	push	ebx
+	push	edi
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	mov	ebx, JSAMPROW [ebx]	; outptr1
+	mov	edx, JSAMPROW [edx]	; outptr2
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	short .columnloop	; unsigned: at least SIZEOF_MMWORD columns remain, use the full-width loads
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	eax			; eax and edx are borrowed as scratch to gather the trailing bytes
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	xor	eax,eax
+	mov	al, BYTE [esi+ecx]	; eax = last remaining input byte
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	xor	edx,edx
+	mov	dx, WORD [esi+ecx]	; edx = the word that precedes what has been gathered so far
+	shl	eax, WORD_BIT
+	or	eax,edx			; eax = earlier gathered byte placed above the new word
+.column_ld4:
+	movd	mmA,eax
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	mmG, DWORD [esi+ecx]
+	psllq	mmA, DWORD_BIT
+	por	mmA,mmG			; mmA = earlier gathered bytes placed above the new dword
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	movq	mmG,mmA			; gathered tail becomes the second mmword of the group
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	mov	ecx, SIZEOF_MMWORD	; count this as one full batch so the loop test after the conversion ends the row
+	jmp	short .rgb_ycc_cnv
+.column_ld16:
+	test	cl, 2*SIZEOF_MMWORD
+	mov	ecx, SIZEOF_MMWORD	; mov does not change flags, so the jz below still sees the test result
+	jz	short .rgb_ycc_cnv
+	movq	mmF,mmA			; gathered tail becomes the third mmword of the group
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+	; mmA=(00 10 20 01 11 21 02 12)
+	; mmG=(22 03 13 23 04 14 24 05)
+	; mmF=(15 25 06 16 26 07 17 27)
+
+	movq      mmD,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 10 20 01)
+	psrlq     mmD,4*BYTE_BIT	; mmD=(11 21 02 12 -- -- -- --)
+
+	punpckhbw mmA,mmG		; mmA=(00 04 10 14 20 24 01 05)
+	psllq     mmG,4*BYTE_BIT	; mmG=(-- -- -- -- 22 03 13 23)
+
+	punpcklbw mmD,mmF		; mmD=(11 15 21 25 02 06 12 16)
+	punpckhbw mmG,mmF		; mmG=(22 26 03 07 13 17 23 27)
+
+	movq      mmE,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 04 10 14)
+	psrlq     mmE,4*BYTE_BIT	; mmE=(20 24 01 05 -- -- -- --)
+
+	punpckhbw mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	psllq     mmD,4*BYTE_BIT	; mmD=(-- -- -- -- 11 15 21 25)
+
+	punpcklbw mmE,mmG		; mmE=(20 22 24 26 01 03 05 07)
+	punpckhbw mmD,mmG		; mmD=(11 13 15 17 21 23 25 27)
+
+	pxor      mmH,mmH		; mmH=0, used to widen bytes to words below
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmH		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmH		; mmC=(10 12 14 16)
+
+	movq      mmB,mmE
+	punpcklbw mmE,mmH		; mmE=(20 22 24 26)
+	punpckhbw mmB,mmH		; mmB=(01 03 05 07)
+
+	movq      mmF,mmD
+	punpcklbw mmD,mmH		; mmD=(11 13 15 17)
+	punpckhbw mmF,mmH		; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_MMWORD/8
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_MMWORD/8
+	movd	mmA, DWORD [esi+ecx*RGB_PIXELSIZE]	; gather the last remaining pixel
+.column_ld2:
+	test	cl, SIZEOF_MMWORD/4
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_MMWORD/4
+	movq	mmF,mmA					; what was gathered so far moves up one register
+	movq	mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]	; gather the mmword of pixels that precedes it
+.column_ld4:
+	test	cl, SIZEOF_MMWORD/2
+	mov	ecx, SIZEOF_MMWORD	; count this as one full batch; mov leaves the flags of the test intact
+	jz	short .rgb_ycc_cnv
+	movq	mmD,mmA			; gathered tail becomes the third and fourth mmwords of the group
+	movq	mmC,mmF
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+	movq	mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+	; mmA=(00 10 20 30 01 11 21 31)
+	; mmF=(02 12 22 32 03 13 23 33)
+	; mmD=(04 14 24 34 05 15 25 35)
+	; mmC=(06 16 26 36 07 17 27 37)
+
+	movq      mmB,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 10 12 20 22 30 32)
+	punpckhbw mmB,mmF		; mmB=(01 03 11 13 21 23 31 33)
+
+	movq      mmG,mmD
+	punpcklbw mmD,mmC		; mmD=(04 06 14 16 24 26 34 36)
+	punpckhbw mmG,mmC		; mmG=(05 07 15 17 25 27 35 37)
+
+	movq      mmE,mmA
+	punpcklwd mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	punpckhwd mmE,mmD		; mmE=(20 22 24 26 30 32 34 36)
+
+	movq      mmH,mmB
+	punpcklwd mmB,mmG		; mmB=(01 03 05 07 11 13 15 17)
+	punpckhwd mmH,mmG		; mmH=(21 23 25 27 31 33 35 37)
+
+	pxor      mmF,mmF		; mmF=0, used to widen bytes to words below
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmF		; mmC=(10 12 14 16)
+
+	movq      mmD,mmB
+	punpcklbw mmB,mmF		; mmB=(01 03 05 07)
+	punpckhbw mmD,mmF		; mmD=(11 13 15 17)
+
+	movq      mmG,mmE
+	punpcklbw mmE,mmF		; mmE=(20 22 24 26)
+	punpckhbw mmG,mmF		; mmG=(30 32 34 36)
+
+	punpcklbw mmF,mmH		; no zero register is left: put the bytes in the high half of each word ...
+	punpckhbw mmH,mmH
+	psrlw     mmF,BYTE_BIT		; mmF=(21 23 25 27)
+	psrlw     mmH,BYTE_BIT		; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+	; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movq      MMWORD [wk(0)], mm0	; wk(0)=RE
+	movq      MMWORD [wk(1)], mm1	; wk(1)=RO
+	movq      MMWORD [wk(2)], mm4	; wk(2)=BE
+	movq      MMWORD [wk(3)], mm5	; wk(3)=BO
+
+	movq      mm6,mm1
+	punpcklwd mm1,mm3		; mm1=(RO,GO) word pairs, low half  (ROL,GOL)
+	punpckhwd mm6,mm3		; mm6=(RO,GO) word pairs, high half (ROH,GOH)
+	movq      mm7,mm1
+	movq      mm4,mm6
+	pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movq      MMWORD [wk(4)], mm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movq      MMWORD [wk(5)], mm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      mm1,mm1		; zero words fill the low half of each dword, so B lands in the high word
+	pxor      mm6,mm6
+	punpcklwd mm1,mm5		; mm1=BOL
+	punpckhwd mm6,mm5		; mm6=BOH
+	psrld     mm1,1			; mm1=BOL*FIX(0.500)
+	psrld     mm6,1			; mm6=BOH*FIX(0.500)
+
+	movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
+
+	paddd     mm7,mm1		; add the 0.5*B term of Cb
+	paddd     mm4,mm6
+	paddd     mm7,mm5		; add the [PD_ONEHALFM1_CJ] constant
+	paddd     mm4,mm5
+	psrld     mm7,SCALEBITS		; mm7=CbOL
+	psrld     mm4,SCALEBITS		; mm4=CbOH
+	packssdw  mm7,mm4		; mm7=CbO
+
+	movq      mm1, MMWORD [wk(2)]	; mm1=BE
+
+	movq      mm6,mm0
+	punpcklwd mm0,mm2		; mm0=(RE,GE) word pairs, low half  (REL,GEL)
+	punpckhwd mm6,mm2		; mm6=(RE,GE) word pairs, high half (REH,GEH)
+	movq      mm5,mm0
+	movq      mm4,mm6
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movq      MMWORD [wk(6)], mm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movq      MMWORD [wk(7)], mm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      mm0,mm0
+	pxor      mm6,mm6
+	punpcklwd mm0,mm1		; mm0=BEL
+	punpckhwd mm6,mm1		; mm6=BEH
+	psrld     mm0,1			; mm0=BEL*FIX(0.500)
+	psrld     mm6,1			; mm6=BEH*FIX(0.500)
+
+	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+	paddd     mm5,mm0
+	paddd     mm4,mm6
+	paddd     mm5,mm1
+	paddd     mm4,mm1
+	psrld     mm5,SCALEBITS		; mm5=CbEL
+	psrld     mm4,SCALEBITS		; mm4=CbEH
+	packssdw  mm5,mm4		; mm5=CbE
+
+	psllw     mm7,BYTE_BIT		; move the odd-column results into the high byte of each word
+	por       mm5,mm7		; mm5=Cb
+	movq      MMWORD [ebx], mm5	; Save Cb
+
+	movq      mm0, MMWORD [wk(3)]	; mm0=BO
+	movq      mm6, MMWORD [wk(2)]	; mm6=BE
+	movq      mm1, MMWORD [wk(1)]	; mm1=RO
+
+	movq      mm4,mm0
+	punpcklwd mm0,mm3		; mm0=(BO,GO) word pairs, low half  (BOL,GOL)
+	punpckhwd mm4,mm3		; mm4=(BO,GO) word pairs, high half (BOH,GOH)
+	movq      mm7,mm0
+	movq      mm5,mm4
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movq      mm3,[GOTOFF(eax,PD_ONEHALF)]	; mm3=[PD_ONEHALF]
+
+	paddd     mm0, MMWORD [wk(4)]	; add the R/G partial sum saved in wk(4)
+	paddd     mm4, MMWORD [wk(5)]	; add the R/G partial sum saved in wk(5)
+	paddd     mm0,mm3
+	paddd     mm4,mm3
+	psrld     mm0,SCALEBITS		; mm0=YOL
+	psrld     mm4,SCALEBITS		; mm4=YOH
+	packssdw  mm0,mm4		; mm0=YO
+
+	pxor      mm3,mm3
+	pxor      mm4,mm4
+	punpcklwd mm3,mm1		; mm3=ROL
+	punpckhwd mm4,mm1		; mm4=ROH
+	psrld     mm3,1			; mm3=ROL*FIX(0.500)
+	psrld     mm4,1			; mm4=ROH*FIX(0.500)
+
+	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+	paddd     mm7,mm3
+	paddd     mm5,mm4
+	paddd     mm7,mm1
+	paddd     mm5,mm1
+	psrld     mm7,SCALEBITS		; mm7=CrOL
+	psrld     mm5,SCALEBITS		; mm5=CrOH
+	packssdw  mm7,mm5		; mm7=CrO
+
+	movq      mm3, MMWORD [wk(0)]	; mm3=RE
+
+	movq      mm4,mm6
+	punpcklwd mm6,mm2		; mm6=(BE,GE) word pairs, low half  (BEL,GEL)
+	punpckhwd mm4,mm2		; mm4=(BE,GE) word pairs, high half (BEH,GEH)
+	movq      mm1,mm6
+	movq      mm5,mm4
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movq      mm2,[GOTOFF(eax,PD_ONEHALF)]	; mm2=[PD_ONEHALF]
+
+	paddd     mm6, MMWORD [wk(6)]	; add the R/G partial sum saved in wk(6)
+	paddd     mm4, MMWORD [wk(7)]	; add the R/G partial sum saved in wk(7)
+	paddd     mm6,mm2
+	paddd     mm4,mm2
+	psrld     mm6,SCALEBITS		; mm6=YEL
+	psrld     mm4,SCALEBITS		; mm4=YEH
+	packssdw  mm6,mm4		; mm6=YE
+
+	psllw     mm0,BYTE_BIT		; move the odd-column results into the high byte of each word
+	por       mm6,mm0		; mm6=Y
+	movq      MMWORD [edi], mm6	; Save Y
+
+	pxor      mm2,mm2
+	pxor      mm4,mm4
+	punpcklwd mm2,mm3		; mm2=REL
+	punpckhwd mm4,mm3		; mm4=REH
+	psrld     mm2,1			; mm2=REL*FIX(0.500)
+	psrld     mm4,1			; mm4=REH*FIX(0.500)
+
+	movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
+
+	paddd     mm1,mm2
+	paddd     mm5,mm4
+	paddd     mm1,mm0
+	paddd     mm5,mm0
+	psrld     mm1,SCALEBITS		; mm1=CrEL
+	psrld     mm5,SCALEBITS		; mm5=CrEH
+	packssdw  mm1,mm5		; mm1=CrE
+
+	psllw     mm7,BYTE_BIT		; move the odd-column results into the high byte of each word
+	por       mm1,mm7		; mm1=Cr
+	movq      MMWORD [edx], mm1	; Save Cr
+
+	sub	ecx, byte SIZEOF_MMWORD			; one batch of SIZEOF_MMWORD columns is done
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; inptr
+	add	edi, byte SIZEOF_MMWORD			; outptr0
+	add	ebx, byte SIZEOF_MMWORD			; outptr1
+	add	edx, byte SIZEOF_MMWORD			; outptr2
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	near .columnloop			; another full batch remains
+	test	ecx,ecx
+	jnz	near .column_ld1			; a partial batch remains: gather it piecewise
+
+	pop	ecx			; col
+	pop	esi
+	pop	edi
+	pop	ebx
+	pop	edx
+	poppic	eax			; counterpart of the pushpic at the top of .rowloop
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW	; next row pointer of output plane 0
+	add	ebx, byte SIZEOF_JSAMPROW	; next row pointer of output plane 1
+	add	edx, byte SIZEOF_JSAMPROW	; next row pointer of output plane 2
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jcclrss2-64.asm b/jpeg/simd/jcclrss2-64.asm
new file mode 100644
index 000000000000..8ca47aa06cfe
--- /dev/null
+++ b/jpeg/simd/jcclrss2-64.asm
@@ -0,0 +1,487 @@
+;
+; jcclrss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
+;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                             JDIMENSION output_row, int num_rows);
+;
+
+; r10 = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13 = JDIMENSION output_row
+; r14 = int num_rows
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		8
+
+	align	16
+
+	global	EXTN(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = rsp after the push = address of the saved rbp
+	sub	rsp, byte 4			; step below the saved rbp before rounding rsp down
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax			; [aligned rsp] = unaligned frame address; reloaded by "pop rsp" in the epilogue
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]			; rsp = &wk[0]: reserves the WK_NUM xmmword scratch slots below rbp
+	collect_args				; macro defined outside this file; per the notes above it yields r10-r14
+	push	rbx
+
+	mov	ecx, r10d			; num_cols; img_width is a JDIMENSION - assuming 32-bit, so ignore the upper half of r10
+	test	rcx,rcx
+	jz	near .return			; img_width == 0: nothing to convert
+
+	push	rcx				; park num_cols while rcx is borrowed for output_row
+
+	mov rsi, r12
+	mov	ecx, r13d			; output_row is a JDIMENSION too: use only the low 32 bits, zero-extended for indexing
+	mov	rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]	; rdi = output_buf[0] (receives Y below)
+	mov	rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]	; rbx = output_buf[1] (receives Cb below)
+	mov	rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]	; rdx = output_buf[2] (receives Cr below)
+	lea	rdi, [rdi+rcx*SIZEOF_JSAMPROW]		; rdi = &output_buf[0][output_row]
+	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]		; rbx = &output_buf[1][output_row]
+	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]		; rdx = &output_buf[2][output_row]
+
+	pop	rcx				; rcx = num_cols again
+
+	mov rsi, r11
+	mov	eax, r14d			; rax = num_rows (low 32 bits, zero-extended)
+	test	eax,eax				; test the 32-bit value so a negative int num_rows is seen as negative
+	jle	near .return			; signed test: return when num_rows <= 0
+.rowloop:
+	push	rdx				; save the four row-pointer cursors; they are restored at the end of the row
+	push	rbx
+	push	rdi
+	push	rsi
+	push	rcx			; col
+
+	mov	rsi, JSAMPROW [rsi]	; inptr
+	mov	rdi, JSAMPROW [rdi]	; outptr0
+	mov	rbx, JSAMPROW [rbx]	; outptr1
+	mov	rdx, JSAMPROW [rdx]	; outptr2
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	near .columnloop	; unsigned: at least SIZEOF_XMMWORD columns remain, use the full-width loads
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	rax			; rax and rdx are borrowed as scratch to gather the trailing bytes
+	push	rdx
+	lea	rcx,[rcx+rcx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	rcx, byte SIZEOF_BYTE
+	movzx	rax, BYTE [rsi+rcx]	; rax = last remaining input byte
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	rcx, byte SIZEOF_WORD
+	movzx	rdx, WORD [rsi+rcx]	; rdx = the word that precedes what has been gathered so far
+	shl	rax, WORD_BIT
+	or	rax,rdx			; rax = earlier gathered byte placed above the new word
+.column_ld4:
+	movd	xmmA,eax
+	pop	rdx
+	pop	rax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	rcx, byte SIZEOF_DWORD
+	movd	xmmF, XMM_DWORD [rsi+rcx]
+	pslldq	xmmA, SIZEOF_DWORD
+	por	xmmA,xmmF		; xmmA = earlier gathered bytes placed above the new dword
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	sub	rcx, byte SIZEOF_MMWORD
+	movq	xmmB, XMM_MMWORD [rsi+rcx]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmB		; xmmA = earlier gathered bytes placed above the new 8 bytes
+.column_ld16:
+	test	cl, SIZEOF_XMMWORD
+	jz	short .column_ld32
+	movdqa	xmmF,xmmA		; gathered tail becomes the second xmmword of the group
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	mov	rcx, SIZEOF_XMMWORD	; count this as one full batch so the loop test after the conversion ends the row
+	jmp	short .rgb_ycc_cnv
+.column_ld32:
+	test	cl, 2*SIZEOF_XMMWORD
+	mov	rcx, SIZEOF_XMMWORD	; mov does not change flags, so the jz below still sees the test result
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmB,xmmA		; gathered tail becomes the third xmmword of the group
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	movdqu	xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	movdqa    xmmG,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+	movdqa    xmmD,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+	movdqa    xmmE,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+	pxor      xmmH,xmmH	; xmmH=0, used to widen bytes to words below
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmB,xmmE
+	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+	movdqa    xmmF,xmmD
+	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_XMMWORD/16
+	jz	short .column_ld2
+	sub	rcx, byte SIZEOF_XMMWORD/16
+	movd	xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]	; gather the last remaining pixel
+.column_ld2:
+	test	cl, SIZEOF_XMMWORD/8
+	jz	short .column_ld4
+	sub	rcx, byte SIZEOF_XMMWORD/8
+	movq	xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmE		; xmmA = earlier gathered pixel placed above the new 8 bytes
+.column_ld4:
+	test	cl, SIZEOF_XMMWORD/4
+	jz	short .column_ld8
+	sub	rcx, byte SIZEOF_XMMWORD/4
+	movdqa	xmmE,xmmA		; what was gathered so far moves up one register
+	movdqu	xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+	test	cl, SIZEOF_XMMWORD/2
+	mov	rcx, SIZEOF_XMMWORD	; count this as one full batch; mov leaves the flags of the test intact
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmF,xmmA		; gathered tail becomes the third and fourth xmmwords of the group
+	movdqa	xmmH,xmmE
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+	movdqu	xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+	movdqa    xmmC,xmmF
+	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+	movdqa    xmmB,xmmA
+	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+	movdqa    xmmG,xmmD
+	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+	movdqa    xmmE,xmmA
+	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+	movdqa    xmmH,xmmB
+	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+	pxor      xmmF,xmmF	; xmmF=0, used to widen bytes to words below
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmD,xmmB
+	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+	movdqa    xmmG,xmmE
+	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+	punpcklbw xmmF,xmmH	; no zero register is left: put the bytes in the high half of each word ...
+	punpckhbw xmmH,xmmH
+	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
+	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
+	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
+	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
+	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
+
+	movdqa    xmm6,xmm1
+	punpcklwd xmm1,xmm3		; xmm1=(RO,GO) word pairs, low half  (ROL,GOL)
+	punpckhwd xmm6,xmm3		; xmm6=(RO,GO) word pairs, high half (ROH,GOH)
+	movdqa    xmm7,xmm1
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      xmm1,xmm1		; zero words fill the low half of each dword, so B lands in the high word
+	pxor      xmm6,xmm6
+	punpcklwd xmm1,xmm5		; xmm1=BOL
+	punpckhwd xmm6,xmm5		; xmm6=BOH
+	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
+
+	movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm1		; add the 0.5*B term of Cb
+	paddd     xmm4,xmm6
+	paddd     xmm7,xmm5		; add the [PD_ONEHALFM1_CJ] constant
+	paddd     xmm4,xmm5
+	psrld     xmm7,SCALEBITS	; xmm7=CbOL
+	psrld     xmm4,SCALEBITS	; xmm4=CbOH
+	packssdw  xmm7,xmm4		; xmm7=CbO
+
+	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
+
+	movdqa    xmm6,xmm0
+	punpcklwd xmm0,xmm2		; xmm0=(RE,GE) word pairs, low half  (REL,GEL)
+	punpckhwd xmm6,xmm2		; xmm6=(RE,GE) word pairs, high half (REH,GEH)
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      xmm0,xmm0
+	pxor      xmm6,xmm6
+	punpcklwd xmm0,xmm1		; xmm0=BEL
+	punpckhwd xmm6,xmm1		; xmm6=BEH
+	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
+
+	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm5,xmm0
+	paddd     xmm4,xmm6
+	paddd     xmm5,xmm1
+	paddd     xmm4,xmm1
+	psrld     xmm5,SCALEBITS	; xmm5=CbEL
+	psrld     xmm4,SCALEBITS	; xmm4=CbEH
+	packssdw  xmm5,xmm4		; xmm5=CbE
+
+	psllw     xmm7,BYTE_BIT		; move the odd-column results into the high byte of each word
+	por       xmm5,xmm7		; xmm5=Cb
+	movdqa    XMMWORD [rbx], xmm5	; Save Cb
+
+	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
+	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
+
+	movdqa    xmm4,xmm0
+	punpcklwd xmm0,xmm3		; xmm0=(BO,GO) word pairs, low half  (BOL,GOL)
+	punpckhwd xmm4,xmm3		; xmm4=(BO,GO) word pairs, high half (BOH,GOH)
+	movdqa    xmm7,xmm0
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movdqa    xmm3,[rel PD_ONEHALF]	; xmm3=[PD_ONEHALF]
+
+	paddd     xmm0, XMMWORD [wk(4)]	; add the R/G partial sum saved in wk(4)
+	paddd     xmm4, XMMWORD [wk(5)]	; add the R/G partial sum saved in wk(5)
+	paddd     xmm0,xmm3
+	paddd     xmm4,xmm3
+	psrld     xmm0,SCALEBITS	; xmm0=YOL
+	psrld     xmm4,SCALEBITS	; xmm4=YOH
+	packssdw  xmm0,xmm4		; xmm0=YO
+
+	pxor      xmm3,xmm3
+	pxor      xmm4,xmm4
+	punpcklwd xmm3,xmm1		; xmm3=ROL
+	punpckhwd xmm4,xmm1		; xmm4=ROH
+	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
+
+	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm3
+	paddd     xmm5,xmm4
+	paddd     xmm7,xmm1
+	paddd     xmm5,xmm1
+	psrld     xmm7,SCALEBITS	; xmm7=CrOL
+	psrld     xmm5,SCALEBITS	; xmm5=CrOH
+	packssdw  xmm7,xmm5		; xmm7=CrO
+
+	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
+
+	movdqa    xmm4,xmm6
+	punpcklwd xmm6,xmm2		; xmm6=(BE,GE) word pairs, low half  (BEL,GEL)
+	punpckhwd xmm4,xmm2		; xmm4=(BE,GE) word pairs, high half (BEH,GEH)
+	movdqa    xmm1,xmm6
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movdqa    xmm2,[rel PD_ONEHALF]	; xmm2=[PD_ONEHALF]
+
+	paddd     xmm6, XMMWORD [wk(6)]	; add the R/G partial sum saved in wk(6)
+	paddd     xmm4, XMMWORD [wk(7)]	; add the R/G partial sum saved in wk(7)
+	paddd     xmm6,xmm2
+	paddd     xmm4,xmm2
+	psrld     xmm6,SCALEBITS	; xmm6=YEL
+	psrld     xmm4,SCALEBITS	; xmm4=YEH
+	packssdw  xmm6,xmm4		; xmm6=YE
+
+	psllw     xmm0,BYTE_BIT		; move the odd-column results into the high byte of each word
+	por       xmm6,xmm0		; xmm6=Y
+	movdqa    XMMWORD [rdi], xmm6	; Save Y
+
+	pxor      xmm2,xmm2
+	pxor      xmm4,xmm4
+	punpcklwd xmm2,xmm3		; xmm2=REL
+	punpckhwd xmm4,xmm3		; xmm4=REH
+	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
+
+	movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm1,xmm2
+	paddd     xmm5,xmm4
+	paddd     xmm1,xmm0
+	paddd     xmm5,xmm0
+	psrld     xmm1,SCALEBITS	; xmm1=CrEL
+	psrld     xmm5,SCALEBITS	; xmm5=CrEH
+	packssdw  xmm1,xmm5		; xmm1=CrE
+
+	psllw     xmm7,BYTE_BIT		; move the odd-column results into the high byte of each word
+	por       xmm1,xmm7		; xmm1=Cr
+	movdqa    XMMWORD [rdx], xmm1	; Save Cr
+
+	sub	rcx, byte SIZEOF_XMMWORD		; one batch of SIZEOF_XMMWORD columns is done
+	add	rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
+	add	rdi, byte SIZEOF_XMMWORD		; outptr0
+	add	rbx, byte SIZEOF_XMMWORD		; outptr1
+	add	rdx, byte SIZEOF_XMMWORD		; outptr2
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	near .columnloop			; another full batch remains
+	test	rcx,rcx
+	jnz	near .column_ld1			; a partial batch remains: gather it piecewise
+
+	pop	rcx			; col
+	pop	rsi
+	pop	rdi
+	pop	rbx
+	pop	rdx
+
+	add	rsi, byte SIZEOF_JSAMPROW	; input_buf
+	add	rdi, byte SIZEOF_JSAMPROW	; next row pointer of output plane 0
+	add	rbx, byte SIZEOF_JSAMPROW	; next row pointer of output plane 1
+	add	rdx, byte SIZEOF_JSAMPROW	; next row pointer of output plane 2
+	dec	rax				; num_rows
+	jg	near .rowloop
+
+.return:
+	pop	rbx
+	uncollect_args			; counterpart of collect_args in the prologue
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jcclrss2.asm b/jpeg/simd/jcclrss2.asm
new file mode 100644
index 000000000000..8def718cb172
--- /dev/null
+++ b/jpeg/simd/jcclrss2.asm
@@ -0,0 +1,505 @@
+;
+; jcclrss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
+;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                             JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION img_width  (b = original ebp)
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0		; slot at the aligned ebp where the prologue stores the original ebp
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		8		; number of xmmword scratch slots below the aligned ebp
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr (just below wk[0])
+
+	align	16
+
+	global	EXTN(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (stack args are addressed via eax)
+	sub	esp, byte 4			; room for the original-ebp slot
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; [original_ebp] = eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve xmmword wk[WK_NUM] below ebp
+	pushpic	eax		; make room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [img_width(eax)]
+	test	ecx,ecx
+	jz	near .return			; zero-width image: nothing to convert
+
+	push	ecx				; keep img_width; ecx is borrowed below
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]	; output_buf[0] (receives Y)
+	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]	; output_buf[1] (receives Cb)
+	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]	; output_buf[2] (receives Cr)
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]		; &output_buf[0][output_row]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]		; &output_buf[1][output_row]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]		; &output_buf[2][output_row]
+
+	pop	ecx				; ecx = img_width
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]	; last use of eax as the argument base
+	test	eax,eax
+	jle	near .return			; signed test: num_rows <= 0 means no rows
+	alignx	16,7
+.rowloop:
+	pushpic	eax		; (PIC) presumably saves num_rows; eax is loaded with the GOT address below
+	push	edx
+	push	ebx
+	push	edi
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	mov	ebx, JSAMPROW [ebx]	; outptr1
+	mov	edx, JSAMPROW [edx]	; outptr2
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop	; at least one full group of SIZEOF_XMMWORD columns
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+	; Partial group: load only the ecx*3 bytes that exist, smallest piece first, taken from the end.
+.column_ld1:
+	push	eax			; eax/edx become scratch for assembling the last 1-3 bytes
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	movzx	eax, BYTE [esi+ecx]
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	movzx	edx, WORD [esi+ecx]
+	shl	eax, WORD_BIT
+	or	eax,edx
+.column_ld4:
+	movd	xmmA,eax
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	xmmF, XMM_DWORD [esi+ecx]
+	pslldq	xmmA, SIZEOF_DWORD
+	por	xmmA,xmmF
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	sub	ecx, byte SIZEOF_MMWORD
+	movq	xmmB, XMM_MMWORD [esi+ecx]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmB
+.column_ld16:
+	test	cl, SIZEOF_XMMWORD
+	jz	short .column_ld32
+	movdqa	xmmF,xmmA
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	mov	ecx, SIZEOF_XMMWORD	; count this as one full group so the column loop ends after it
+	jmp	short .rgb_ycc_cnv
+.column_ld32:
+	test	cl, 2*SIZEOF_XMMWORD
+	mov	ecx, SIZEOF_XMMWORD	; mov leaves the flags from "test" intact for the jz below
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmB,xmmA
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	movdqu	xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	movdqa    xmmG,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+	movdqa    xmmD,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+	movdqa    xmmE,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+	pxor      xmmH,xmmH
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmB,xmmE
+	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+	movdqa    xmmF,xmmD
+	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+	; Partial group: ecx = remaining pixels, tested piecewise (SIZEOF_XMMWORD/16, /8, /4, /2 pixels per load).
+.column_ld1:
+	test	cl, SIZEOF_XMMWORD/16
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_XMMWORD/16
+	movd	xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_XMMWORD/8
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_XMMWORD/8
+	movq	xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmE
+.column_ld4:
+	test	cl, SIZEOF_XMMWORD/4
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_XMMWORD/4
+	movdqa	xmmE,xmmA
+	movdqu	xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+	test	cl, SIZEOF_XMMWORD/2
+	mov	ecx, SIZEOF_XMMWORD	; mov leaves the flags from "test" intact; count as one full group
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmF,xmmA
+	movdqa	xmmH,xmmE
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+	movdqu	xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+	movdqa    xmmC,xmmF
+	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+	movdqa    xmmB,xmmA
+	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+	movdqa    xmmG,xmmD
+	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+	movdqa    xmmE,xmmA
+	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+	movdqa    xmmH,xmmB
+	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+	pxor      xmmF,xmmF
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmD,xmmB
+	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+	movdqa    xmmG,xmmE
+	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+	punpcklbw xmmF,xmmH
+	punpckhbw xmmH,xmmH
+	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
+	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	; (0.587*G is split as 0.337*G + 0.250*G; FIX(0.587)=38470 exceeds pmaddwd's signed 16-bit range)
+	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
+	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
+	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
+	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
+
+	movdqa    xmm6,xmm1
+	punpcklwd xmm1,xmm3
+	punpckhwd xmm6,xmm3
+	movdqa    xmm7,xmm1
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      xmm1,xmm1
+	pxor      xmm6,xmm6
+	punpcklwd xmm1,xmm5		; xmm1=BOL
+	punpckhwd xmm6,xmm5		; xmm6=BOH
+	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
+
+	movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm1
+	paddd     xmm4,xmm6
+	paddd     xmm7,xmm5
+	paddd     xmm4,xmm5
+	psrld     xmm7,SCALEBITS	; xmm7=CbOL
+	psrld     xmm4,SCALEBITS	; xmm4=CbOH
+	packssdw  xmm7,xmm4		; xmm7=CbO
+
+	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
+
+	movdqa    xmm6,xmm0
+	punpcklwd xmm0,xmm2
+	punpckhwd xmm6,xmm2
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      xmm0,xmm0
+	pxor      xmm6,xmm6
+	punpcklwd xmm0,xmm1		; xmm0=BEL
+	punpckhwd xmm6,xmm1		; xmm6=BEH
+	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
+
+	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm5,xmm0
+	paddd     xmm4,xmm6
+	paddd     xmm5,xmm1
+	paddd     xmm4,xmm1
+	psrld     xmm5,SCALEBITS	; xmm5=CbEL
+	psrld     xmm4,SCALEBITS	; xmm4=CbEH
+	packssdw  xmm5,xmm4		; xmm5=CbE
+
+	psllw     xmm7,BYTE_BIT		; odd-column Cb -> high byte of each word
+	por       xmm5,xmm7		; xmm5=Cb
+	movdqa    XMMWORD [ebx], xmm5	; Save Cb
+
+	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
+	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
+
+	movdqa    xmm4,xmm0
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm4,xmm3
+	movdqa    xmm7,xmm0
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)]	; xmm3=[PD_ONEHALF]
+
+	paddd     xmm0, XMMWORD [wk(4)]
+	paddd     xmm4, XMMWORD [wk(5)]
+	paddd     xmm0,xmm3
+	paddd     xmm4,xmm3
+	psrld     xmm0,SCALEBITS	; xmm0=YOL
+	psrld     xmm4,SCALEBITS	; xmm4=YOH
+	packssdw  xmm0,xmm4		; xmm0=YO
+
+	pxor      xmm3,xmm3
+	pxor      xmm4,xmm4
+	punpcklwd xmm3,xmm1		; xmm3=ROL
+	punpckhwd xmm4,xmm1		; xmm4=ROH
+	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
+
+	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm3
+	paddd     xmm5,xmm4
+	paddd     xmm7,xmm1
+	paddd     xmm5,xmm1
+	psrld     xmm7,SCALEBITS	; xmm7=CrOL
+	psrld     xmm5,SCALEBITS	; xmm5=CrOH
+	packssdw  xmm7,xmm5		; xmm7=CrO
+
+	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
+
+	movdqa    xmm4,xmm6
+	punpcklwd xmm6,xmm2
+	punpckhwd xmm4,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)]	; xmm2=[PD_ONEHALF]
+
+	paddd     xmm6, XMMWORD [wk(6)]
+	paddd     xmm4, XMMWORD [wk(7)]
+	paddd     xmm6,xmm2
+	paddd     xmm4,xmm2
+	psrld     xmm6,SCALEBITS	; xmm6=YEL
+	psrld     xmm4,SCALEBITS	; xmm4=YEH
+	packssdw  xmm6,xmm4		; xmm6=YE
+
+	psllw     xmm0,BYTE_BIT		; odd-column Y -> high byte of each word
+	por       xmm6,xmm0		; xmm6=Y
+	movdqa    XMMWORD [edi], xmm6	; Save Y
+
+	pxor      xmm2,xmm2
+	pxor      xmm4,xmm4
+	punpcklwd xmm2,xmm3		; xmm2=REL
+	punpckhwd xmm4,xmm3		; xmm4=REH
+	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
+
+	movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm1,xmm2
+	paddd     xmm5,xmm4
+	paddd     xmm1,xmm0
+	paddd     xmm5,xmm0
+	psrld     xmm1,SCALEBITS	; xmm1=CrEL
+	psrld     xmm5,SCALEBITS	; xmm5=CrEH
+	packssdw  xmm1,xmm5		; xmm1=CrE
+
+	psllw     xmm7,BYTE_BIT		; odd-column Cr -> high byte of each word
+	por       xmm1,xmm7		; xmm1=Cr
+	movdqa    XMMWORD [edx], xmm1	; Save Cr
+
+	sub	ecx, byte SIZEOF_XMMWORD	; one group of columns converted
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
+	add	edi, byte SIZEOF_XMMWORD		; outptr0
+	add	ebx, byte SIZEOF_XMMWORD		; outptr1
+	add	edx, byte SIZEOF_XMMWORD		; outptr2
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop	; another full group remains
+	test	ecx,ecx
+	jnz	near .column_ld1	; some columns remain, but less than a full group
+
+	pop	ecx			; col
+	pop	esi
+	pop	edi
+	pop	ebx
+	pop	edx
+	poppic	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jccolmmx.asm b/jpeg/simd/jccolmmx.asm
new file mode 100644
index 000000000000..5e7f3be994ee
--- /dev/null
+++ b/jpeg/simd/jccolmmx.asm
@@ -0,0 +1,120 @@
+;
+; jccolmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16	; fixed-point fraction bits: FIX(x) = x * 2^SCALEBITS, rounded to nearest
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337	times 2 dw  F_0_299, F_0_337	; ( 0.299,  0.337) word pairs
+PW_F0114_F0250	times 2 dw  F_0_114, F_0_250	; ( 0.114,  0.250) word pairs
+PW_MF016_MF033	times 2 dw -F_0_168,-F_0_331	; (-0.168, -0.331) word pairs
+PW_MF008_MF041	times 2 dw -F_0_081,-F_0_418	; (-0.081, -0.418) word pairs
+PD_ONEHALFM1_CJ	times 2 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)	; 0.5 minus 1 LSB, plus CENTERJSAMPLE, in fixed point
+PD_ONEHALF	times 2 dd  (1 << (SCALEBITS-1))	; 0.5 in fixed point
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx
+%include "jcclrmmx.asm"
diff --git a/jpeg/simd/jccolss2-64.asm b/jpeg/simd/jccolss2-64.asm
new file mode 100644
index 000000000000..64ee0ba85d51
--- /dev/null
+++ b/jpeg/simd/jccolss2-64.asm
@@ -0,0 +1,117 @@
+;
+; jccolss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16	; fixed-point fraction bits: FIX(x) = x * 2^SCALEBITS, rounded to nearest
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337	times 4 dw  F_0_299, F_0_337	; ( 0.299,  0.337) word pairs
+PW_F0114_F0250	times 4 dw  F_0_114, F_0_250	; ( 0.114,  0.250) word pairs
+PW_MF016_MF033	times 4 dw -F_0_168,-F_0_331	; (-0.168, -0.331) word pairs
+PW_MF008_MF041	times 4 dw -F_0_081,-F_0_418	; (-0.081, -0.418) word pairs
+PD_ONEHALFM1_CJ	times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)	; 0.5 minus 1 LSB, plus CENTERJSAMPLE, in fixed point
+PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))	; 0.5 in fixed point
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jcclrss2-64.asm"
diff --git a/jpeg/simd/jccolss2.asm b/jpeg/simd/jccolss2.asm
new file mode 100644
index 000000000000..8d1f73406e8a
--- /dev/null
+++ b/jpeg/simd/jccolss2.asm
@@ -0,0 +1,117 @@
+;
+; jccolss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16	; fixed-point fraction bits: FIX(x) = x * 2^SCALEBITS, rounded to nearest
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337	times 4 dw  F_0_299, F_0_337	; Y:  ( 0.299,  0.337) pairs for (R,G) words
+PW_F0114_F0250	times 4 dw  F_0_114, F_0_250	; Y:  ( 0.114,  0.250) pairs for (B,G) words
+PW_MF016_MF033	times 4 dw -F_0_168,-F_0_331	; Cb: (-0.168, -0.331) pairs for (R,G) words
+PW_MF008_MF041	times 4 dw -F_0_081,-F_0_418	; Cr: (-0.081, -0.418) pairs for (B,G) words
+PD_ONEHALFM1_CJ	times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)	; Cb/Cr: 0.5 minus 1 LSB, plus CENTERJSAMPLE, in fixed point
+PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))	; Y: 0.5 in fixed point (rounding term)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jcclrss2.asm"
diff --git a/jpeg/simd/jcolsamp.inc b/jpeg/simd/jcolsamp.inc
new file mode 100644
index 000000000000..79751b7c7282
--- /dev/null
+++ b/jpeg/simd/jcolsamp.inc
@@ -0,0 +1,105 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; [TAB8]
+
+; --------------------------------------------------------------------------
+
+; pseudo-registers to make ordering of RGB configurable
+; (pairs A/B, C/D, E/F, G/H stand for pixel byte 0, 1, 2, 3; red -> regs 0/1, green -> 2/3, blue -> 4/5, any other byte -> 6/7)
+%if RGB_RED == 0		; pixel byte 0 -> A/B
+%define  mmA  mm0
+%define  mmB  mm1
+%define xmmA xmm0
+%define xmmB xmm1
+%elif RGB_GREEN == 0
+%define  mmA  mm2
+%define  mmB  mm3
+%define xmmA xmm2
+%define xmmB xmm3
+%elif RGB_BLUE == 0
+%define  mmA  mm4
+%define  mmB  mm5
+%define xmmA xmm4
+%define xmmB xmm5
+%else
+%define  mmA  mm6
+%define  mmB  mm7
+%define xmmA xmm6
+%define xmmB xmm7
+%endif
+
+%if RGB_RED == 1		; pixel byte 1 -> C/D
+%define  mmC  mm0
+%define  mmD  mm1
+%define xmmC xmm0
+%define xmmD xmm1
+%elif RGB_GREEN == 1
+%define  mmC  mm2
+%define  mmD  mm3
+%define xmmC xmm2
+%define xmmD xmm3
+%elif RGB_BLUE == 1
+%define  mmC  mm4
+%define  mmD  mm5
+%define xmmC xmm4
+%define xmmD xmm5
+%else
+%define  mmC  mm6
+%define  mmD  mm7
+%define xmmC xmm6
+%define xmmD xmm7
+%endif
+
+%if RGB_RED == 2		; pixel byte 2 -> E/F
+%define  mmE  mm0
+%define  mmF  mm1
+%define xmmE xmm0
+%define xmmF xmm1
+%elif RGB_GREEN == 2
+%define  mmE  mm2
+%define  mmF  mm3
+%define xmmE xmm2
+%define xmmF xmm3
+%elif RGB_BLUE == 2
+%define  mmE  mm4
+%define  mmF  mm5
+%define xmmE xmm4
+%define xmmF xmm5
+%else
+%define  mmE  mm6
+%define  mmF  mm7
+%define xmmE xmm6
+%define xmmF xmm7
+%endif
+
+%if RGB_RED == 3		; pixel byte 3 -> G/H
+%define  mmG  mm0
+%define  mmH  mm1
+%define xmmG xmm0
+%define xmmH xmm1
+%elif RGB_GREEN == 3
+%define  mmG  mm2
+%define  mmH  mm3
+%define xmmG xmm2
+%define xmmH xmm3
+%elif RGB_BLUE == 3
+%define  mmG  mm4
+%define  mmH  mm5
+%define xmmG xmm4
+%define xmmH xmm5
+%else
+%define  mmG  mm6
+%define  mmH  mm7
+%define xmmG xmm6
+%define xmmH xmm7
+%endif
+
+; --------------------------------------------------------------------------
diff --git a/jpeg/simd/jcqnt3dn.asm b/jpeg/simd/jcqnt3dn.asm
new file mode 100644
index 000000000000..182c8695276d
--- /dev/null
+++ b/jpeg/simd/jcqnt3dn.asm
@@ -0,0 +1,233 @@
+;
+; jcqnt3dn.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+; and int->float conversion (pi2fd). Two sample rows are processed per loop pass.
+; GLOBAL(void)
+; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                             FAST_FLOAT * workspace);
+; Writes eax, ecx, edx and mm0-mm7; MMX/3DNow! state is emptied (femms) on exit.
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_float_3dnow)
+
+EXTN(jsimd_convsamp_float_3dnow):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  mm7,mm7		; mm7 = all 1's
+	psllw    mm7,7			; mm7 = {0xFF80 0xFF80 0xFF80 0xFF80}
+	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2			; loop counter (2 rows per pass)
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; 8 samples of first row
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; 8 samples of second row
+
+	psubb	mm0,mm7				; mm0=(01234567)
+	psubb	mm1,mm7				; mm1=(89ABCDEF)
+	; In the diagrams below '*' is a don't-care byte; the psrad shifts discard it.
+	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
+	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
+	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
+	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
+
+	punpcklwd mm4,mm2			; mm4=(***0***1)
+	punpckhwd mm2,mm2			; mm2=(***2***3)
+	punpcklwd mm5,mm0			; mm5=(***4***5)
+	punpckhwd mm0,mm0			; mm0=(***6***7)
+
+	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
+	psrad	mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
+	pi2fd	mm4,mm4				; int32 -> float
+	pi2fd	mm2,mm2
+	psrad	mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
+	psrad	mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
+	pi2fd	mm5,mm5
+	pi2fd	mm0,mm0
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+
+	punpcklwd mm6,mm3			; mm6=(***8***9)
+	punpckhwd mm3,mm3			; mm3=(***A***B)
+	punpcklwd mm4,mm1			; mm4=(***C***D)
+	punpckhwd mm1,mm1			; mm1=(***E***F)
+
+	psrad	mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
+	psrad	mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
+	pi2fd	mm6,mm6
+	pi2fd	mm3,mm3
+	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
+	psrad	mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
+	pi2fd	mm4,mm4
+	pi2fd	mm1,mm1
+
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+	add	esi, byte 2*SIZEOF_JSAMPROW		; next pair of row pointers
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; next two workspace rows
+	dec	ecx
+	jnz	near .convloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+; (each workspace float is multiplied by its divisors[] entry, then rounded).
+; GLOBAL(void)
+; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                             FAST_FLOAT * workspace);
+; Writes eax, edx and mm0-mm7; MMX/3DNow! state is emptied (femms) on exit.
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_float_3dnow)
+
+EXTN(jsimd_quantize_float_3dnow):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov       eax, 0x4B400000	; (float)0x00C00000 (rndint_magic)
+	movd      mm7,eax
+	punpckldq mm7,mm7		; mm7={12582912.0F 12582912.0F}
+	; After "pfadd x,mm7" the low word of each float holds the rounded integer.
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16		; loop counter (16 coefficients per pass)
+	alignx	16,7
+.quantloop:
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+	pfadd	mm0,mm7			; mm0=(00 ** 01 **)
+	pfadd	mm1,mm7			; mm1=(02 ** 03 **)
+	pfadd	mm2,mm7			; mm2=(04 ** 05 **)
+	pfadd	mm3,mm7			; mm3=(06 ** 07 **)
+
+	movq      mm4,mm0
+	punpcklwd mm0,mm1		; mm0=(00 02 ** **)
+	punpckhwd mm4,mm1		; mm4=(01 03 ** **)
+	movq      mm5,mm2
+	punpcklwd mm2,mm3		; mm2=(04 06 ** **)
+	punpckhwd mm5,mm3		; mm5=(05 07 ** **)
+
+	punpcklwd mm0,mm4		; mm0=(00 01 02 03)
+	punpcklwd mm2,mm5		; mm2=(04 05 06 07)
+
+	movq	mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+	pfadd	mm6,mm7			; mm6=(10 ** 11 **)
+	pfadd	mm1,mm7			; mm1=(12 ** 13 **)
+	pfadd	mm3,mm7			; mm3=(14 ** 15 **)
+	pfadd	mm4,mm7			; mm4=(16 ** 17 **)
+
+	movq      mm5,mm6
+	punpcklwd mm6,mm1		; mm6=(10 12 ** **)
+	punpckhwd mm5,mm1		; mm5=(11 13 ** **)
+	movq      mm1,mm3
+	punpcklwd mm3,mm4		; mm3=(14 16 ** **)
+	punpckhwd mm1,mm4		; mm1=(15 17 ** **)
+
+	punpcklwd mm6,mm5		; mm6=(10 11 12 13)
+	punpcklwd mm3,mm1		; mm3=(14 15 16 17)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT	; advance workspace by 16 floats
+	add	edx, byte 16*SIZEOF_FAST_FLOAT	; advance divisors by 16 floats
+	add	edi, byte 16*SIZEOF_JCOEF	; advance output by 16 coefficients
+	dec	eax
+	jnz	near .quantloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jcqntmmx.asm b/jpeg/simd/jcqntmmx.asm
new file mode 100644
index 000000000000..08b08b79e216
--- /dev/null
+++ b/jpeg/simd/jcqntmmx.asm
@@ -0,0 +1,274 @@
+;
+; jcqntmmx.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+; (samples are zero-extended to 16-bit words, then 0xFF80 is added to each word).
+; GLOBAL(void)
+; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM * workspace);
+; Writes eax, ecx, edx and mm0-mm7; MMX state is emptied (emms) on exit.
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pxor	mm6,mm6			; mm6=(all 0's)
+	pcmpeqw	mm7,mm7
+	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4			; loop counter (4 rows per pass)
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm0=(01234567)
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm1=(89ABCDEF)
+
+	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm2=(GHIJKLMN)
+	movq	mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm3=(OPQRSTUV)
+
+	movq      mm4,mm0
+	punpcklbw mm0,mm6		; mm0=(0123)
+	punpckhbw mm4,mm6		; mm4=(4567)
+	movq      mm5,mm1
+	punpcklbw mm1,mm6		; mm1=(89AB)
+	punpckhbw mm5,mm6		; mm5=(CDEF)
+
+	paddw	mm0,mm7			; add 0xFF80 to every word
+	paddw	mm4,mm7
+	paddw	mm1,mm7
+	paddw	mm5,mm7
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+	movq      mm0,mm2
+	punpcklbw mm2,mm6		; mm2=(GHIJ)
+	punpckhbw mm0,mm6		; mm0=(KLMN)
+	movq      mm4,mm3
+	punpcklbw mm3,mm6		; mm3=(OPQR)
+	punpckhbw mm4,mm6		; mm4=(STUV)
+
+	paddw	mm2,mm7
+	paddw	mm0,mm7
+	paddw	mm3,mm7
+	paddw	mm4,mm7
+
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+	add	esi, byte 4*SIZEOF_JSAMPROW		; next four row pointers
+	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM	; next four workspace rows
+	dec	ecx
+	jnz	short .convloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
+;                     DCTELEM * workspace);
+; Writes eax, edx and mm0-mm7; MMX state is emptied (emms) on exit.
+; The macros below address sub-tables of divisors[] at row offsets DCTSIZE*0..3.
+%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+%define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
+; (SHIFT is not referenced by the code in this function.)
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	ah, 2			; outer loop counter
+	alignx	16,7
+.quantloop1:
+	mov	al, DCTSIZE2/8/2	; inner loop counter (8 coefficients per pass)
+	alignx	16,7
+.quantloop2:
+	movq	mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+	movq	mm0,mm2
+	movq	mm1,mm3
+
+	psraw	mm2,(WORD_BIT-1)  ; -1 if value < 0, 0 otherwise
+	psraw	mm3,(WORD_BIT-1)
+
+	pxor	mm0,mm2   ; if (val < 0) val = -val  (xor with mask, subtract mask)
+	pxor	mm1,mm3
+	psubw	mm0,mm2
+	psubw	mm1,mm3
+
+	;
+	; MMX is an annoyingly crappy instruction set. It has two
+	; misfeatures that are causing problems here:
+	;
+	; - All multiplications are signed.
+	;
+	; - The second operand for the shifts is not treated as packed.
+	;
+	;
+	; We work around the first problem by implementing this algorithm:
+	;
+	; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+	; {
+	;   enum { SHORT_BIT = 16 };
+	;   signed short sx = (signed short) x;
+	;   signed short sy = (signed short) y;
+	;   signed long sz;
+	;
+	;   sz = (long) sx * (long) sy;     /* signed multiply */
+	;
+	;   if (sx < 0) sz += (long) sy << SHORT_BIT;
+	;   if (sy < 0) sz += (long) sx << SHORT_BIT;
+	;
+	;   return (unsigned long) sz;
+	; }
+	;
+	; (note that a negative sx adds _sy_ and vice versa)
+	;
+	; For the second problem, we replace the shift by a multiplication.
+	; Unfortunately that means we have to deal with the signed issue again.
+	;
+
+	paddw	mm0, MMWORD [CORRECTION(0,0,edx)]   ; correction + roundfactor
+	paddw	mm1, MMWORD [CORRECTION(0,1,edx)]
+
+	movq	mm4,mm0   ; store current value for later
+	movq	mm5,mm1
+	pmulhw	mm0, MMWORD [RECIPROCAL(0,0,edx)]   ; reciprocal
+	pmulhw	mm1, MMWORD [RECIPROCAL(0,1,edx)]
+	paddw	mm0,mm4		; reciprocal is always negative (MSB=1),
+	paddw	mm1,mm5   ; so we always need to add the initial value
+	                ; (input value is never negative as we
+	                ; inverted it at the start of this routine)
+
+	; here it gets a bit tricky as both scale
+	; and mm0/mm1 can be negative
+	movq	mm6, MMWORD [SCALE(0,0,edx)]	; scale
+	movq	mm7, MMWORD [SCALE(0,1,edx)]
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pmulhw	mm0,mm6
+	pmulhw	mm1,mm7
+
+	psraw	mm6,(WORD_BIT-1)    ; determine if scale is negative
+	psraw	mm7,(WORD_BIT-1)
+
+	pand	mm6,mm4             ; and add input if it is
+	pand	mm7,mm5
+	paddw	mm0,mm6
+	paddw	mm1,mm7
+
+	psraw	mm4,(WORD_BIT-1)    ; then check if negative input
+	psraw	mm5,(WORD_BIT-1)
+
+	pand	mm4, MMWORD [SCALE(0,0,edx)]	; and add scale if it is
+	pand	mm5, MMWORD [SCALE(0,1,edx)]
+	paddw	mm0,mm4
+	paddw	mm1,mm5
+
+	pxor	mm0,mm2   ; restore the original sign using the masks saved in mm2/mm3
+	pxor	mm1,mm3
+	psubw	mm0,mm2
+	psubw	mm1,mm3
+	; NOTE(review): stores index with SIZEOF_DCTELEM, edi advances by SIZEOF_JCOEF; presumably equal - confirm
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+	add	esi, byte 8*SIZEOF_DCTELEM
+	add	edx, byte 8*SIZEOF_DCTELEM
+	add	edi, byte 8*SIZEOF_JCOEF
+	dec	al
+	jnz	near .quantloop2
+	dec	ah
+	jnz	near .quantloop1	; to avoid branch misprediction
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jcqnts2f-64.asm b/jpeg/simd/jcqnts2f-64.asm
new file mode 100644
index 000000000000..d0efa1b909f6
--- /dev/null
+++ b/jpeg/simd/jcqnts2f-64.asm
@@ -0,0 +1,158 @@
+;
+; jcqnts2f-64.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Load data into workspace, applying unsigned->signed conversion
+; and int->float conversion (cvtdq2ps). Two sample rows are processed per pass.
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT * workspace);
+; Writes rax, rcx, rdx, rsi, rdi and xmm0-xmm3, xmm7 (rbx is saved/restored here).
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col (assumed 32-bit; upper half of r11 is undefined)
+; r12 = FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+	push	rbp
+	mov	rax,rsp		; presumably consumed by collect_args (defined elsewhere) - confirm
+	mov	rbp,rsp
+	collect_args
+	push	rbx
+
+	pcmpeqw  xmm7,xmm7
+	psllw    xmm7,7
+	packsswb xmm7,xmm7		; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	rsi, r10
+	mov	eax, r11d		; zero-extends start_col; never index with garbage upper bits
+	mov	rdi, r12
+	mov	rcx, DCTSIZE/2		; loop counter (2 rows per pass)
+.convloop:
+	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
+	movq	xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
+
+	psubb	xmm0,xmm7			; xmm0=(01234567)
+	psubb	xmm1,xmm7			; xmm1=(89ABCDEF)
+	; In the diagrams below '*' is a don't-care byte; the psrad shifts discard it.
+	punpcklbw xmm0,xmm0			; xmm0=(*0*1*2*3*4*5*6*7)
+	punpcklbw xmm1,xmm1			; xmm1=(*8*9*A*B*C*D*E*F)
+
+	punpcklwd xmm2,xmm0			; xmm2=(***0***1***2***3)
+	punpckhwd xmm0,xmm0			; xmm0=(***4***5***6***7)
+	punpcklwd xmm3,xmm1			; xmm3=(***8***9***A***B)
+	punpckhwd xmm1,xmm1			; xmm1=(***C***D***E***F)
+
+	psrad     xmm2,(DWORD_BIT-BYTE_BIT)	; xmm2=(0123)
+	psrad     xmm0,(DWORD_BIT-BYTE_BIT)	; xmm0=(4567)
+	cvtdq2ps  xmm2,xmm2			; xmm2=(0123)
+	cvtdq2ps  xmm0,xmm0			; xmm0=(4567)
+	psrad     xmm3,(DWORD_BIT-BYTE_BIT)	; xmm3=(89AB)
+	psrad     xmm1,(DWORD_BIT-BYTE_BIT)	; xmm1=(CDEF)
+	cvtdq2ps  xmm3,xmm3			; xmm3=(89AB)
+	cvtdq2ps  xmm1,xmm1			; xmm1=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+	add	rsi, byte 2*SIZEOF_JSAMPROW
+	add	rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	rcx
+	jnz	short .convloop
+
+	pop	rbx
+	uncollect_args
+	pop	rbp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+; (workspace * divisors, converted to int32, then packed to 16-bit with saturation).
+; GLOBAL(void)
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                         FAST_FLOAT * workspace);
+; Writes rax, rdx, rsi, rdi and xmm0-xmm3.
+
+; r10 = JCOEFPTR coef_block
+; r11 = FAST_FLOAT * divisors
+; r12 = FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+	push	rbp
+	mov	rax,rsp		; presumably consumed by collect_args (defined elsewhere) - confirm
+	mov	rbp,rsp
+	collect_args
+
+	mov rsi, r12
+	mov rdx, r11
+	mov rdi, r10
+	mov	rax, DCTSIZE2/16	; loop counter (16 coefficients per pass)
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+	cvtps2dq xmm0,xmm0		; float -> int32 (rounding per current MXCSR mode)
+	cvtps2dq xmm1,xmm1
+	cvtps2dq xmm2,xmm2
+	cvtps2dq xmm3,xmm3
+
+	packssdw xmm0,xmm1		; 8 x int32 -> 8 x int16, signed saturation
+	packssdw xmm2,xmm3
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+	add	rsi, byte 16*SIZEOF_FAST_FLOAT
+	add	rdx, byte 16*SIZEOF_FAST_FLOAT
+	add	rdi, byte 16*SIZEOF_JCOEF
+	dec	rax
+	jnz	short .quantloop
+
+	uncollect_args
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jcqnts2f.asm b/jpeg/simd/jcqnts2f.asm
new file mode 100644
index 000000000000..d80ae5dc9986
--- /dev/null
+++ b/jpeg/simd/jcqnts2f.asm
@@ -0,0 +1,171 @@
+;
+; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+; and int->float conversion (cvtdq2ps). Two sample rows are processed per pass.
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT * workspace);
+; Writes eax, ecx, edx and xmm0-xmm3, xmm7 (ebx, esi, edi are saved/restored).
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  xmm7,xmm7
+	psllw    xmm7,7
+	packsswb xmm7,xmm7		; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2			; loop counter (2 rows per pass)
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	xmm0,xmm7			; xmm0=(01234567)
+	psubb	xmm1,xmm7			; xmm1=(89ABCDEF)
+	; In the diagrams below '*' is a don't-care byte; the psrad shifts discard it.
+	punpcklbw xmm0,xmm0			; xmm0=(*0*1*2*3*4*5*6*7)
+	punpcklbw xmm1,xmm1			; xmm1=(*8*9*A*B*C*D*E*F)
+
+	punpcklwd xmm2,xmm0			; xmm2=(***0***1***2***3)
+	punpckhwd xmm0,xmm0			; xmm0=(***4***5***6***7)
+	punpcklwd xmm3,xmm1			; xmm3=(***8***9***A***B)
+	punpckhwd xmm1,xmm1			; xmm1=(***C***D***E***F)
+
+	psrad     xmm2,(DWORD_BIT-BYTE_BIT)	; xmm2=(0123)
+	psrad     xmm0,(DWORD_BIT-BYTE_BIT)	; xmm0=(4567)
+	cvtdq2ps  xmm2,xmm2			; xmm2=(0123)
+	cvtdq2ps  xmm0,xmm0			; xmm0=(4567)
+	psrad     xmm3,(DWORD_BIT-BYTE_BIT)	; xmm3=(89AB)
+	psrad     xmm1,(DWORD_BIT-BYTE_BIT)	; xmm1=(CDEF)
+	cvtdq2ps  xmm3,xmm3			; xmm3=(89AB)
+	cvtdq2ps  xmm1,xmm1			; xmm1=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+	add	esi, byte 2*SIZEOF_JSAMPROW		; next pair of row pointers
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; next two workspace rows
+	dec	ecx
+	jnz	short .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+; (workspace * divisors, converted to int32, then packed to 16-bit with saturation).
+; GLOBAL(void)
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                         FAST_FLOAT * workspace);
+; Writes eax, edx and xmm0-xmm3 (esi, edi are saved/restored).
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16		; loop counter (16 coefficients per pass)
+	alignx	16,7
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	cvtps2dq xmm0,xmm0		; float -> int32 (rounding per current MXCSR mode)
+	cvtps2dq xmm1,xmm1
+	cvtps2dq xmm2,xmm2
+	cvtps2dq xmm3,xmm3
+
+	packssdw xmm0,xmm1		; 8 x int32 -> 8 x int16, signed saturation
+	packssdw xmm2,xmm3
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT	; advance workspace by 16 floats
+	add	edx, byte 16*SIZEOF_FAST_FLOAT	; advance divisors by 16 floats
+	add	edi, byte 16*SIZEOF_JCOEF	; advance output by 16 coefficients
+	dec	eax
+	jnz	short .quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jcqnts2i-64.asm b/jpeg/simd/jcqnts2i-64.asm
new file mode 100644
index 000000000000..cc33d59f9981
--- /dev/null
+++ b/jpeg/simd/jcqnts2i-64.asm
@@ -0,0 +1,187 @@
+;
+; jcqnts2i-64.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Load data into workspace, applying unsigned->signed conversion
+; (samples are zero-extended to 16-bit words, then 0xFF80 is added to each word).
+; GLOBAL(void)
+; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      DCTELEM * workspace);
+; Writes rax, rcx, rdx, rsi, rdi and xmm0-xmm3, xmm6, xmm7 (rbx is saved/restored here).
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col (assumed 32-bit; upper half of r11 is undefined)
+; r12 = DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+	push	rbp
+	mov	rax,rsp		; presumably consumed by collect_args (defined elsewhere) - confirm
+	mov	rbp,rsp
+	collect_args
+	push	rbx
+
+	pxor	xmm6,xmm6		; xmm6=(all 0's)
+	pcmpeqw	xmm7,xmm7
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	mov	rsi, r10
+	mov	eax, r11d		; zero-extends start_col; never index with garbage upper bits
+	mov	rdi, r12
+	mov	rcx, DCTSIZE/4		; loop counter (4 rows per pass)
+.convloop:
+	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
+	movq	xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)
+
+	mov	rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]	; xmm2=(GHIJKLMN)
+	movq	xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]	; xmm3=(OPQRSTUV)
+
+	punpcklbw xmm0,xmm6		; xmm0=(01234567)
+	punpcklbw xmm1,xmm6		; xmm1=(89ABCDEF)
+	paddw     xmm0,xmm7		; add 0xFF80 to every word
+	paddw     xmm1,xmm7
+	punpcklbw xmm2,xmm6		; xmm2=(GHIJKLMN)
+	punpcklbw xmm3,xmm6		; xmm3=(OPQRSTUV)
+	paddw     xmm2,xmm7
+	paddw     xmm3,xmm7
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+	add	rsi, byte 4*SIZEOF_JSAMPROW
+	add	rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	rcx
+	jnz	short .convloop
+
+	pop	rbx
+	uncollect_args
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
+;                      DCTELEM * workspace);
+; Writes rax, rdx, rsi, rdi and xmm0-xmm7.
+; The macros below address sub-tables of divisors[] at row offsets DCTSIZE*0..2.
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM * divisors
+; r12 = DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+	push	rbp
+	mov	rax,rsp		; presumably consumed by collect_args (defined elsewhere) - confirm
+	mov	rbp,rsp
+	collect_args
+
+	mov rsi, r12
+	mov rdx, r11
+	mov rdi, r10
+	mov	rax, DCTSIZE2/32	; loop counter (32 coefficients per pass)
+.quantloop:
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm5
+	movdqa	xmm2,xmm6
+	movdqa	xmm3,xmm7
+	psraw	xmm4,(WORD_BIT-1)	; xmm4-xmm7 = sign masks (-1 if value < 0, else 0)
+	psraw	xmm5,(WORD_BIT-1)
+	psraw	xmm6,(WORD_BIT-1)
+	psraw	xmm7,(WORD_BIT-1)
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4		; if (xmm0 < 0) xmm0 = -xmm0;
+	psubw	xmm1,xmm5		; if (xmm1 < 0) xmm1 = -xmm1;
+	psubw	xmm2,xmm6		; if (xmm2 < 0) xmm2 = -xmm2;
+	psubw	xmm3,xmm7		; if (xmm3 < 0) xmm3 = -xmm3;
+
+	paddw	xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
+	paddw	xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+	paddw	xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+	paddw	xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+	pmulhuw	xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
+	pmulhuw	xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+	pmulhuw	xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+	pmulhuw	xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+	pmulhuw	xmm0, XMMWORD [SCALE(0,0,rdx)]	; scale
+	pmulhuw	xmm1, XMMWORD [SCALE(1,0,rdx)]
+	pmulhuw	xmm2, XMMWORD [SCALE(2,0,rdx)]
+	pmulhuw	xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+	pxor	xmm0,xmm4		; restore the original sign using the saved masks
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4
+	psubw	xmm1,xmm5
+	psubw	xmm2,xmm6
+	psubw	xmm3,xmm7
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0	; NOTE(review): SIZEOF_DCTELEM here vs
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1	; SIZEOF_JCOEF in the rdi advance below;
+	movdqa	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2	; presumably the two sizes are equal -
+	movdqa	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3	; confirm
+
+	add	rsi, byte 32*SIZEOF_DCTELEM
+	add	rdx, byte 32*SIZEOF_DCTELEM
+	add	rdi, byte 32*SIZEOF_JCOEF
+	dec	rax
+	jnz	near .quantloop
+
+	uncollect_args
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jcqnts2i.asm b/jpeg/simd/jcqnts2i.asm
new file mode 100644
index 000000000000..0864d6ed4ac8
--- /dev/null
+++ b/jpeg/simd/jcqnts2i.asm
@@ -0,0 +1,200 @@
+;
+; jcqnts2i.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+; (8-bit samples are widened to 16-bit words and biased by -128).
+; GLOBAL(void)
+; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      DCTELEM * workspace);
+; NOTE: workspace is stored with movdqa and must be 16-byte aligned.
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx			; ebx is used as a row pointer below
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pxor	xmm6,xmm6		; xmm6=(all 0's)
+	pcmpeqw	xmm7,xmm7		; xmm7=(all 1's)
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4		; loop count: 4 sample rows per pass
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
+	movq	xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)
+
+	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm2=(GHIJKLMN)
+	movq	xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm3=(OPQRSTUV)
+
+	punpcklbw xmm0,xmm6		; xmm0=(01234567)
+	punpcklbw xmm1,xmm6		; xmm1=(89ABCDEF)
+	paddw     xmm0,xmm7		; += 0xFF80, i.e. subtract 128 from each word
+	paddw     xmm1,xmm7		; (same bias for the second row)
+	punpcklbw xmm2,xmm6		; xmm2=(GHIJKLMN)
+	punpcklbw xmm3,xmm6		; xmm3=(OPQRSTUV)
+	paddw     xmm2,xmm7		; (same bias for the third row)
+	paddw     xmm3,xmm7		; (same bias for the fourth row)
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+	add	esi, byte 4*SIZEOF_JSAMPROW	; next four input row pointers
+	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM	; next four workspace rows
+	dec	ecx
+	jnz	short .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+; (|x| + correction) is multiplied high-unsigned by reciprocal, then by scale.
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
+;                      DCTELEM * workspace);
+; divisors is indexed as three tables (see the macros below), DCTSIZE rows each.
+
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/32	; loop count: 4 registers x 8 words per pass
+	alignx	16,7
+.quantloop:
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm0,xmm4		; xmm0-3 = working copies of the coefficients
+	movdqa	xmm1,xmm5
+	movdqa	xmm2,xmm6
+	movdqa	xmm3,xmm7
+	psraw	xmm4,(WORD_BIT-1)	; xmm4-7 = per-word sign masks (kept for later)
+	psraw	xmm5,(WORD_BIT-1)
+	psraw	xmm6,(WORD_BIT-1)
+	psraw	xmm7,(WORD_BIT-1)
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4		; if (xmm0 < 0) xmm0 = -xmm0;
+	psubw	xmm1,xmm5		; if (xmm1 < 0) xmm1 = -xmm1;
+	psubw	xmm2,xmm6		; if (xmm2 < 0) xmm2 = -xmm2;
+	psubw	xmm3,xmm7		; if (xmm3 < 0) xmm3 = -xmm3;
+
+	paddw	xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+	paddw	xmm1, XMMWORD [CORRECTION(1,0,edx)]
+	paddw	xmm2, XMMWORD [CORRECTION(2,0,edx)]
+	paddw	xmm3, XMMWORD [CORRECTION(3,0,edx)]
+	pmulhuw	xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+	pmulhuw	xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+	pmulhuw	xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+	pmulhuw	xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+	pmulhuw	xmm0, XMMWORD [SCALE(0,0,edx)]	; scale
+	pmulhuw	xmm1, XMMWORD [SCALE(1,0,edx)]
+	pmulhuw	xmm2, XMMWORD [SCALE(2,0,edx)]
+	pmulhuw	xmm3, XMMWORD [SCALE(3,0,edx)]
+
+	pxor	xmm0,xmm4		; re-apply the saved sign masks:
+	pxor	xmm1,xmm5		; (x ^ mask) - mask negates x where mask is all 1's
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4
+	psubw	xmm1,xmm5
+	psubw	xmm2,xmm6
+	psubw	xmm3,xmm7
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0	; NOTE(review): indexed with SIZEOF_DCTELEM but edi advances by SIZEOF_JCOEF -- presumably equal, confirm
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+	add	esi, byte 32*SIZEOF_DCTELEM	; advance workspace by 32 elements
+	add	edx, byte 32*SIZEOF_DCTELEM	; advance all three divisor tables
+	add	edi, byte 32*SIZEOF_JCOEF	; advance coef_block by 32 elements
+	dec	eax
+	jnz	near .quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jcqntsse.asm b/jpeg/simd/jcqntsse.asm
new file mode 100644
index 000000000000..3065eca8123b
--- /dev/null
+++ b/jpeg/simd/jcqntsse.asm
@@ -0,0 +1,211 @@
+;
+; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+; and integer->float conversion.  Uses MMX registers; ends with emms.
+; GLOBAL(void)
+; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                           FAST_FLOAT * workspace);
+; NOTE: workspace is stored with movaps and must be 16-byte aligned.
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_float_sse)
+
+EXTN(jsimd_convsamp_float_sse):
+	push	ebp
+	mov	ebp,esp
+	push	ebx			; ebx is used as a row pointer below
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  mm7,mm7		; mm7=(all 1's)
+	psllw    mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2		; loop count: 2 sample rows per pass
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	mm0,mm7				; mm0=(01234567)
+	psubb	mm1,mm7				; mm1=(89ABCDEF)
+
+	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
+	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
+	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
+	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
+
+	punpcklwd mm4,mm2			; mm4=(***0***1)
+	punpckhwd mm2,mm2			; mm2=(***2***3)
+	punpcklwd mm5,mm0			; mm5=(***4***5)
+	punpckhwd mm0,mm0			; mm0=(***6***7)
+
+	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
+	psrad     mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
+	cvtpi2ps  xmm0,mm4			; xmm0=(01**)
+	cvtpi2ps  xmm1,mm2			; xmm1=(23**)
+	psrad     mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
+	psrad     mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
+	cvtpi2ps  xmm2,mm5			; xmm2=(45**)
+	cvtpi2ps  xmm3,mm0			; xmm3=(67**)
+
+	punpcklwd mm6,mm3			; mm6=(***8***9)
+	punpckhwd mm3,mm3			; mm3=(***A***B)
+	punpcklwd mm4,mm1			; mm4=(***C***D)
+	punpckhwd mm1,mm1			; mm1=(***E***F)
+
+	psrad     mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
+	psrad     mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
+	cvtpi2ps  xmm4,mm6			; xmm4=(89**)
+	cvtpi2ps  xmm5,mm3			; xmm5=(AB**)
+	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
+	psrad     mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
+	cvtpi2ps  xmm6,mm4			; xmm6=(CD**)
+	cvtpi2ps  xmm7,mm1			; xmm7=(EF**)
+
+	movlhps   xmm0,xmm1			; xmm0=(0123)
+	movlhps   xmm2,xmm3			; xmm2=(4567)
+	movlhps   xmm4,xmm5			; xmm4=(89AB)
+	movlhps   xmm6,xmm7			; xmm6=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+	add	esi, byte 2*SIZEOF_JSAMPROW	; next two input row pointers
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; next two workspace rows
+	dec	ecx
+	jnz	near .convloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+; (workspace[i] * divisors[i], converted to integer and packed to words).
+; GLOBAL(void)
+; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                           FAST_FLOAT * workspace);
+; NOTE(review): divisors presumably holds reciprocal quant values -- confirm.
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_float_sse)
+
+EXTN(jsimd_quantize_float_sse):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16	; loop count: 16 coefficients per pass
+	alignx	16,7
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	movhlps  xmm4,xmm0		; low half of xmm4 = high two floats of xmm0
+	movhlps  xmm5,xmm1		; low half of xmm5 = high two floats of xmm1
+
+	cvtps2pi mm0,xmm0		; convert the low two floats to two dwords
+	cvtps2pi mm1,xmm1
+	cvtps2pi mm4,xmm4
+	cvtps2pi mm5,xmm5
+
+	movhlps  xmm6,xmm2		; low half of xmm6 = high two floats of xmm2
+	movhlps  xmm7,xmm3		; low half of xmm7 = high two floats of xmm3
+
+	cvtps2pi mm2,xmm2
+	cvtps2pi mm3,xmm3
+	cvtps2pi mm6,xmm6
+	cvtps2pi mm7,xmm7
+
+	packssdw mm0,mm4		; four dwords -> four words (signed saturation)
+	packssdw mm1,mm5
+	packssdw mm2,mm6
+	packssdw mm3,mm7
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT	; advance workspace by 16 elements
+	add	edx, byte 16*SIZEOF_FAST_FLOAT	; advance divisors by 16 elements
+	add	edi, byte 16*SIZEOF_JCOEF	; advance coef_block by 16 elements
+	dec	eax
+	jnz	short .quantloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jcsammmx.asm b/jpeg/simd/jcsammmx.asm
new file mode 100644
index 000000000000..9e43b2f85e61
--- /dev/null
+++ b/jpeg/simd/jcsammmx.asm
@@ -0,0 +1,324 @@
+;
+; jcsammmx.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+; Each output sample is (left + right + bias) >> 1, bias alternating 0,1.
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+; Input rows are first padded in place up to 2*output_cols samples.
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28	; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return		; nothing to do (no MMX state touched yet)
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx			; keep output_cols for the downsample stage
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx			; ecx = number of padding samples per row
+	jle	short .expand_end	; rows already wide enough
+
+	mov	eax, INT [max_v_samp(ebp)]	; eax = number of rows to pad
+	test	eax,eax
+	jle	short .expand_end
+
+	cld				; rep stosb must run forward
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax			; al is about to hold the fill value
+	push	ecx			; rep stosb consumes ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx			; edi = first sample past the real row data
+	mov	al, JSAMPLE [edi-1]	; al = last real sample of the row
+
+	rep stosb			; replicate it over the padding area
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	near .return		; still no MMX state touched, so no emms needed
+
+	mov       edx, 0x00010000	; bias pattern
+	movd      mm7,edx
+	pcmpeqw   mm6,mm6
+	punpckldq mm7,mm7		; mm7={0, 1, 0, 1}
+	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx			; saved: the column loop counts ecx down to 0
+	push	edi			; saved: edi/esi become sample pointers below
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm1, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mm2,mm0
+	movq	mm3,mm1
+
+	pand	mm0,mm6			; mm0 = even-indexed samples as words
+	psrlw	mm2,BYTE_BIT		; mm2 = odd-indexed samples as words
+	pand	mm1,mm6
+	psrlw	mm3,BYTE_BIT
+
+	paddw	mm0,mm2			; sum of each horizontal pair
+	paddw	mm1,mm3
+	paddw	mm0,mm7			; add the alternating 0,1 bias
+	paddw	mm1,mm7
+	psrlw	mm0,1			; divide by 2
+	psrlw	mm1,1
+
+	packuswb mm0,mm1		; back to bytes: one MMWORD of output samples
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr
+	sub	ecx, byte SIZEOF_MMWORD		; outcol
+	jnz	short .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+; Each output sample is (sum of a 2x2 block + bias) >> 2, bias alternating 1,2.
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+; Input rows are first padded in place up to 2*output_cols samples.
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28	; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return		; nothing to do (no MMX state touched yet)
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx			; keep output_cols for the downsample stage
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx			; ecx = number of padding samples per row
+	jle	short .expand_end	; rows already wide enough
+
+	mov	eax, INT [max_v_samp(ebp)]	; eax = number of rows to pad
+	test	eax,eax
+	jle	short .expand_end
+
+	cld				; rep stosb must run forward
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax			; al is about to hold the fill value
+	push	ecx			; rep stosb consumes ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx			; edi = first sample past the real row data
+	mov	al, JSAMPLE [edi-1]	; al = last real sample of the row
+
+	rep stosb			; replicate it over the padding area
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	near .return		; still no MMX state touched, so no emms needed
+
+	mov       edx, 0x00020001	; bias pattern
+	movd      mm7,edx
+	pcmpeqw   mm6,mm6
+	punpckldq mm7,mm7		; mm7={1, 2, 1, 2}
+	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx			; saved: the column loop counts ecx down to 0
+	push	edi			; saved: edi/esi become sample pointers below
+	push	esi
+
+	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	edi, JSAMPROW [edi]			; outptr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [edx+0*SIZEOF_MMWORD]
+	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm2, MMWORD [edx+1*SIZEOF_MMWORD]
+	movq	mm3, MMWORD [esi+1*SIZEOF_MMWORD]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pand	mm0,mm6			; even-indexed samples as words
+	psrlw	mm4,BYTE_BIT		; odd-indexed samples as words
+	pand	mm1,mm6
+	psrlw	mm5,BYTE_BIT
+	paddw	mm0,mm4			; horizontal pair sums, upper row
+	paddw	mm1,mm5			; horizontal pair sums, lower row
+
+	movq	mm4,mm2
+	movq	mm5,mm3
+	pand	mm2,mm6
+	psrlw	mm4,BYTE_BIT
+	pand	mm3,mm6
+	psrlw	mm5,BYTE_BIT
+	paddw	mm2,mm4
+	paddw	mm3,mm5
+
+	paddw	mm0,mm1			; add the two rows: 2x2 block sums
+	paddw	mm2,mm3
+	paddw	mm0,mm7			; add the alternating 1,2 bias
+	paddw	mm2,mm7
+	psrlw	mm0,2			; divide by 4
+	psrlw	mm2,2
+
+	packuswb mm0,mm2		; back to bytes: one MMWORD of output samples
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	add	edx, byte 2*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr1
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr
+	sub	ecx, byte SIZEOF_MMWORD		; outcol
+	jnz	near .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jcsamss2-64.asm b/jpeg/simd/jcsamss2-64.asm
new file mode 100644
index 000000000000..6a16dc5f7ff5
--- /dev/null
+++ b/jpeg/simd/jcsamss2-64.asm
@@ -0,0 +1,330 @@
+;
+; jcsamss2-64.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+; Each output sample is (left + right + bias) >> 1, bias alternating 0,1.
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+; Input rows are first padded in place up to 2*output_cols samples.
+
+; r10 = JDIMENSION image_width      (32-bit value: only r10d is read)
+; r11 = int max_v_samp_factor       (32-bit value: only r11d is read)
+; r12 = JDIMENSION v_samp_factor    (32-bit value: only r12d is read)
+; r13 = JDIMENSION width_blocks     (32-bit value: only r13d is read)
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+	push	rbp
+	mov	rax,rsp			; NOTE(review): presumably consumed by collect_args -- confirm in jsimdext.inc
+	mov	rbp,rsp
+	collect_args
+
+	mov	ecx, r13d		; zero-extend: upper half of r13 is not guaranteed clean
+	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
+	jz	near .return
+
+	mov	edx, r10d		; zero-extend image_width into rdx
+
+	; -- expand_right_edge
+
+	push	rcx			; keep output_cols for the downsample stage
+	shl	rcx,1				; output_cols * 2
+	sub	rcx,rdx			; rcx = number of padding samples per row
+	jle	short .expand_end	; rows already wide enough
+
+	mov	eax, r11d		; rax = number of rows to pad (int argument)
+	test	eax,eax			; signed 32-bit test, matching the C type
+	jle	short .expand_end
+
+	cld				; rep stosb must run forward
+	mov	rsi, r14	; input_data
+.expandloop:
+	push	rax			; al is about to hold the fill value
+	push	rcx			; rep stosb consumes rcx
+
+	mov	rdi, JSAMPROW [rsi]
+	add	rdi,rdx			; rdi = first sample past the real row data
+	mov	al, JSAMPLE [rdi-1]	; al = last real sample of the row
+
+	rep stosb			; replicate it over the padding area
+
+	pop	rcx
+	pop	rax
+
+	add	rsi, byte SIZEOF_JSAMPROW
+	dec	rax
+	jg	short .expandloop
+
+.expand_end:
+	pop	rcx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	eax, r12d	; rowctr (zero-extended so "dec rax" below is exact)
+	test	eax,eax
+	jle	near .return
+
+	mov	rdx, 0x00010000		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	rsi, r14	; input_data
+	mov	rdi, r15	; output_data
+.rowloop:
+	push	rcx			; saved: the column loop counts rcx down to 0
+	push	rdi			; saved: rdi/rsi become sample pointers below
+	push	rsi
+
+	mov	rsi, JSAMPROW [rsi]		; inptr
+	mov	rdi, JSAMPROW [rdi]		; outptr
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+
+.columnloop_r8:				; fewer than SIZEOF_XMMWORD output columns left
+	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	pxor	xmm1,xmm1		; no second input vector: use zeros instead
+	mov	rcx, SIZEOF_XMMWORD	; so the "sub" below leaves rcx == 0
+	jmp	short .downsample
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm2,xmm0
+	movdqa	xmm3,xmm1
+
+	pand	xmm0,xmm6		; even-indexed samples as words
+	psrlw	xmm2,BYTE_BIT		; odd-indexed samples as words
+	pand	xmm1,xmm6
+	psrlw	xmm3,BYTE_BIT
+
+	paddw	xmm0,xmm2		; sum of each horizontal pair
+	paddw	xmm1,xmm3
+	paddw	xmm0,xmm7		; add the alternating 0,1 bias
+	paddw	xmm1,xmm7
+	psrlw	xmm0,1			; divide by 2
+	psrlw	xmm1,1
+
+	packuswb xmm0,xmm1		; back to bytes: one XMMWORD of output samples
+
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0	; NOTE(review): a full XMMWORD is stored even on the _r8 path -- presumably rows are padded
+
+	sub	rcx, byte SIZEOF_XMMWORD	; outcol
+	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	test	rcx,rcx
+	jnz	short .columnloop_r8
+
+	pop	rsi
+	pop	rdi
+	pop	rcx
+
+	add	rsi, byte SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte SIZEOF_JSAMPROW	; output_data
+	dec	rax				; rowctr
+	jg	near .rowloop
+
+.return:
+	uncollect_args
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+; Each output sample is (sum of a 2x2 block + bias) >> 2, bias alternating 1,2.
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+; Input rows are first padded in place up to 2*output_cols samples.
+
+; r10 = JDIMENSION image_width      (32-bit value: only r10d is read)
+; r11 = int max_v_samp_factor       (32-bit value: only r11d is read)
+; r12 = JDIMENSION v_samp_factor    (32-bit value: only r12d is read)
+; r13 = JDIMENSION width_blocks     (32-bit value: only r13d is read)
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+	push	rbp
+	mov	rax,rsp			; NOTE(review): presumably consumed by collect_args -- confirm in jsimdext.inc
+	mov	rbp,rsp
+	collect_args
+
+	mov	ecx, r13d		; zero-extend: upper half of r13 is not guaranteed clean
+	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
+	jz	near .return
+
+	mov	edx, r10d		; zero-extend image_width into rdx
+
+	; -- expand_right_edge
+
+	push	rcx			; keep output_cols for the downsample stage
+	shl	rcx,1				; output_cols * 2
+	sub	rcx,rdx			; rcx = number of padding samples per row
+	jle	short .expand_end	; rows already wide enough
+
+	mov	eax, r11d		; rax = number of rows to pad (int argument)
+	test	eax,eax			; signed 32-bit test, matching the C type
+	jle	short .expand_end
+
+	cld				; rep stosb must run forward
+	mov	rsi, r14	; input_data
+.expandloop:
+	push	rax			; al is about to hold the fill value
+	push	rcx			; rep stosb consumes rcx
+
+	mov	rdi, JSAMPROW [rsi]
+	add	rdi,rdx			; rdi = first sample past the real row data
+	mov	al, JSAMPLE [rdi-1]	; al = last real sample of the row
+
+	rep stosb			; replicate it over the padding area
+
+	pop	rcx
+	pop	rax
+
+	add	rsi, byte SIZEOF_JSAMPROW
+	dec	rax
+	jg	short .expandloop
+
+.expand_end:
+	pop	rcx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	eax, r12d	; rowctr (zero-extended so "dec rax" below is exact)
+	test	eax,eax
+	jle	near .return
+
+	mov	rdx, 0x00020001		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	rsi, r14	; input_data
+	mov	rdi, r15	; output_data
+.rowloop:
+	push	rcx			; saved: the column loop counts rcx down to 0
+	push	rdi			; saved: rdi/rsi become sample pointers below
+	push	rsi
+
+	mov	rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	rdi, JSAMPROW [rdi]			; outptr
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+
+.columnloop_r8:				; fewer than SIZEOF_XMMWORD output columns left
+	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	pxor	xmm2,xmm2		; no second input vectors: use zeros instead
+	pxor	xmm3,xmm3
+	mov	rcx, SIZEOF_XMMWORD	; so the "sub" below leaves rcx == 0
+	jmp	short .downsample
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	pand	xmm0,xmm6		; even-indexed samples as words
+	psrlw	xmm4,BYTE_BIT		; odd-indexed samples as words
+	pand	xmm1,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm0,xmm4		; horizontal pair sums, upper row
+	paddw	xmm1,xmm5		; horizontal pair sums, lower row
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm5,xmm3
+	pand	xmm2,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm3,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm2,xmm4
+	paddw	xmm3,xmm5
+
+	paddw	xmm0,xmm1		; add the two rows: 2x2 block sums
+	paddw	xmm2,xmm3
+	paddw	xmm0,xmm7		; add the alternating 1,2 bias
+	paddw	xmm2,xmm7
+	psrlw	xmm0,2			; divide by 4
+	psrlw	xmm2,2
+
+	packuswb xmm0,xmm2		; back to bytes: one XMMWORD of output samples
+
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0	; NOTE(review): a full XMMWORD is stored even on the _r8 path -- presumably rows are padded
+
+	sub	rcx, byte SIZEOF_XMMWORD	; outcol
+	add	rdx, byte 2*SIZEOF_XMMWORD	; inptr0
+	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr1
+	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	rcx,rcx
+	jnz	near .columnloop_r8
+
+	pop	rsi
+	pop	rdi
+	pop	rcx
+
+	add	rsi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	rax				; rowctr
+	jg	near .rowloop
+
+.return:
+	uncollect_args
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jcsamss2.asm b/jpeg/simd/jcsamss2.asm
new file mode 100644
index 000000000000..818e911df16e
--- /dev/null
+++ b/jpeg/simd/jcsamss2.asm
@@ -0,0 +1,351 @@
+;
+; jcsamss2.asm - downsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+; Each output sample is (left + right + bias) >> 1, bias alternating 0,1.
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+; Input rows are first padded in place up to 2*output_cols samples.
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28		; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx			; keep output_cols for the downsample stage
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx			; ecx = number of padding samples per row
+	jle	short .expand_end	; rows already wide enough
+
+	mov	eax, INT [max_v_samp(ebp)]	; eax = number of rows to pad
+	test	eax,eax
+	jle	short .expand_end
+
+	cld				; rep stosb must run forward
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax			; al is about to hold the fill value
+	push	ecx			; rep stosb consumes ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx			; edi = first sample past the real row data
+	mov	al, JSAMPLE [edi-1]	; al = last real sample of the row
+
+	rep stosb			; replicate it over the padding area
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov	edx, 0x00010000		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx			; saved: the column loop counts ecx down to 0
+	push	edi			; saved: edi/esi become sample pointers below
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+.columnloop_r8:				; fewer than SIZEOF_XMMWORD output columns left
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	pxor	xmm1,xmm1		; no second input vector: use zeros instead
+	mov	ecx, SIZEOF_XMMWORD	; so the "sub" below leaves ecx == 0
+	jmp	short .downsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm2,xmm0
+	movdqa	xmm3,xmm1
+
+	pand	xmm0,xmm6		; even-indexed samples as words
+	psrlw	xmm2,BYTE_BIT		; odd-indexed samples as words
+	pand	xmm1,xmm6
+	psrlw	xmm3,BYTE_BIT
+
+	paddw	xmm0,xmm2		; sum of each horizontal pair
+	paddw	xmm1,xmm3
+	paddw	xmm0,xmm7		; add the alternating 0,1 bias
+	paddw	xmm1,xmm7
+	psrlw	xmm0,1			; divide by 2
+	psrlw	xmm1,1
+
+	packuswb xmm0,xmm1		; back to bytes: one XMMWORD of output samples
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0	; NOTE(review): a full XMMWORD is stored even on the _r8 path -- presumably rows are padded
+
+	sub	ecx, byte SIZEOF_XMMWORD	; outcol
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	test	ecx,ecx
+	jnz	short .columnloop_r8
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+; Each output sample is (sum of a 2x2 block + bias) >> 2, bias alternating 1,2.
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+; Input rows are first padded in place up to 2*output_cols samples.
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28	; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx			; keep output_cols for the downsample stage
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx			; ecx = number of padding samples per row
+	jle	short .expand_end	; rows already wide enough
+
+	mov	eax, INT [max_v_samp(ebp)]	; eax = number of rows to pad
+	test	eax,eax
+	jle	short .expand_end
+
+	cld				; rep stosb must run forward
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax			; al is about to hold the fill value
+	push	ecx			; rep stosb consumes ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx			; edi = first sample past the real row data
+	mov	al, JSAMPLE [edi-1]	; al = last real sample of the row
+
+	rep stosb			; replicate it over the padding area
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov	edx, 0x00020001		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx			; saved: the column loop counts ecx down to 0
+	push	edi			; saved: edi/esi become sample pointers below
+	push	esi
+
+	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	edi, JSAMPROW [edi]			; outptr
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+.columnloop_r8:				; fewer than SIZEOF_XMMWORD output columns left
+	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	pxor	xmm2,xmm2		; no second input vectors: use zeros instead
+	pxor	xmm3,xmm3
+	mov	ecx, SIZEOF_XMMWORD	; so the "sub" below leaves ecx == 0
+	jmp	short .downsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	pand	xmm0,xmm6		; even-indexed samples as words
+	psrlw	xmm4,BYTE_BIT		; odd-indexed samples as words
+	pand	xmm1,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm0,xmm4		; horizontal pair sums, upper row
+	paddw	xmm1,xmm5		; horizontal pair sums, lower row
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm5,xmm3
+	pand	xmm2,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm3,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm2,xmm4
+	paddw	xmm3,xmm5
+
+	paddw	xmm0,xmm1		; add the two rows: 2x2 block sums
+	paddw	xmm2,xmm3
+	paddw	xmm0,xmm7		; add the alternating 1,2 bias
+	paddw	xmm2,xmm7
+	psrlw	xmm0,2			; divide by 4
+	psrlw	xmm2,2
+
+	packuswb xmm0,xmm2		; back to bytes: one XMMWORD of output samples
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0	; NOTE(review): a full XMMWORD is stored even on the _r8 path -- presumably rows are padded
+
+	sub	ecx, byte SIZEOF_XMMWORD	; outcol
+	add	edx, byte 2*SIZEOF_XMMWORD	; inptr0
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr1
+	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .columnloop_r8
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jdclrmmx.asm b/jpeg/simd/jdclrmmx.asm
new file mode 100644
index 000000000000..79772e0c2726
--- /dev/null
+++ b/jpeg/simd/jdclrmmx.asm
@@ -0,0 +1,407 @@
+;
+; jdclrmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
+;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                            JSAMPARRAY output_buf, int num_rows)
+;
+
+%define out_width(b)	(b)+8			; JDIMENSION out_width
+%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
+%define input_row(b)	(b)+16		; JDIMENSION input_row
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = esp after push (-> saved ebp); args are read via eax
+	sub	esp, byte 4			; step below the saved ebp before aligning
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [original_ebp] = pre-alignment esp, reloaded in the epilogue
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[WK_NUM] just below ebp
+	pushpic	eax		; make room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
+	test	ecx,ecx
+	jz	near .return			; zero-width image: nothing to convert
+
+	push	ecx				; park num_cols; ecx is reused for input_row
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [input_row(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]	; input_buf[0] (read as Y below)
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]	; input_buf[1] (read as Cb below)
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]	; input_buf[2] (read as Cr below)
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]	; esi = &input_buf[0][input_row]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]	; ebx = &input_buf[1][input_row]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]	; edx = &input_buf[2][input_row]
+
+	pop	ecx				; ecx = num_cols again
+
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, INT [num_rows(eax)]	; eax = row counter from here on
+	test	eax,eax
+	jle	near .return			; num_rows <= 0: nothing to convert
+	alignx	16,7
+.rowloop:
+	push	eax			; row state is saved here and restored at .nextrow
+	push	edi
+	push	edx
+	push	ebx
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr0
+	mov	ebx, JSAMPROW [ebx]	; inptr1
+	mov	edx, JSAMPROW [edx]	; inptr2
+	mov	edi, JSAMPROW [edi]	; outptr
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+	alignx	16,7
+.columnloop:
+
+	movq	mm5, MMWORD [ebx]	; mm5=Cb(01234567)
+	movq	mm1, MMWORD [edx]	; mm1=Cr(01234567)
+
+	pcmpeqw	mm4,mm4
+	pcmpeqw	mm7,mm7
+	psrlw	mm4,BYTE_BIT
+	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+	movq	mm0,mm4			; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	mm4,mm5			; mm4=Cb(0246)=CbE
+	psrlw	mm5,BYTE_BIT		; mm5=Cb(1357)=CbO
+	pand	mm0,mm1			; mm0=Cr(0246)=CrE
+	psrlw	mm1,BYTE_BIT		; mm1=Cr(1357)=CrO
+
+	paddw	mm4,mm7			; adding 0xFF80 (= -128) recenters each chroma word
+	paddw	mm5,mm7
+	paddw	mm0,mm7
+	paddw	mm1,mm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm2,mm4			; mm2=CbE
+	movq	mm3,mm5			; mm3=CbO
+	paddw	mm4,mm4			; mm4=2*CbE
+	paddw	mm5,mm5			; mm5=2*CbO
+	movq	mm6,mm0			; mm6=CrE
+	movq	mm7,mm1			; mm7=CrO
+	paddw	mm0,mm0			; mm0=2*CrE
+	paddw	mm1,mm1			; mm1=2*CrO
+
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbE * -FIX(0.22800))
+	pmulhw	mm5,[GOTOFF(eax,PW_MF0228)]	; mm5=(2*CbO * -FIX(0.22800))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrE * FIX(0.40200))
+	pmulhw	mm1,[GOTOFF(eax,PW_F0402)]	; mm1=(2*CrO * FIX(0.40200))
+
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]	; +1 before the >>1 below rounds the halving
+	paddw	mm5,[GOTOFF(eax,PW_ONE)]
+	psraw	mm4,1			; mm4=(CbE * -FIX(0.22800))
+	psraw	mm5,1			; mm5=(CbO * -FIX(0.22800))
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	paddw	mm1,[GOTOFF(eax,PW_ONE)]
+	psraw	mm0,1			; mm0=(CrE * FIX(0.40200))
+	psraw	mm1,1			; mm1=(CrO * FIX(0.40200))
+
+	paddw	mm4,mm2
+	paddw	mm5,mm3
+	paddw	mm4,mm2			; mm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	mm5,mm3			; mm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	mm0,mm6			; mm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	mm1,mm7			; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(B-Y)E
+	movq	MMWORD [wk(1)], mm5	; wk(1)=(B-Y)O
+
+	movq      mm4,mm2
+	movq      mm5,mm3
+	punpcklwd mm2,mm6		; interleave Cb,Cr words so pmaddwd forms Cb*c0+Cr*c1
+	punpckhwd mm4,mm6
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm3,mm7
+	punpckhwd mm5,mm7
+	pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]	; rounding bias ahead of the SCALEBITS shift
+	paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm4,SCALEBITS
+	paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm3,SCALEBITS
+	psrad     mm5,SCALEBITS
+
+	packssdw  mm2,mm4	; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  mm3,mm5	; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     mm2,mm6	; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     mm3,mm7	; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movq      mm5, MMWORD [esi]	; mm5=Y(01234567)
+
+	pcmpeqw   mm4,mm4
+	psrlw     mm4,BYTE_BIT		; mm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      mm4,mm5		; mm4=Y(0246)=YE
+	psrlw     mm5,BYTE_BIT		; mm5=Y(1357)=YO
+
+	paddw     mm0,mm4		; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm5		; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm4		; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm5		; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4, MMWORD [wk(0)]	; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+	paddw     mm5, MMWORD [wk(1)]	; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	cmp	ecx, byte SIZEOF_MMWORD		; fewer columns left than one full MMWORD group?
+	jb	short .column_st16		; yes: store only the valid bytes
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .nextrow			; row finished exactly on a group boundary
+
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE (ecx = output bytes left)
+	cmp	ecx, byte 2*SIZEOF_MMWORD
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmA,mmC			; remaining bytes now come from the third quadword
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmE			; remaining bytes now come from the second quadword
+	sub	ecx, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmA			; eax = low dword of pending data (GOT pointer in eax is dropped)
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [edi+0*SIZEOF_DWORD], eax
+	psrlq	mmA,DWORD_BIT
+	movd	eax,mmA			; eax = next dword of pending data
+	sub	ecx, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi+0*SIZEOF_WORD], ax
+	shr	eax,WORD_BIT		; bring the next pending byte down into al
+	sub	ecx, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .nextrow
+	mov	BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	cmp	ecx, byte SIZEOF_MMWORD		; fewer columns left than one full MMWORD group?
+	jb	short .column_st16		; yes: store only the valid pixels
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .nextrow			; row finished exactly on a group boundary
+
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2	; ecx still counts pixels here (2 per quadword)
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmA,mmC			; remaining pixels come from the upper two quadwords
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8
+	jb	short .nextrow
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA	; store the final single pixel
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	alignx	16,7
+
+.nextrow:
+	pop	ecx			; restore the state pushed at .rowloop (reverse order)
+	pop	esi
+	pop	ebx
+	pop	edx
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; advance each input row-pointer array by one row
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	add	edi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jdclrss2-64.asm b/jpeg/simd/jdclrss2-64.asm
new file mode 100644
index 000000000000..4282bd269f33
--- /dev/null
+++ b/jpeg/simd/jdclrss2-64.asm
@@ -0,0 +1,487 @@
+;
+; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+				
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
+;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                             JSAMPARRAY output_buf, int num_rows)
+;
+
+; r10 = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14 = int num_rows
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = rsp after push (-> saved rbp)
+	sub	rsp, byte 4			; step below the saved rbp before aligning
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax			; keep pre-alignment rsp; reloaded by "pop rsp" in the epilogue
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]			; reserve wk[WK_NUM] just below rbp
+	collect_args				; macro defined elsewhere; args arrive in r10..r14 as listed above
+	push	rbx
+
+	mov	ecx, r10d	; num_cols (32-bit JDIMENSION: zero-extend, upper half of r10 is not guaranteed clean)
+	test	rcx,rcx
+	jz	near .return	; zero-width image: nothing to convert
+
+	push	rcx		; park num_cols; rcx is reused for input_row
+
+	mov	rdi, r11
+	mov	ecx, r12d	; input_row (32-bit JDIMENSION: zero-extend before it is scaled into an address)
+	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]	; input_buf[0] (read as Y below)
+	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]	; input_buf[1] (read as Cb below)
+	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]	; input_buf[2] (read as Cr below)
+	lea	rsi, [rsi+rcx*SIZEOF_JSAMPROW]	; rsi = &input_buf[0][input_row]
+	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]	; rbx = &input_buf[1][input_row]
+	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]	; rdx = &input_buf[2][input_row]
+
+	pop	rcx		; rcx = num_cols again
+
+	mov	rdi, r13
+	mov	eax, r14d	; num_rows (zero-extended into rax)
+	test	eax,eax		; signed 32-bit test: rax was zero-extended, so a 64-bit test never sees a negative int
+	jle	near .return	; num_rows <= 0: nothing to convert
+.rowloop:
+	push	rax			; row state is saved here and restored at .nextrow
+	push	rdi
+	push	rdx
+	push	rbx
+	push	rsi
+	push	rcx			; col
+
+	mov	rsi, JSAMPROW [rsi]	; inptr0
+	mov	rbx, JSAMPROW [rbx]	; inptr1
+	mov	rdx, JSAMPROW [rdx]	; inptr2
+	mov	rdi, JSAMPROW [rdi]	; outptr
+.columnloop:
+
+	movdqa	xmm5, XMMWORD [rbx]	; xmm5=Cb(0123456789ABCDEF)
+	movdqa	xmm1, XMMWORD [rdx]	; xmm1=Cr(0123456789ABCDEF)
+
+	pcmpeqw	xmm4,xmm4
+	pcmpeqw	xmm7,xmm7
+	psrlw	xmm4,BYTE_BIT
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
+	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
+	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
+	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
+
+	paddw	xmm4,xmm7		; adding 0xFF80 (= -128) recenters each chroma word
+	paddw	xmm5,xmm7
+	paddw	xmm0,xmm7
+	paddw	xmm1,xmm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm2,xmm4		; xmm2=CbE
+	movdqa	xmm3,xmm5		; xmm3=CbO
+	paddw	xmm4,xmm4		; xmm4=2*CbE
+	paddw	xmm5,xmm5		; xmm5=2*CbO
+	movdqa	xmm6,xmm0		; xmm6=CrE
+	movdqa	xmm7,xmm1		; xmm7=CrO
+	paddw	xmm0,xmm0		; xmm0=2*CrE
+	paddw	xmm1,xmm1		; xmm1=2*CrO
+
+	pmulhw	xmm4,[rel PW_MF0228]	; xmm4=(2*CbE * -FIX(0.22800))
+	pmulhw	xmm5,[rel PW_MF0228]	; xmm5=(2*CbO * -FIX(0.22800))
+	pmulhw	xmm0,[rel PW_F0402]	; xmm0=(2*CrE * FIX(0.40200))
+	pmulhw	xmm1,[rel PW_F0402]	; xmm1=(2*CrO * FIX(0.40200))
+
+	paddw	xmm4,[rel PW_ONE]	; +1 before the >>1 below rounds the halving
+	paddw	xmm5,[rel PW_ONE]
+	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
+	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
+	paddw	xmm0,[rel PW_ONE]
+	paddw	xmm1,[rel PW_ONE]
+	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
+	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
+
+	paddw	xmm4,xmm2
+	paddw	xmm5,xmm3
+	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm5,xmm3
+	punpcklwd xmm2,xmm6		; interleave Cb,Cr words so pmaddwd forms Cb*c0+Cr*c1
+	punpckhwd xmm4,xmm6
+	pmaddwd   xmm2,[rel PW_MF0344_F0285]
+	pmaddwd   xmm4,[rel PW_MF0344_F0285]
+	punpcklwd xmm3,xmm7
+	punpckhwd xmm5,xmm7
+	pmaddwd   xmm3,[rel PW_MF0344_F0285]
+	pmaddwd   xmm5,[rel PW_MF0344_F0285]
+
+	paddd     xmm2,[rel PD_ONEHALF]	; rounding bias ahead of the SCALEBITS shift
+	paddd     xmm4,[rel PD_ONEHALF]
+	psrad     xmm2,SCALEBITS
+	psrad     xmm4,SCALEBITS
+	paddd     xmm3,[rel PD_ONEHALF]
+	paddd     xmm5,[rel PD_ONEHALF]
+	psrad     xmm3,SCALEBITS
+	psrad     xmm5,SCALEBITS
+
+	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movdqa    xmm5, XMMWORD [rsi]	; xmm5=Y(0123456789ABCDEF)
+
+	pcmpeqw   xmm4,xmm4
+	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
+	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
+
+	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	rcx, byte SIZEOF_XMMWORD	; fewer columns left than one full XMMWORD group?
+	jb	short .column_st32		; yes: store only the valid bytes
+
+	test	rdi, SIZEOF_XMMWORD-1		; is outptr 16-byte aligned?
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [rdi], xmmF
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	rcx, byte SIZEOF_XMMWORD
+	jz	near .nextrow			; row finished exactly on a group boundary
+
+	add	rsi, byte SIZEOF_XMMWORD	; inptr0
+	add	rbx, byte SIZEOF_XMMWORD	; inptr1
+	add	rdx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+
+.column_st32:
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	rcx, byte 2*SIZEOF_XMMWORD
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF
+	sub	rcx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	rcx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	rax,rcx				; rax = remaining byte count, used by the span check below
+	xor	rcx, byte 0x0F
+	shl	rcx, 2
+	movd	xmmB,ecx
+	psrlq	xmmH,4
+	pcmpeqb	xmmE,xmmE
+	psrlq	xmmH,xmmB
+	psrlq	xmmE,xmmB
+	punpcklbw xmmE,xmmH			; xmmE = byte mask for the final maskmovdqu
+	; ----------------
+	mov	rcx,rdi
+	and	rcx, byte SIZEOF_XMMWORD-1	; rcx = misalignment of outptr within 16 bytes
+	jz	short .adj0
+	add	rax,rcx
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,rcx
+	movdqa	xmmG,xmmA
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx
+	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmF,ecx
+	psllq	xmmA,xmmF
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG
+	por	xmmE,xmmC
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	rcx, byte SIZEOF_XMMWORD	; fewer columns left than one full XMMWORD group?
+	jb	short .column_st32		; yes: store only the valid pixels
+
+	test	rdi, SIZEOF_XMMWORD-1		; is outptr 16-byte aligned?
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [rdi], xmmC
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [rdi], xmmH
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	rcx, byte SIZEOF_XMMWORD
+	jz	near .nextrow			; row finished exactly on a group boundary
+
+	add	rsi, byte SIZEOF_XMMWORD	; inptr0
+	add	rbx, byte SIZEOF_XMMWORD	; inptr1
+	add	rdx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+
+.column_st32:
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	rcx, byte SIZEOF_XMMWORD/2	; rcx still counts pixels here (4 per xmmword)
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC
+	movdqa	xmmD,xmmH
+	sub	rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	rcx, byte SIZEOF_XMMWORD/4
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	rcx, byte SIZEOF_XMMWORD/16
+	jb	near .nextrow			; no pixels left to store
+	mov	rax,rcx				; rax = remaining pixel count, used by the span check below
+	xor	rcx, byte 0x03
+	inc	rcx
+	shl	rcx, 4
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF
+	punpcklbw xmmE,xmmE			; xmmE = byte mask for the final maskmovdqu
+	; ----------------
+	mov	rcx,rdi
+	and	rcx, byte SIZEOF_XMMWORD-1	; rcx = misalignment of outptr within 16 bytes
+	jz	short .adj0
+	lea	rax, [rcx+rax*4]	; RGB_PIXELSIZE
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx
+	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmH,ecx
+	psllq	xmmA,xmmH
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	rcx
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB
+	por	xmmE,xmmG
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+	pop	rcx			; restore the state pushed at .rowloop (reverse order)
+	pop	rsi
+	pop	rbx
+	pop	rdx
+	pop	rdi
+	pop	rax
+
+	add	rsi, byte SIZEOF_JSAMPROW	; advance each input row-pointer array by one row
+	add	rbx, byte SIZEOF_JSAMPROW
+	add	rdx, byte SIZEOF_JSAMPROW
+	add	rdi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	rax				; num_rows
+	jg	near .rowloop
+
+	sfence		; flush the write buffer
+
+.return:
+	pop	rbx
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jdclrss2.asm b/jpeg/simd/jdclrss2.asm
new file mode 100644
index 000000000000..865fa8245d38
--- /dev/null
+++ b/jpeg/simd/jdclrss2.asm
@@ -0,0 +1,505 @@
+;
+; jdclrss2.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+				
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+; Convert num_rows rows of planar Y/Cb/Cr samples (starting at input_row in
+; each plane of input_buf) to interleaved RGB_PIXELSIZE-byte output pixels.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
+;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                             JSAMPARRAY output_buf, int num_rows)
+; The arguments are read from the stack through the macros defined below.
+
+%define out_width(b)	(b)+8			; JDIMENSION out_width (b -> saved ebp; b+4 = return address)
+%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
+%define input_row(b)	(b)+16		; JDIMENSION input_row
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0	; slot at the aligned ebp that holds the pre-alignment frame address
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM], directly below the aligned ebp
+%define WK_NUM		2	; wk(0) and wk(1) hold (B-Y)E and (B-Y)O in the column loop
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr, one pointer below wk(0)
+
+	align	16
+	global	EXTN(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+	push	ebp
+	mov	eax,esp				; eax -> saved ebp; arguments are addressed as [eax+8..]
+	sub	esp, byte 4			; guarantee a dword slot below eax before aligning
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; remember the unaligned frame address for `pop esp` at .return
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve the wk[] scratch area below ebp
+	pushpic	eax		; make room for the GOT address (the gotptr slot)
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
+	test	ecx,ecx
+	jz	near .return			; zero-width output: nothing to convert
+
+	push	ecx				; park num_cols; ecx is borrowed for input_row
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [input_row(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]	; component 0 row array (read as Y in the column loop)
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]	; component 1 row array (read as Cb in the column loop)
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]	; component 2 row array (read as Cr in the column loop)
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]	; esi = &input_buf[0][input_row]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]	; ebx = &input_buf[1][input_row]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]	; edx = &input_buf[2][input_row]
+
+	pop	ecx				; ecx = num_cols again
+
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, INT [num_rows(eax)]	; last use of eax as frame address; eax = row counter from here
+	test	eax,eax
+	jle	near .return			; num_rows <= 0: nothing to convert
+	alignx	16,7
+.rowloop:
+	push	eax			; per-row state; restored in reverse order at .nextrow
+	push	edi
+	push	edx
+	push	ebx
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr0
+	mov	ebx, JSAMPROW [ebx]	; inptr1
+	mov	edx, JSAMPROW [edx]	; inptr2
+	mov	edi, JSAMPROW [edi]	; outptr
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+	alignx	16,7
+.columnloop:				; each pass loads 16 samples per component (aligned 16-byte loads)
+
+	movdqa	xmm5, XMMWORD [ebx]	; xmm5=Cb(0123456789ABCDEF)
+	movdqa	xmm1, XMMWORD [edx]	; xmm1=Cr(0123456789ABCDEF)
+
+	pcmpeqw	xmm4,xmm4		; xmm4=(all 1's)
+	pcmpeqw	xmm7,xmm7		; xmm7=(all 1's)
+	psrlw	xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..} (low-byte mask)
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
+	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
+	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
+	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
+
+	paddw	xmm4,xmm7		; 0xFF80 is -128 as a signed word, so these four
+	paddw	xmm5,xmm7		; adds subtract 128 from every Cb/Cr word
+	paddw	xmm0,xmm7
+	paddw	xmm1,xmm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm2,xmm4		; xmm2=CbE
+	movdqa	xmm3,xmm5		; xmm3=CbO
+	paddw	xmm4,xmm4		; xmm4=2*CbE
+	paddw	xmm5,xmm5		; xmm5=2*CbO
+	movdqa	xmm6,xmm0		; xmm6=CrE
+	movdqa	xmm7,xmm1		; xmm7=CrO
+	paddw	xmm0,xmm0		; xmm0=2*CrE
+	paddw	xmm1,xmm1		; xmm1=2*CrO
+
+	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbE * -FIX(0.22800))
+	pmulhw	xmm5,[GOTOFF(eax,PW_MF0228)]	; xmm5=(2*CbO * -FIX(0.22800))
+	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrE * FIX(0.40200))
+	pmulhw	xmm1,[GOTOFF(eax,PW_F0402)]	; xmm1=(2*CrO * FIX(0.40200))
+
+	paddw	xmm4,[GOTOFF(eax,PW_ONE)]	; +1 before the >>1 below rounds to nearest
+	paddw	xmm5,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
+	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
+	paddw	xmm0,[GOTOFF(eax,PW_ONE)]	; same rounding step for the Cr products
+	paddw	xmm1,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
+	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
+
+	paddw	xmm4,xmm2		; -0.228*Cb + Cb ...
+	paddw	xmm5,xmm3
+	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
+
+	movdqa    xmm4,xmm2		; xmm4=CbE (copy for the high half)
+	movdqa    xmm5,xmm3		; xmm5=CbO (copy for the high half)
+	punpcklwd xmm2,xmm6		; xmm2=(CbE,CrE) word pairs, low half
+	punpckhwd xmm4,xmm6		; xmm4=(CbE,CrE) word pairs, high half
+	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]	; each dword = Cb*-F_0_344 + Cr*F_0_285
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd xmm3,xmm7		; xmm3=(CbO,CrO) word pairs, low half
+	punpckhwd xmm5,xmm7		; xmm5=(CbO,CrO) word pairs, high half
+	pmaddwd   xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]	; add 1<<(SCALEBITS-1) so the shift below rounds
+	paddd     xmm4,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm2,SCALEBITS		; drop the SCALEBITS fraction bits
+	psrad     xmm4,SCALEBITS
+	paddd     xmm3,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm3,SCALEBITS
+	psrad     xmm5,SCALEBITS
+
+	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movdqa    xmm5, XMMWORD [esi]	; xmm5=Y(0123456789ABCDEF)
+
+	pcmpeqw   xmm4,xmm4		; xmm4=(all 1's)
+	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
+	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
+
+	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD	; fewer than 16 columns left in this row?
+	jb	short .column_st32		; yes: store only the remaining pixels
+
+	test	edi, SIZEOF_XMMWORD-1		; is outptr 16-byte aligned?
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [edi], xmmF
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD	; 16 columns consumed
+	jz	near .nextrow			; row finished exactly on a 16-column boundary
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_XMMWORD	; at least 32 output bytes left?
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF			; the remaining (<16) bytes come from the third vector
+	sub	ecx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD	; at least 16 output bytes left?
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD			; the remaining (<16) bytes come from the second vector
+	sub	ecx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	eax,ecx				; eax = output bytes still to store (< 16)
+	xor	ecx, byte 0x0F			; ecx = 15 - eax
+	shl	ecx, 2				; ... times 4: shift count in bits
+	movd	xmmB,ecx
+	psrlq	xmmH,4
+	pcmpeqb	xmmE,xmmE
+	psrlq	xmmH,xmmB
+	psrlq	xmmE,xmmB
+	punpcklbw xmmE,xmmH			; xmmE = store mask: MSB set in the first eax bytes only
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1	; ecx = misalignment of outptr (0..15)
+	jz	short .adj0			; aligned: store as is
+	add	eax,ecx
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0			; tail extends past this 16-byte block: store as is
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; bit count; the code below emulates pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmG,xmmA
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1			; shift is less than 64 bits
+	movd	xmmF,ecx
+	psllq	xmmA,xmmF
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	ecx				; ecx = 64 - shift
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF			; bits carried from the low into the high qword
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD			; per-qword left shift of the original values
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG
+	por	xmmE,xmmC
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD	; fewer than 16 columns left in this row?
+	jb	short .column_st32		; yes: store only the remaining pixels
+
+	test	edi, SIZEOF_XMMWORD-1		; is outptr 16-byte aligned?
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [edi], xmmC
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [edi], xmmH
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD	; 16 columns consumed
+	jz	near .nextrow			; row finished exactly on a 16-column boundary
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	ecx, byte SIZEOF_XMMWORD/2	; at least 8 pixels (32 bytes) left?
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC			; continue with pixels 8..B
+	movdqa	xmmD,xmmH			; and pixels C..F
+	sub	ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD/4	; at least 4 pixels (16 bytes) left?
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD			; the last 1..3 pixels come from the next vector
+	sub	ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	ecx, byte SIZEOF_XMMWORD/16	; any pixel left (ecx >= 1)?
+	jb	short .nextrow
+	mov	eax,ecx				; eax = pixels still to store (1..3)
+	xor	ecx, byte 0x03			; ecx = 3 - eax
+	inc	ecx				; ecx = 4 - eax
+	shl	ecx, 4				; ... times 16: shift count in bits
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF			; low qword keeps 2*eax bytes of 0xFF
+	punpcklbw xmmE,xmmE			; xmmE = store mask covering the first 4*eax bytes
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1	; ecx = misalignment of outptr (0..15)
+	jz	short .adj0			; aligned: store as is
+	lea	eax, [ecx+eax*4]	; RGB_PIXELSIZE
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0			; tail extends past this 16-byte block: store as is
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; bit count; the code below emulates pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1			; shift is less than 64 bits
+	movd	xmmH,ecx
+	psllq	xmmA,xmmH
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	ecx				; ecx = 64 - shift
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH			; bits carried from the low into the high qword
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC			; per-qword left shift of the original values
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB
+	por	xmmE,xmmG
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	alignx	16,7
+
+.nextrow:
+	pop	ecx				; restore the state pushed at .rowloop (reverse order)
+	pop	esi
+	pop	ebx
+	pop	edx
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; next row pointer, component 0
+	add	ebx, byte SIZEOF_JSAMPROW	; next row pointer, component 1
+	add	edx, byte SIZEOF_JSAMPROW	; next row pointer, component 2
+	add	edi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	sfence		; flush the write buffer (orders the non-temporal stores issued above)
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp (the frame address stored there by the prologue)
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jdcolmmx.asm b/jpeg/simd/jdcolmmx.asm
new file mode 100644
index 000000000000..58775e8547dc
--- /dev/null
+++ b/jpeg/simd/jdcolmmx.asm
@@ -0,0 +1,117 @@
+;
+; jdcolmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16	; FIX(x) below is x * 2^SCALEBITS, rounded to nearest
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)       = 26345
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)       = 18734
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)       = 14942
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402	times 4 dw  F_0_402	; 4 words (8 bytes) of FIX(0.40200)
+PW_MF0228	times 4 dw -F_0_228	; 4 words of -FIX(0.22800)
+PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285	; 2 x (-FIX(0.34414), FIX(0.28586)) word pairs
+PW_ONE		times 4 dw  1	; 4 words of 1
+PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)	; 2 dwords of 0.5 in SCALEBITS fixed point
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
+%include "jdclrmmx.asm"
diff --git a/jpeg/simd/jdcolss2-64.asm b/jpeg/simd/jdcolss2-64.asm
new file mode 100644
index 000000000000..2e97d5930838
--- /dev/null
+++ b/jpeg/simd/jdcolss2-64.asm
@@ -0,0 +1,117 @@
+;
+; jdcolss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16	; FIX(x) below is x * 2^SCALEBITS, rounded to nearest
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)       = 26345
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)       = 18734
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)       = 14942
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402	times 8 dw  F_0_402	; 8 words (16 bytes) of FIX(0.40200)
+PW_MF0228	times 8 dw -F_0_228	; 8 words of -FIX(0.22800)
+PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285	; 4 x (-FIX(0.34414), FIX(0.28586)) word pairs
+PW_ONE		times 8 dw  1	; 8 words of 1
+PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)	; 4 dwords of 0.5 in SCALEBITS fixed point
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdclrss2-64.asm"
diff --git a/jpeg/simd/jdcolss2.asm b/jpeg/simd/jdcolss2.asm
new file mode 100644
index 000000000000..7ae985d6a9af
--- /dev/null
+++ b/jpeg/simd/jdcolss2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolss2.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16	; FIX(x) below is x * 2^SCALEBITS, rounded to nearest
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)       = 26345
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)       = 18734
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)       = 14942
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402	times 8 dw  F_0_402	; 8 words (16 bytes) of FIX(0.40200)
+PW_MF0228	times 8 dw -F_0_228	; 8 words of -FIX(0.22800)
+PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285	; 4 x (-FIX(0.34414), FIX(0.28586)) word pairs
+PW_ONE		times 8 dw  1	; 8 words of 1
+PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)	; 4 dwords of 0.5 in SCALEBITS fixed point
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdclrss2.asm"
diff --git a/jpeg/simd/jdct.inc b/jpeg/simd/jdct.inc
new file mode 100644
index 000000000000..cc6270425840
--- /dev/null
+++ b/jpeg/simd/jdct.inc
@@ -0,0 +1,28 @@
+;
+; jdct.inc - private declarations for forward & reverse DCT subsystems
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; [TAB8]
+
+; Each IDCT routine is responsible for range-limiting its results and
+; converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
+; be quite far out of range if the input data is corrupt, so a bulletproof
+; range-limiting step is required.  We use a mask-and-table-lookup method
+; to do the combined operations quickly.
+;
+%define RANGE_MASK  (MAXJSAMPLE * 4 + 3)  ; 2 bits wider than legal samples
+
+%define ROW(n,b,s)		((b)+(n)*(s))	; base b plus n units of size s
+%define COL(n,b,s)		((b)+(n)*(s)*DCTSIZE)	; base b plus n*DCTSIZE units of size s
+
+%define DWBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)	; b + m*DCTSIZE units of size s + n dwords
+%define MMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)	; same, stepping n MMWORDs
+%define XMMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)	; same, stepping n XMMWORDs
+
+; --------------------------------------------------------------------------
diff --git a/jpeg/simd/jdmermmx.asm b/jpeg/simd/jdmermmx.asm
new file mode 100644
index 000000000000..fd587fbc1247
--- /dev/null
+++ b/jpeg/simd/jdmermmx.asm
@@ -0,0 +1,123 @@
+;
+; jdmermmx.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16	; FIX(x) below is x * 2^SCALEBITS, rounded to nearest
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)       = 26345
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)       = 18734
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)       = 14942
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_merged_upsample_mmx)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402	times 4 dw  F_0_402	; 4 words (8 bytes) of FIX(0.40200)
+PW_MF0228	times 4 dw -F_0_228	; 4 words of -FIX(0.22800)
+PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285	; 2 x (-FIX(0.34414), FIX(0.28586)) word pairs
+PW_ONE		times 4 dw  1	; 4 words of 1
+PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)	; 2 dwords of 0.5 in SCALEBITS fixed point
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
+%include "jdmrgmmx.asm"
diff --git a/jpeg/simd/jdmerss2-64.asm b/jpeg/simd/jdmerss2-64.asm
new file mode 100644
index 000000000000..1f0b10faac8d
--- /dev/null
+++ b/jpeg/simd/jdmerss2-64.asm
@@ -0,0 +1,123 @@
+;
+; jdmerss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16		; fraction bits of the FIX() fixed-point constants below
+
+F_0_344	equ	 22554			; FIX(0.34414), rounded to nearest
+F_0_714	equ	 46802			; FIX(0.71414), rounded to nearest
+F_1_402	equ	 91881			; FIX(1.40200), rounded to nearest
+F_1_772	equ	116130			; FIX(1.77200), rounded to nearest
+F_0_402	equ	(F_1_402 - (1 << SCALEBITS))	; FIX(1.40200) - FIX(1)
+F_0_285	equ	((1 << SCALEBITS) - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	((2 << SCALEBITS) - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16			; every row below is 16 bytes, i.e. one XMMWORD, and is used as an SSE2 memory operand
+	global	EXTN(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402	times 8 dw  F_0_402	; 8 words; pmulhw multiplier for the doubled Cr terms in jdmrgss2-64.asm
+PW_MF0228	times 8 dw -F_0_228	; 8 words; pmulhw multiplier for the doubled Cb terms
+PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285	; 4 (Cb,Cr) weight pairs; pmaddwd operand for the interleaved Cb/Cr words
+PW_ONE		times 8 dw  1	; 8 words of 1; added just before the "psraw ..,1" halving
+PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)	; 4 dwords of 0.5 in SCALEBITS fixed point; added just before "psrad ..,SCALEBITS"
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdmrgss2-64.asm"
+; The include above is assembled with the RGB_* values already in effect (set outside this file) and the unsuffixed routine names.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0		; extrgb variant; RGB_RED/GREEN/BLUE look like per-pixel byte offsets (their consumer is not in this file) -- TODO confirm
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3		; 3, so jdmrgss2-64.asm assembles its "%if RGB_PIXELSIZE == 3" store path
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2	; rename the label/global that the include defines
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2	; likewise for the h2v2 name
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0		; extrgbx variant: extrgb values with a 4-byte pixel
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4		; not 3, so jdmrgss2-64.asm assembles its "%else ; RGB_PIXELSIZE == 4" store path
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2	; exported name for this copy of the h2v1 routine
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2	; exported name for this copy of the h2v2 routine
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2		; extbgr variant: extrgb values with RED and BLUE swapped
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3		; 3-byte store path
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2	; exported name for this copy of the h2v1 routine
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2	; exported name for this copy of the h2v2 routine
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2		; extbgrx variant: extbgr values with a 4-byte pixel
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4		; 4-byte store path
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2	; exported name for this copy of the h2v1 routine
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2	; exported name for this copy of the h2v2 routine
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3		; extxbgr variant: colour values 3,2,1 leave 0 unused -- presumably the filler byte comes first; TODO confirm
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4		; 4-byte store path
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2	; exported name for this copy of the h2v1 routine
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2	; exported name for this copy of the h2v2 routine
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1		; extxrgb variant: colour values 1,2,3 leave 0 unused -- presumably the filler byte comes first; TODO confirm
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4		; 4-byte store path
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2	; exported name for this copy of the h2v1 routine
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2	; exported name for this copy of the h2v2 routine
+%include "jdmrgss2-64.asm"
diff --git a/jpeg/simd/jdmerss2.asm b/jpeg/simd/jdmerss2.asm
new file mode 100644
index 000000000000..2294e0d3ef31
--- /dev/null
+++ b/jpeg/simd/jdmerss2.asm
@@ -0,0 +1,123 @@
+;
+; jdmerss2.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16		; fraction bits of the FIX() fixed-point constants below
+
+F_0_344	equ	 22554			; FIX(0.34414), rounded to nearest
+F_0_714	equ	 46802			; FIX(0.71414), rounded to nearest
+F_1_402	equ	 91881			; FIX(1.40200), rounded to nearest
+F_1_772	equ	116130			; FIX(1.77200), rounded to nearest
+F_0_402	equ	(F_1_402 - (1 << SCALEBITS))	; FIX(1.40200) - FIX(1)
+F_0_285	equ	((1 << SCALEBITS) - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	((2 << SCALEBITS) - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16			; every row below is 16 bytes, i.e. one XMMWORD
+	global	EXTN(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402	times 8 dw  F_0_402	; 8 words of FIX(0.402); presumably the Cr multiplier in the included jdmrgss2.asm -- TODO confirm
+PW_MF0228	times 8 dw -F_0_228	; 8 words of -FIX(0.228); presumably the Cb multiplier -- TODO confirm
+PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285	; 4 interleaved (-FIX(0.344), FIX(0.285)) word pairs
+PW_ONE		times 8 dw  1	; 8 words of 1
+PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)	; 4 dwords of 0.5 in SCALEBITS fixed point (a rounding term)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdmrgss2.asm"
+; The include above is assembled with the RGB_* values already in effect (set outside this file) and the unsuffixed routine names.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0		; extrgb variant; RGB_RED/GREEN/BLUE look like per-pixel byte offsets (their consumer is not in this file) -- TODO confirm
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3		; 3 bytes per pixel; presumably selects the 3-byte store path of jdmrgss2.asm -- TODO confirm
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2	; wherever the include spells the left-hand name, this variant's name is substituted
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2	; likewise for the h2v2 name
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0		; extrgbx variant: extrgb values with a 4-byte pixel
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4		; 4 bytes per pixel
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2	; name substituted for this copy of the h2v1 routine
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2	; name substituted for this copy of the h2v2 routine
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2		; extbgr variant: extrgb values with RED and BLUE swapped
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3		; 3 bytes per pixel
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2	; name substituted for this copy of the h2v1 routine
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2	; name substituted for this copy of the h2v2 routine
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2		; extbgrx variant: extbgr values with a 4-byte pixel
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4		; 4 bytes per pixel
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2	; name substituted for this copy of the h2v1 routine
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2	; name substituted for this copy of the h2v2 routine
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3		; extxbgr variant: colour values 3,2,1 leave 0 unused -- presumably the filler byte comes first; TODO confirm
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4		; 4 bytes per pixel
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2	; name substituted for this copy of the h2v1 routine
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2	; name substituted for this copy of the h2v2 routine
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1		; extxrgb variant: colour values 1,2,3 leave 0 unused -- presumably the filler byte comes first; TODO confirm
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4		; 4 bytes per pixel
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2	; name substituted for this copy of the h2v1 routine
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2	; name substituted for this copy of the h2v2 routine
+%include "jdmrgss2.asm"
diff --git a/jpeg/simd/jdmrgmmx.asm b/jpeg/simd/jdmrgmmx.asm
new file mode 100644
index 000000000000..b5777a3e165a
--- /dev/null
+++ b/jpeg/simd/jdmrgmmx.asm
@@ -0,0 +1,466 @@
+;
+; jdmrgmmx.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)	(b)+8			; JDIMENSION output_width
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+%define original_ebp	ebp+0			; slot at [aligned ebp] that holds the pre-alignment esp
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		3			; wk(0)=(B-Y)H, wk(1)=(R-Y)H, wk(2)=(G-Y)H
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+; Each .columnloop pass reads one MMWORD of Cb and of Cr and feeds two Yloop passes (Yctr=2), one MMWORD of Y each.
+	align	16
+	global	EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (unaligned frame base; the arguments are read through eax)
+	sub	esp, byte 4			; room for the saved frame base stored two lines below
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [original_ebp] = unaligned frame base, restored by "pop esp" at .return
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[0..WK_NUM-1] below the aligned ebp
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [output_width(eax)]	; col
+	test	ecx,ecx
+	jz	near .return		; zero width: no MMX register touched yet, so emms is skipped
+
+	push	ecx			; park col while ecx serves as the row index
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]	; esi = input_buf[0] (rows read as Y below)
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]	; ebx = input_buf[1] (rows read as Cb below)
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]	; edx = input_buf[2] (rows read as Cr below)
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
+	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
+	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
+	mov	edi, JSAMPROW [edi]				; outptr
+
+	pop	ecx			; col
+
+	alignx	16,7
+.columnloop:
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	movq      mm6, MMWORD [ebx]	; mm6=Cb(01234567)
+	movq      mm7, MMWORD [edx]	; mm7=Cr(01234567)
+
+	pxor      mm1,mm1		; mm1=(all 0's)
+	pcmpeqw   mm3,mm3
+	psllw     mm3,7			; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	movq      mm4,mm6
+	punpckhbw mm6,mm1		; mm6=Cb(4567)=CbH
+	punpcklbw mm4,mm1		; mm4=Cb(0123)=CbL
+	movq      mm0,mm7
+	punpckhbw mm7,mm1		; mm7=Cr(4567)=CrH
+	punpcklbw mm0,mm1		; mm0=Cr(0123)=CrL
+
+	paddw     mm6,mm3		; adding 0xFF80 (-128) re-centres the chroma samples on zero
+	paddw     mm4,mm3
+	paddw     mm7,mm3
+	paddw     mm0,mm3
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm5,mm6			; mm5=CbH
+	movq	mm2,mm4			; mm2=CbL
+	paddw	mm6,mm6			; mm6=2*CbH
+	paddw	mm4,mm4			; mm4=2*CbL
+	movq	mm1,mm7			; mm1=CrH
+	movq	mm3,mm0			; mm3=CrL
+	paddw	mm7,mm7			; mm7=2*CrH
+	paddw	mm0,mm0			; mm0=2*CrL
+
+	pmulhw	mm6,[GOTOFF(eax,PW_MF0228)]	; mm6=(2*CbH * -FIX(0.22800))
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbL * -FIX(0.22800))
+	pmulhw	mm7,[GOTOFF(eax,PW_F0402)]	; mm7=(2*CrH * FIX(0.40200))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrL * FIX(0.40200))
+
+	paddw	mm6,[GOTOFF(eax,PW_ONE)]	; rounding term ahead of the halving shift below
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]
+	psraw	mm6,1			; mm6=(CbH * -FIX(0.22800))
+	psraw	mm4,1			; mm4=(CbL * -FIX(0.22800))
+	paddw	mm7,[GOTOFF(eax,PW_ONE)]
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	psraw	mm7,1			; mm7=(CrH * FIX(0.40200))
+	psraw	mm0,1			; mm0=(CrL * FIX(0.40200))
+
+	paddw	mm6,mm5			; first of the two "+ Cb" terms
+	paddw	mm4,mm2
+	paddw	mm6,mm5			; mm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	mm4,mm2			; mm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	mm7,mm1			; mm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	mm0,mm3			; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movq	MMWORD [wk(0)], mm6	; wk(0)=(B-Y)H
+	movq	MMWORD [wk(1)], mm7	; wk(1)=(R-Y)H
+
+	movq      mm6,mm5		; mm6=CbH (copy for the high word pair)
+	movq      mm7,mm2		; mm7=CbL (copy for the high word pair)
+	punpcklwd mm5,mm1		; mm5=(Cb4 Cr4 Cb5 Cr5)
+	punpckhwd mm6,mm1		; mm6=(Cb6 Cr6 Cb7 Cr7)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]	; per dword: Cb*-FIX(0.344)+Cr*FIX(0.285)
+	pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm2,mm3		; mm2=(Cb0 Cr0 Cb1 Cr1)
+	punpckhwd mm7,mm3		; mm7=(Cb2 Cr2 Cb3 Cr3)
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]	; rounding term ahead of the SCALEBITS descale
+	paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm5,SCALEBITS
+	psrad     mm6,SCALEBITS
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm7,SCALEBITS
+
+	packssdw  mm5,mm6	; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  mm2,mm7	; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     mm5,mm1	; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     mm2,mm3	; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movq	MMWORD [wk(2)], mm5	; wk(2)=(G-Y)H
+
+	mov	al,2			; Yctr (overwrites the low byte of eax; eax is reloaded at .columnloop)
+	jmp	short .Yloop_1st	; first pass uses the L halves still live in mm0/mm2/mm4
+	alignx	16,7
+
+.Yloop_2nd:				; second pass: fetch the H halves spilled to wk[]
+	movq	mm0, MMWORD [wk(1)]	; mm0=(R-Y)H
+	movq	mm2, MMWORD [wk(2)]	; mm2=(G-Y)H
+	movq	mm4, MMWORD [wk(0)]	; mm4=(B-Y)H
+	alignx	16,7
+
+.Yloop_1st:
+	movq	mm7, MMWORD [esi]	; mm7=Y(01234567)
+
+	pcmpeqw	mm6,mm6
+	psrlw	mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	mm6,mm7			; mm6=Y(0246)=YE
+	psrlw	mm7,BYTE_BIT		; mm7=Y(1357)=YO
+
+	movq	mm1,mm0			; mm1=mm0=(R-Y)(L/H)
+	movq	mm3,mm2			; mm3=mm2=(G-Y)(L/H)
+	movq	mm5,mm4			; mm5=mm4=(B-Y)(L/H)
+
+	paddw     mm0,mm6		; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm7		; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm6		; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm7		; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4,mm6		; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+	paddw     mm5,mm7		; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+	; mmA..mmH are register aliases defined outside this file; in "cp" below, c = byte within the pixel, p = pixel index
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16	; fewer than SIZEOF_MMWORD columns left: partial store
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	near .endcolumn		; every column has been written
+
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE (ecx = bytes still to store)
+	cmp	ecx, byte 2*SIZEOF_MMWORD
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmA,mmC			; the pending bytes continue in mmC
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4	; less than one MMWORD can remain (fewer than SIZEOF_MMWORD columns came in)
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmE			; the pending bytes continue in mmE
+	sub	ecx, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmA			; eax = low dword of the pending bytes
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [edi+0*SIZEOF_DWORD], eax
+	psrlq	mmA,DWORD_BIT		; bring the next dword down
+	movd	eax,mmA
+	sub	ecx, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi+0*SIZEOF_WORD], ax
+	shr	eax,WORD_BIT		; bring the next pending byte into al
+	sub	ecx, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .endcolumn
+	mov	BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16	; fewer than SIZEOF_MMWORD columns left: partial store
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .endcolumn	; every column has been written
+
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2	; ecx still counts pixels here; two MMWORDs hold SIZEOF_MMWORD/2 pixels
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmA,mmC			; the pending pixels continue in mmC, then mmH
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4	; one MMWORD holds SIZEOF_MMWORD/4 pixels
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8	; a single pixel left?
+	jb	short .endcolumn
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)	(b)+8			; JDIMENSION output_width
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+; Implemented as two calls to the h2v1 routine: luma rows 2*ctr and 2*ctr+1 are each paired with chroma row ctr.
+	align	16
+	global	EXTN(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	eax, JDIMENSION [output_width(ebp)]
+
+	mov	edi, JSAMPIMAGE [input_buf(ebp)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(ebp)]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]	; pre-offset by ctr; the callee indexes by ctr again, reaching row 2*ctr
+
+	push	edx			; inptr2
+	push	ebx			; inptr1
+	push	esi			; inptr00
+	mov	ebx,esp			; ebx = the 3-entry array just pushed, passed as input_buf
+
+	push	edi			; output_buf (outptr0)
+	push	ecx			; in_row_group_ctr
+	push	ebx			; input_buf
+	push	eax			; output_width
+
+	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)	; the callee saves/restores ebx, esi and edi, which are reused below
+
+	add	esi, byte SIZEOF_JSAMPROW	; inptr01
+	add	edi, byte SIZEOF_JSAMPROW	; outptr1
+	mov	POINTER [ebx+0*SIZEOF_POINTER], esi	; patch entry 0 of the temporary array
+	mov	POINTER [ebx-1*SIZEOF_POINTER], edi	; patch the output_buf argument, pushed directly below the array
+
+	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)	; same width/ctr arguments, next luma and output rows
+
+	add	esp, byte 7*SIZEOF_DWORD	; drop the 4 arguments and the 3-entry array
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jdmrgss2-64.asm b/jpeg/simd/jdmrgss2-64.asm
new file mode 100644
index 000000000000..121bb82bc55a
--- /dev/null
+++ b/jpeg/simd/jdmrgss2-64.asm
@@ -0,0 +1,584 @@
+;
+; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+				
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+; r10 = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		3
+
+	align	16
+	global	EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	collect_args
+	push	rbx
+
+	mov	rcx, r10	; col
+	test	rcx,rcx
+	jz	near .return
+
+	push	rcx
+
+	mov	rdi, r11
+	mov	rcx, r12
+	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+	mov	rdi, r13
+	mov	rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]		; inptr0
+	mov	rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]		; inptr1
+	mov	rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]		; inptr2
+	mov	rdi, JSAMPROW [rdi]				; outptr
+
+	pop	rcx			; col
+
+.columnloop:
+
+	movdqa    xmm6, XMMWORD [rbx]	; xmm6=Cb(0123456789ABCDEF)
+	movdqa    xmm7, XMMWORD [rdx]	; xmm7=Cr(0123456789ABCDEF)
+
+	pxor      xmm1,xmm1		; xmm1=(all 0's)
+	pcmpeqw   xmm3,xmm3
+	psllw     xmm3,7		; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	movdqa    xmm4,xmm6
+	punpckhbw xmm6,xmm1		; xmm6=Cb(89ABCDEF)=CbH
+	punpcklbw xmm4,xmm1		; xmm4=Cb(01234567)=CbL
+	movdqa    xmm0,xmm7
+	punpckhbw xmm7,xmm1		; xmm7=Cr(89ABCDEF)=CrH
+	punpcklbw xmm0,xmm1		; xmm0=Cr(01234567)=CrL
+
+	paddw     xmm6,xmm3
+	paddw     xmm4,xmm3
+	paddw     xmm7,xmm3
+	paddw     xmm0,xmm3
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm5,xmm6		; xmm5=CbH
+	movdqa	xmm2,xmm4		; xmm2=CbL
+	paddw	xmm6,xmm6		; xmm6=2*CbH
+	paddw	xmm4,xmm4		; xmm4=2*CbL
+	movdqa	xmm1,xmm7		; xmm1=CrH
+	movdqa	xmm3,xmm0		; xmm3=CrL
+	paddw	xmm7,xmm7		; xmm7=2*CrH
+	paddw	xmm0,xmm0		; xmm0=2*CrL
+
+	pmulhw	xmm6,[rel PW_MF0228]	; xmm6=(2*CbH * -FIX(0.22800))
+	pmulhw	xmm4,[rel PW_MF0228]	; xmm4=(2*CbL * -FIX(0.22800))
+	pmulhw	xmm7,[rel PW_F0402]	; xmm7=(2*CrH * FIX(0.40200))
+	pmulhw	xmm0,[rel PW_F0402]	; xmm0=(2*CrL * FIX(0.40200))
+
+	paddw	xmm6,[rel PW_ONE]
+	paddw	xmm4,[rel PW_ONE]
+	psraw	xmm6,1			; xmm6=(CbH * -FIX(0.22800))
+	psraw	xmm4,1			; xmm4=(CbL * -FIX(0.22800))
+	paddw	xmm7,[rel PW_ONE]
+	paddw	xmm0,[rel PW_ONE]
+	psraw	xmm7,1			; xmm7=(CrH * FIX(0.40200))
+	psraw	xmm0,1			; xmm0=(CrL * FIX(0.40200))
+
+	paddw	xmm6,xmm5
+	paddw	xmm4,xmm2
+	paddw	xmm6,xmm5		; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	xmm4,xmm2		; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	xmm7,xmm1		; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	xmm0,xmm3		; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=(B-Y)H
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(R-Y)H
+
+	movdqa    xmm6,xmm5
+	movdqa    xmm7,xmm2
+	punpcklwd xmm5,xmm1
+	punpckhwd xmm6,xmm1
+	pmaddwd   xmm5,[rel PW_MF0344_F0285]
+	pmaddwd   xmm6,[rel PW_MF0344_F0285]
+	punpcklwd xmm2,xmm3
+	punpckhwd xmm7,xmm3
+	pmaddwd   xmm2,[rel PW_MF0344_F0285]
+	pmaddwd   xmm7,[rel PW_MF0344_F0285]
+
+	paddd     xmm5,[rel PD_ONEHALF]
+	paddd     xmm6,[rel PD_ONEHALF]
+	psrad     xmm5,SCALEBITS
+	psrad     xmm6,SCALEBITS
+	paddd     xmm2,[rel PD_ONEHALF]
+	paddd     xmm7,[rel PD_ONEHALF]
+	psrad     xmm2,SCALEBITS
+	psrad     xmm7,SCALEBITS
+
+	packssdw  xmm5,xmm6	; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  xmm2,xmm7	; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     xmm5,xmm1	; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     xmm2,xmm3	; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movdqa	XMMWORD [wk(2)], xmm5	; wk(2)=(G-Y)H
+
+	mov	al,2			; Yctr
+	jmp	short .Yloop_1st
+
+.Yloop_2nd:
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(R-Y)H
+	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(G-Y)H
+	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(B-Y)H
+
+.Yloop_1st:
+	movdqa	xmm7, XMMWORD [rsi]	; xmm7=Y(0123456789ABCDEF)
+
+	pcmpeqw	xmm6,xmm6
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	xmm6,xmm7		; xmm6=Y(02468ACE)=YE
+	psrlw	xmm7,BYTE_BIT		; xmm7=Y(13579BDF)=YO
+
+	movdqa	xmm1,xmm0		; xmm1=xmm0=(R-Y)(L/H)
+	movdqa	xmm3,xmm2		; xmm3=xmm2=(G-Y)(L/H)
+	movdqa	xmm5,xmm4		; xmm5=xmm4=(B-Y)(L/H)
+
+	paddw     xmm0,xmm6		; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm7		; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm6		; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm7		; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4,xmm6		; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+	paddw     xmm5,xmm7		; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	rdi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [rdi], xmmF
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	rcx, byte SIZEOF_XMMWORD
+	jz	near .endcolumn
+
+	add	rsi, byte SIZEOF_XMMWORD	; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	rbx, byte SIZEOF_XMMWORD	; inptr1
+	add	rdx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+
+.column_st32:
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	rcx, byte 2*SIZEOF_XMMWORD
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF
+	sub	rcx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	rcx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	rax,rcx
+	xor	rcx, byte 0x0F
+	shl	rcx, 2
+	movd	xmmB,ecx
+	psrlq	xmmH,4
+	pcmpeqb	xmmE,xmmE
+	psrlq	xmmH,xmmB
+	psrlq	xmmE,xmmB
+	punpcklbw xmmE,xmmH
+	; ----------------
+	mov	rcx,rdi
+	and	rcx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	add	rax,rcx
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmG,xmmA
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx
+	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmF,ecx
+	psllq	xmmA,xmmF
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	rcx
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG
+	por	xmmE,xmmC
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	rdi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [rdi], xmmC
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [rdi], xmmH
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	rcx, byte SIZEOF_XMMWORD
+	jz	near .endcolumn
+
+	add	rsi, byte SIZEOF_XMMWORD	; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	rbx, byte SIZEOF_XMMWORD	; inptr1
+	add	rdx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+
+.column_st32:
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	rcx, byte SIZEOF_XMMWORD/2
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC
+	movdqa	xmmD,xmmH
+	sub	rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	rcx, byte SIZEOF_XMMWORD/4
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	rcx, byte SIZEOF_XMMWORD/16
+	jb	near .endcolumn
+	mov	rax,rcx
+	xor	rcx, byte 0x03
+	inc	rcx
+	shl	rcx, 4
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF
+	punpcklbw xmmE,xmmE
+	; ----------------
+	mov	rcx,rdi
+	and	rcx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	lea	rax, [rcx+rax*4]	; RGB_PIXELSIZE
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx
+	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmH,ecx
+	psllq	xmmA,xmmH
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	rcx
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB
+	por	xmmE,xmmG
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	sfence		; flush the write buffer
+
+.return:
+	pop	rbx
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+; Calls jsimd_h2v1_merged_upsample_sse2 twice, once per output row.
+; r10 = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+; JDIMENSION arguments are 32 bits wide, so only r10d/r12d are read below.
+	align	16
+	global	EXTN(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+	push	rbp
+	mov	rax,rsp			; kept as-is; presumably read by collect_args (verify in jsimdext.inc)
+	mov	rbp,rsp
+	collect_args
+	push	rbx
+
+	mov	eax, r10d		; output_width; 32-bit move zero-extends into rax
+
+	mov	rdi, r11
+	mov	ecx, r12d		; in_row_group_ctr; zero-extended, rcx is used as an index
+	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]	; rsi = input_buf[0]
+	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]	; rbx = input_buf[1]
+	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]	; rdx = input_buf[2]
+	mov	rdi, r13		; rdi = output_buf
+	lea	rsi, [rsi+rcx*SIZEOF_JSAMPROW]	; rsi = &input_buf[0][in_row_group_ctr]
+
+	push	rdx			; inptr2
+	push	rbx			; inptr1
+	push	rsi			; inptr00
+	mov	rbx,rsp			; rbx -> temporary 3-entry array passed as input_buf
+
+	push	rdi			; keep output_buf across the call
+	push	rcx			; keep in_row_group_ctr across the call
+	push	rax			; keep output_width across the call
+
+	%ifdef WIN64
+	mov	r8, rcx			; arg3 = in_row_group_ctr (read before rcx is overwritten)
+	mov	r9, rdi			; arg4 = output_buf
+	mov	rcx, rax		; arg1 = output_width
+	mov	rdx, rbx		; arg2 = temporary input_buf
+	%else
+	mov	rdx, rcx		; arg3 = in_row_group_ctr
+	mov	rcx, rdi		; arg4 = output_buf
+	mov	rdi, rax		; arg1 = output_width
+	mov	rsi, rbx		; arg2 = temporary input_buf
+	%endif
+
+	call	EXTN(jsimd_h2v1_merged_upsample_sse2)	; first output row
+
+	pop	rax			; output_width
+	pop	rcx			; in_row_group_ctr
+	pop	rdi			; output_buf
+	pop	rsi			; inptr00
+	pop	rbx			; inptr1
+	pop	rdx			; inptr2
+
+	add	rdi, byte SIZEOF_JSAMPROW	; outptr1
+	add	rsi, byte SIZEOF_JSAMPROW	; inptr01
+
+	push	rdx			; inptr2
+	push	rbx			; inptr1
+	push	rsi			; inptr00
+	mov	rbx,rsp			; rbx -> rebuilt temporary input_buf array
+
+	push	rdi			; pushed only to mirror the pops below
+	push	rcx
+	push	rax
+
+	%ifdef WIN64
+	mov	r8, rcx			; arg3 = in_row_group_ctr
+	mov	r9, rdi			; arg4 = output_buf + 1 row
+	mov	rcx, rax		; arg1 = output_width
+	mov	rdx, rbx		; arg2 = temporary input_buf
+	%else
+	mov	rdx, rcx		; arg3 = in_row_group_ctr
+	mov	rcx, rdi		; arg4 = output_buf + 1 row
+	mov	rdi, rax		; arg1 = output_width
+	mov	rsi, rbx		; arg2 = temporary input_buf
+	%endif
+
+	call	EXTN(jsimd_h2v1_merged_upsample_sse2)	; second output row
+
+	pop	rax			; discard the three saved values ...
+	pop	rcx
+	pop	rdi
+	pop	rsi			; ... and the three temporary array entries
+	pop	rbx
+	pop	rdx
+
+	pop	rbx			; restore caller's rbx
+	uncollect_args
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jdmrgss2.asm b/jpeg/simd/jdmrgss2.asm
new file mode 100644
index 000000000000..99b7eb9f0f34
--- /dev/null
+++ b/jpeg/simd/jdmrgss2.asm
@@ -0,0 +1,564 @@
+;
+; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+				
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+; Each block of SIZEOF_XMMWORD Cb/Cr samples is converted once and combined
+; with two consecutive Y blocks (Yctr=2); a partial final block is mask-stored.
+%define output_width(b)	(b)+8			; JDIMENSION output_width
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		3
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; room for the saved copy stored below
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; [original_ebp] = original ebp
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[WK_NUM] below aligned ebp
+	pushpic	eax		; make room for the GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [output_width(eax)]	; col
+	test	ecx,ecx
+	jz	near .return		; zero width: nothing to convert
+
+	push	ecx			; save col; ecx is reused as the row index
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]	; esi = input_buf[0]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]	; ebx = input_buf[1]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]	; edx = input_buf[2]
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
+	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
+	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
+	mov	edi, JSAMPROW [edi]				; outptr
+
+	pop	ecx			; col
+
+	alignx	16,7
+.columnloop:
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	movdqa    xmm6, XMMWORD [ebx]	; xmm6=Cb(0123456789ABCDEF)
+	movdqa    xmm7, XMMWORD [edx]	; xmm7=Cr(0123456789ABCDEF)
+
+	pxor      xmm1,xmm1		; xmm1=(all 0's)
+	pcmpeqw   xmm3,xmm3
+	psllw     xmm3,7		; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	movdqa    xmm4,xmm6
+	punpckhbw xmm6,xmm1		; xmm6=Cb(89ABCDEF)=CbH
+	punpcklbw xmm4,xmm1		; xmm4=Cb(01234567)=CbL
+	movdqa    xmm0,xmm7
+	punpckhbw xmm7,xmm1		; xmm7=Cr(89ABCDEF)=CrH
+	punpcklbw xmm0,xmm1		; xmm0=Cr(01234567)=CrL
+
+	paddw     xmm6,xmm3		; xmm6=CbH-128 (0xFF80 is -128 as a word)
+	paddw     xmm4,xmm3		; xmm4=CbL-128
+	paddw     xmm7,xmm3		; xmm7=CrH-128
+	paddw     xmm0,xmm3		; xmm0=CrL-128
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm5,xmm6		; xmm5=CbH
+	movdqa	xmm2,xmm4		; xmm2=CbL
+	paddw	xmm6,xmm6		; xmm6=2*CbH
+	paddw	xmm4,xmm4		; xmm4=2*CbL
+	movdqa	xmm1,xmm7		; xmm1=CrH
+	movdqa	xmm3,xmm0		; xmm3=CrL
+	paddw	xmm7,xmm7		; xmm7=2*CrH
+	paddw	xmm0,xmm0		; xmm0=2*CrL
+
+	pmulhw	xmm6,[GOTOFF(eax,PW_MF0228)]	; xmm6=(2*CbH * -FIX(0.22800))
+	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbL * -FIX(0.22800))
+	pmulhw	xmm7,[GOTOFF(eax,PW_F0402)]	; xmm7=(2*CrH * FIX(0.40200))
+	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrL * FIX(0.40200))
+
+	paddw	xmm6,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm4,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm6,1			; xmm6=(CbH * -FIX(0.22800))
+	psraw	xmm4,1			; xmm4=(CbL * -FIX(0.22800))
+	paddw	xmm7,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm0,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm7,1			; xmm7=(CrH * FIX(0.40200))
+	psraw	xmm0,1			; xmm0=(CrL * FIX(0.40200))
+
+	paddw	xmm6,xmm5
+	paddw	xmm4,xmm2
+	paddw	xmm6,xmm5		; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	xmm4,xmm2		; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	xmm7,xmm1		; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	xmm0,xmm3		; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=(B-Y)H
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(R-Y)H
+
+	movdqa    xmm6,xmm5		; xmm6=CbH
+	movdqa    xmm7,xmm2		; xmm7=CbL
+	punpcklwd xmm5,xmm1		; interleave CbH/CrH words (low half)
+	punpckhwd xmm6,xmm1		; interleave CbH/CrH words (high half)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd xmm2,xmm3		; interleave CbL/CrL words (low half)
+	punpckhwd xmm7,xmm3		; interleave CbL/CrL words (high half)
+	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]	; add rounding constant before descaling
+	paddd     xmm6,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm5,SCALEBITS
+	psrad     xmm6,SCALEBITS
+	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm7,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm2,SCALEBITS
+	psrad     xmm7,SCALEBITS
+
+	packssdw  xmm5,xmm6	; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  xmm2,xmm7	; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     xmm5,xmm1	; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     xmm2,xmm3	; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movdqa	XMMWORD [wk(2)], xmm5	; wk(2)=(G-Y)H
+
+	mov	al,2			; Yctr
+	jmp	short .Yloop_1st	; 1st pass uses the L halves left in xmm0/xmm2/xmm4
+	alignx	16,7
+
+.Yloop_2nd:
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(R-Y)H
+	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(G-Y)H
+	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(B-Y)H
+	alignx	16,7
+
+.Yloop_1st:
+	movdqa	xmm7, XMMWORD [esi]	; xmm7=Y(0123456789ABCDEF)
+
+	pcmpeqw	xmm6,xmm6
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	xmm6,xmm7		; xmm6=Y(02468ACE)=YE
+	psrlw	xmm7,BYTE_BIT		; xmm7=Y(13579BDF)=YO
+
+	movdqa	xmm1,xmm0		; xmm1=xmm0=(R-Y)(L/H)
+	movdqa	xmm3,xmm2		; xmm3=xmm2=(G-Y)(L/H)
+	movdqa	xmm5,xmm4		; xmm5=xmm4=(B-Y)(L/H)
+
+	paddw     xmm0,xmm6		; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm7		; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm6		; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm7		; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4,xmm6		; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+	paddw     xmm5,xmm7		; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD	; fewer than SIZEOF_XMMWORD columns left?
+	jb	short .column_st32		; yes: partial store
+
+	test	edi, SIZEOF_XMMWORD-1		; is outptr 16-byte aligned?
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [edi], xmmF
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD	; col -= SIZEOF_XMMWORD
+	jz	near .endcolumn			; row finished
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_XMMWORD	; at least two xmmwords of bytes left?
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF			; remainder comes from the third xmmword
+	sub	ecx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD	; at least one xmmword of bytes left?
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD			; remainder comes from the second xmmword
+	sub	ecx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	eax,ecx				; eax = remaining bytes (< SIZEOF_XMMWORD)
+	xor	ecx, byte 0x0F
+	shl	ecx, 2				; ecx = 4*(15 - remaining bytes)
+	movd	xmmB,ecx
+	psrlq	xmmH,4
+	pcmpeqb	xmmE,xmmE
+	psrlq	xmmH,xmmB
+	psrlq	xmmE,xmmB
+	punpcklbw xmmE,xmmH			; xmmE = store mask: top bit set in the first eax bytes
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1	; ecx = misalignment of outptr
+	jz	short .adj0			; already aligned: store as-is
+	add	eax,ecx
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0			; data runs past the aligned block: store as-is
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmG,xmmA
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1			; shift is less than half an xmmword
+	movd	xmmF,ecx
+	psllq	xmmA,xmmF			; half-xmmword byte shift plus the remaining bits
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF			; bits carried from the low into the high qword
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD			; per-qword left shift by the bit count
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG			; xmmA = data shifted left by the misalignment
+	por	xmmE,xmmC			; xmmE = mask shifted left by the misalignment
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD	; fewer than SIZEOF_XMMWORD columns left?
+	jb	short .column_st32		; yes: partial store
+
+	test	edi, SIZEOF_XMMWORD-1		; is outptr 16-byte aligned?
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [edi], xmmC
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [edi], xmmH
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD	; col -= SIZEOF_XMMWORD
+	jz	near .endcolumn			; row finished
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	ecx, byte SIZEOF_XMMWORD/2	; at least half a block of pixels left?
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC			; continue with the upper two xmmwords
+	movdqa	xmmD,xmmH
+	sub	ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD/4	; at least one full xmmword of pixels left?
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD			; remainder comes from the next xmmword
+	sub	ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	ecx, byte SIZEOF_XMMWORD/16	; any pixel left at all?
+	jb	short .endcolumn
+	mov	eax,ecx				; eax = remaining pixels (1..3)
+	xor	ecx, byte 0x03
+	inc	ecx
+	shl	ecx, 4				; ecx = 16*(4 - remaining pixels)
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF
+	punpcklbw xmmE,xmmE			; xmmE = store mask covering the first 4*eax bytes
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1	; ecx = misalignment of outptr
+	jz	short .adj0			; already aligned: store as-is
+	lea	eax, [ecx+eax*4]	; RGB_PIXELSIZE
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0			; data runs past the aligned block: store as-is
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1			; shift is less than half an xmmword
+	movd	xmmH,ecx
+	psllq	xmmA,xmmH			; half-xmmword byte shift plus the remaining bits
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH			; bits carried from the low into the high qword
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC			; per-qword left shift by the bit count
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB			; xmmA = data shifted left by the misalignment
+	por	xmmE,xmmG			; xmmE = mask shifted left by the misalignment
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	sfence		; flush the write buffer
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+%define output_width(b)	(b)+8			; JDIMENSION output_width
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+	align	16
+	global	EXTN(jsimd_h2v2_merged_upsample_sse2)
+; Produces two output rows by calling the h2v1 routine twice on one arg frame.
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	eax, POINTER [output_width(ebp)]	; output_width (NOTE(review): a JDIMENSION read via the POINTER macro)
+
+	mov	edi, JSAMPIMAGE [input_buf(ebp)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]	; esi = input_buf[0]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]	; ebx = input_buf[1]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]	; edx = input_buf[2]
+	mov	edi, JSAMPARRAY [output_buf(ebp)]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]	; esi = &input_buf[0][in_row_group_ctr]; h2v1 indexes by the counter again
+
+	push	edx			; inptr2
+	push	ebx			; inptr1
+	push	esi			; inptr00
+	mov	ebx,esp			; ebx -> temporary 3-entry array passed as input_buf
+
+	push	edi			; output_buf (outptr0)
+	push	ecx			; in_row_group_ctr
+	push	ebx			; input_buf
+	push	eax			; output_width
+
+	call	near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+	add	esi, byte SIZEOF_JSAMPROW	; inptr01
+	add	edi, byte SIZEOF_JSAMPROW	; outptr1
+	mov	POINTER [ebx+0*SIZEOF_POINTER], esi	; patch array entry 0 in place
+	mov	POINTER [ebx-1*SIZEOF_POINTER], edi	; patch the pushed output_buf argument
+
+	call	near EXTN(jsimd_h2v1_merged_upsample_sse2)	; same frame, second row
+
+	add	esp, byte 7*SIZEOF_DWORD	; drop 4 arguments + 3 array entries
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jdsammmx.asm b/jpeg/simd/jdsammmx.asm
new file mode 100644
index 000000000000..c09e5b96cd19
--- /dev/null
+++ b/jpeg/simd/jdsammmx.asm
@@ -0,0 +1,737 @@
+;
+; jdsammmx.asm - upsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):
+; Word constants, each replicated four times (times 4 dw).
+PW_ONE		times 4 dw  1
+PW_TWO		times 4 dw  2
+PW_THREE	times 4 dw  3
+PW_SEVEN	times 4 dw  7
+PW_EIGHT	times 4 dw  8
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_fancy_upsample_mmx)
+; Per row: out[2i] = (3*in[i]+in[i-1]+1)>>2, out[2i+1] = (3*in[i]+in[i+1]+2)>>2.
+EXTN(jsimd_h2v1_fancy_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+	test	eax,eax
+	jz	near .return		; zero width: nothing to do
+
+	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
+	test	ecx,ecx
+	jz	near .return		; zero rows: nothing to do
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax			; colctr
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr
+
+	test	eax, SIZEOF_MMWORD-1	; is the width a multiple of SIZEOF_MMWORD?
+	jz	short .skip
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]	; dl = last real sample
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+.skip:
+	pxor	mm0,mm0			; mm0=(all 0's)
+	pcmpeqb	mm7,mm7
+	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=mask for the lowest byte
+	pand	mm7, MMWORD [esi+0*SIZEOF_MMWORD]	; mm7=( 0 - - - - - - -): sample 0 is its own left neighbor
+
+	add	eax, byte SIZEOF_MMWORD-1
+	and	eax, byte -SIZEOF_MMWORD	; round colctr up to a multiple of SIZEOF_MMWORD
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	short .columnloop		; more than one block: normal path
+	alignx	16,7
+
+.columnloop_last:
+	pcmpeqb	mm6,mm6
+	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm6=mask for the highest byte
+	pand	mm6, MMWORD [esi+0*SIZEOF_MMWORD]	; mm6=( - - - - - - - 7): final block reuses its own byte 7
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	movq	mm6, MMWORD [esi+1*SIZEOF_MMWORD]	; next block
+	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm6=( - - - - - - - 8)
+
+.upsample:
+	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm2,mm1
+	movq	mm3,mm1			; mm1=( 0 1 2 3 4 5 6 7)
+	psllq	mm2,BYTE_BIT		; mm2=( - 0 1 2 3 4 5 6)
+	psrlq	mm3,BYTE_BIT		; mm3=( 1 2 3 4 5 6 7 -)
+
+	por	mm2,mm7			; mm2=(-1 0 1 2 3 4 5 6)
+	por	mm3,mm6			; mm3=( 1 2 3 4 5 6 7 8)
+
+	movq	mm7,mm1
+	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=( 7 - - - - - - -)
+
+	movq      mm4,mm1
+	punpcklbw mm1,mm0		; mm1=( 0 1 2 3)
+	punpckhbw mm4,mm0		; mm4=( 4 5 6 7)
+	movq      mm5,mm2
+	punpcklbw mm2,mm0		; mm2=(-1 0 1 2)
+	punpckhbw mm5,mm0		; mm5=( 3 4 5 6)
+	movq      mm6,mm3
+	punpcklbw mm3,mm0		; mm3=( 1 2 3 4)
+	punpckhbw mm6,mm0		; mm6=( 5 6 7 8)
+
+	pmullw	mm1,[GOTOFF(ebx,PW_THREE)]	; mm1=3*( 0 1 2 3)
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]	; mm4=3*( 4 5 6 7)
+	paddw	mm2,[GOTOFF(ebx,PW_ONE)]	; left neighbors + 1
+	paddw	mm5,[GOTOFF(ebx,PW_ONE)]
+	paddw	mm3,[GOTOFF(ebx,PW_TWO)]	; right neighbors + 2
+	paddw	mm6,[GOTOFF(ebx,PW_TWO)]
+
+	paddw	mm2,mm1
+	paddw	mm5,mm4
+	psrlw	mm2,2			; mm2=OutLE=( 0  2  4  6)
+	psrlw	mm5,2			; mm5=OutHE=( 8 10 12 14)
+	paddw	mm3,mm1
+	paddw	mm6,mm4
+	psrlw	mm3,2			; mm3=OutLO=( 1  3  5  7)
+	psrlw	mm6,2			; mm6=OutHO=( 9 11 13 15)
+
+	psllw	mm3,BYTE_BIT		; move odd outputs into the high byte of each word
+	psllw	mm6,BYTE_BIT
+	por	mm2,mm3			; mm2=OutL=( 0  1  2  3  4  5  6  7)
+	por	mm5,mm6			; mm5=OutH=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+	sub	eax, byte SIZEOF_MMWORD
+	add	esi, byte 1*SIZEOF_MMWORD	; inptr
+	add	edi, byte 2*SIZEOF_MMWORD	; outptr
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	near .columnloop		; more than one block left
+	test	eax,eax
+	jnz	near .columnloop_last		; exactly one block left
+
+	pop	esi
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		4
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_h2v2_fancy_upsample_mmx)
+; Per row-loop pass: one input row (plus the rows above/below) -> two output rows, each twice as wide.
+EXTN(jsimd_h2v2_fancy_upsample_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; keep >= 4 bytes below eax once esp is aligned
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [original_ebp] = eax, reloaded by "pop esp" on exit
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; esp = &wk[0], i.e. wk[WK_NUM] sits just below ebp
+	pushpic	eax		; make room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	edx,eax				; edx = original ebp
+	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+	test	eax,eax
+	jz	near .return			; zero width: nothing to do (no MMX state touched yet)
+
+	mov	ecx, INT [max_v_samp(edx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return			; zero rows: nothing to do
+
+	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
+	mov	edi, POINTER [output_data_ptr(edx)]	; edi = output_data_ptr
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx					; rowctr (ecx is reused as a row pointer below)
+	push	edi					; output_data cursor
+	push	esi					; input_data cursor
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	test	eax, SIZEOF_MMWORD-1
+	jz	short .skip				; width is a whole number of blocks: no padding sample
+	push	edx					; dl is used as scratch; keep outptr0
+	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl	; row above: copy last sample to index [width]
+	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl	; current row: same
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+	pop	edx
+.skip:
+	; -- process the first column block
+
+	movq	mm0, MMWORD [ebx+0*SIZEOF_MMWORD]	; mm0=row[ 0][0]
+	movq	mm1, MMWORD [ecx+0*SIZEOF_MMWORD]	; mm1=row[-1][0]
+	movq	mm2, MMWORD [esi+0*SIZEOF_MMWORD]	; mm2=row[+1][0]
+
+	pushpic	ebx			; park inptr0; ebx doubles as the GOT pointer
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      mm3,mm3		; mm3=(all 0's)
+	movq      mm4,mm0
+	punpcklbw mm0,mm3		; mm0=row[ 0][0]( 0 1 2 3)
+	punpckhbw mm4,mm3		; mm4=row[ 0][0]( 4 5 6 7)
+	movq      mm5,mm1
+	punpcklbw mm1,mm3		; mm1=row[-1][0]( 0 1 2 3)
+	punpckhbw mm5,mm3		; mm5=row[-1][0]( 4 5 6 7)
+	movq      mm6,mm2
+	punpcklbw mm2,mm3		; mm2=row[+1][0]( 0 1 2 3)
+	punpckhbw mm6,mm3		; mm6=row[+1][0]( 4 5 6 7)
+
+	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]	; 3 * row[0], low half
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]	; 3 * row[0], high half
+
+	pcmpeqb	mm7,mm7
+	psrlq	mm7,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm7 = mask keeping only the lowest word
+
+	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
+	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
+	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
+	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
+
+	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1	; temporarily save
+	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5	; the intermediate data
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+	pand	mm1,mm7			; mm1=( 0 - - -)
+	pand	mm2,mm7			; mm2=( 0 - - -)
+
+	movq	MMWORD [wk(0)], mm1	; wk(0) = left neighbour (-1) for the upper row: word 0 itself
+	movq	MMWORD [wk(1)], mm2	; wk(1) = left neighbour (-1) for the lower row: word 0 itself
+
+	poppic	ebx			; ebx = inptr0 again
+
+	add	eax, byte SIZEOF_MMWORD-1
+	and	eax, byte -SIZEOF_MMWORD	; eax = colctr rounded up to a whole block
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	short .columnloop		; more than one block left: fetch the next one first
+	alignx	16,7
+
+.columnloop_last:
+	; -- process the last column block
+
+	pushpic	ebx			; park inptr0; ebx doubles as the GOT pointer
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pcmpeqb	mm1,mm1
+	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm1 = mask keeping only the highest word
+	movq	mm2,mm1
+
+	pand	mm1, MMWORD [edx+1*SIZEOF_MMWORD]	; mm1=( - - - 7)
+	pand	mm2, MMWORD [edi+1*SIZEOF_MMWORD]	; mm2=( - - - 7)
+
+	movq	MMWORD [wk(2)], mm1	; wk(2) = right neighbour (+8) for the upper row: word 7 itself
+	movq	MMWORD [wk(3)], mm2	; wk(3) = right neighbour (+8) for the lower row: word 7 itself
+
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	; -- process the next column block
+
+	movq	mm0, MMWORD [ebx+1*SIZEOF_MMWORD]	; mm0=row[ 0][1]
+	movq	mm1, MMWORD [ecx+1*SIZEOF_MMWORD]	; mm1=row[-1][1]
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; mm2=row[+1][1]
+
+	pushpic	ebx			; park inptr0; ebx doubles as the GOT pointer
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      mm3,mm3		; mm3=(all 0's)
+	movq      mm4,mm0
+	punpcklbw mm0,mm3		; mm0=row[ 0][1]( 0 1 2 3)
+	punpckhbw mm4,mm3		; mm4=row[ 0][1]( 4 5 6 7)
+	movq      mm5,mm1
+	punpcklbw mm1,mm3		; mm1=row[-1][1]( 0 1 2 3)
+	punpckhbw mm5,mm3		; mm5=row[-1][1]( 4 5 6 7)
+	movq      mm6,mm2
+	punpcklbw mm2,mm3		; mm2=row[+1][1]( 0 1 2 3)
+	punpckhbw mm6,mm3		; mm6=row[+1][1]( 4 5 6 7)
+
+	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]	; 3 * row[0], low half
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]	; 3 * row[0], high half
+
+	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
+	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
+	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
+	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
+
+	movq	MMWORD [edx+2*SIZEOF_MMWORD], mm1	; temporarily save
+	movq	MMWORD [edx+3*SIZEOF_MMWORD], mm5	; the intermediate data
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm1=( - - - 0)
+	psllq	mm2,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm2=( - - - 0)
+
+	movq	MMWORD [wk(2)], mm1	; wk(2) = right neighbour (+8) for the upper row: next block's word 0
+	movq	MMWORD [wk(3)], mm2	; wk(3) = right neighbour (+8) for the lower row: next block's word 0
+
+.upsample:
+	; -- process the upper row: even = (3*Int[i]+Int[i-1]+8)>>4, odd = (3*Int[i]+Int[i+1]+7)>>4
+
+	movq	mm7, MMWORD [edx+0*SIZEOF_MMWORD]	; mm7=Int0L=( 0 1 2 3)
+	movq	mm3, MMWORD [edx+1*SIZEOF_MMWORD]	; mm3=Int0H=( 4 5 6 7)
+
+	movq	mm0,mm7
+	movq	mm4,mm3
+	psrlq	mm0,2*BYTE_BIT			; mm0=( 1 2 3 -)
+	psllq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( - - - 4)
+	movq	mm5,mm7
+	movq	mm6,mm3
+	psrlq	mm5,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm5=( 3 - - -)
+	psllq	mm6,2*BYTE_BIT			; mm6=( - 4 5 6)
+
+	por	mm0,mm4				; mm0=( 1 2 3 4)
+	por	mm5,mm6				; mm5=( 3 4 5 6)
+
+	movq	mm1,mm7
+	movq	mm2,mm3
+	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
+	psrlq	mm2,2*BYTE_BIT			; mm2=( 5 6 7 -)
+	movq	mm4,mm3
+	psrlq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( 7 - - -)
+
+	por	mm1, MMWORD [wk(0)]		; mm1=(-1 0 1 2)
+	por	mm2, MMWORD [wk(2)]		; mm2=( 5 6 7 8)
+
+	movq	MMWORD [wk(0)], mm4		; word 7 becomes the next block's left neighbour
+
+	pmullw	mm7,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm3,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm5,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm0,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	mm2,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	mm1,mm7
+	paddw	mm5,mm3
+	psrlw	mm1,4			; mm1=Out0LE=( 0  2  4  6)
+	psrlw	mm5,4			; mm5=Out0HE=( 8 10 12 14)
+	paddw	mm0,mm7
+	paddw	mm2,mm3
+	psrlw	mm0,4			; mm0=Out0LO=( 1  3  5  7)
+	psrlw	mm2,4			; mm2=Out0HO=( 9 11 13 15)
+
+	psllw	mm0,BYTE_BIT		; move odd samples into the high byte of each word
+	psllw	mm2,BYTE_BIT
+	por	mm1,mm0			; mm1=Out0L=( 0  1  2  3  4  5  6  7)
+	por	mm5,mm2			; mm5=Out0H=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1	; final samples overwrite the saved intermediates
+	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5
+
+	; -- process the lower row (same filter, applied to Int1)
+
+	movq	mm6, MMWORD [edi+0*SIZEOF_MMWORD]	; mm6=Int1L=( 0 1 2 3)
+	movq	mm4, MMWORD [edi+1*SIZEOF_MMWORD]	; mm4=Int1H=( 4 5 6 7)
+
+	movq	mm7,mm6
+	movq	mm3,mm4
+	psrlq	mm7,2*BYTE_BIT			; mm7=( 1 2 3 -)
+	psllq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( - - - 4)
+	movq	mm0,mm6
+	movq	mm2,mm4
+	psrlq	mm0,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm0=( 3 - - -)
+	psllq	mm2,2*BYTE_BIT			; mm2=( - 4 5 6)
+
+	por	mm7,mm3				; mm7=( 1 2 3 4)
+	por	mm0,mm2				; mm0=( 3 4 5 6)
+
+	movq	mm1,mm6
+	movq	mm5,mm4
+	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
+	psrlq	mm5,2*BYTE_BIT			; mm5=( 5 6 7 -)
+	movq	mm3,mm4
+	psrlq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( 7 - - -)
+
+	por	mm1, MMWORD [wk(1)]		; mm1=(-1 0 1 2)
+	por	mm5, MMWORD [wk(3)]		; mm5=( 5 6 7 8)
+
+	movq	MMWORD [wk(1)], mm3		; word 7 becomes the next block's left neighbour
+
+	pmullw	mm6,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm0,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm7,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	mm5,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	mm1,mm6
+	paddw	mm0,mm4
+	psrlw	mm1,4			; mm1=Out1LE=( 0  2  4  6)
+	psrlw	mm0,4			; mm0=Out1HE=( 8 10 12 14)
+	paddw	mm7,mm6
+	paddw	mm5,mm4
+	psrlw	mm7,4			; mm7=Out1LO=( 1  3  5  7)
+	psrlw	mm5,4			; mm5=Out1HO=( 9 11 13 15)
+
+	psllw	mm7,BYTE_BIT		; move odd samples into the high byte of each word
+	psllw	mm5,BYTE_BIT
+	por	mm1,mm7			; mm1=Out1L=( 0  1  2  3  4  5  6  7)
+	por	mm0,mm5			; mm0=Out1H=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm1	; final samples overwrite the saved intermediates
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+	poppic	ebx			; ebx = inptr0 again
+
+	sub	eax, byte SIZEOF_MMWORD		; one input block consumed
+	add	ecx, byte 1*SIZEOF_MMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_MMWORD	; inptr1(below)
+	add	edx, byte 2*SIZEOF_MMWORD	; outptr0
+	add	edi, byte 2*SIZEOF_MMWORD	; outptr1
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	near .columnloop		; at least two blocks left
+	test	eax,eax
+	jnz	near .columnloop_last		; exactly one block left
+
+	pop	esi				; input_data cursor
+	pop	edi				; output_data cursor
+	pop	ecx				; rowctr
+	pop	eax				; colctr
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
+;                          JDIMENSION output_width,
+;                          JSAMPARRAY input_data,
+;                          JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define output_width(b)	(b)+12		; JDIMENSION output_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_upsample_mmx)
+
+EXTN(jsimd_h2v1_upsample_mmx):
+	push	ebp
+	mov	ebp, esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, JDIMENSION [output_width(ebp)]
+	add	edx, byte (2*SIZEOF_MMWORD)-1
+	and	edx, byte -(2*SIZEOF_MMWORD)	; edx = width rounded up to 2*SIZEOF_MMWORD
+	jz	short .epilogue			; rounded width is zero: nothing to write
+
+	mov	ecx, INT [max_v_samp(ebp)]	; ecx = rows left
+	test	ecx, ecx
+	jz	short .epilogue			; no rows requested
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; esi -> input row pointers
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; edi -> output row pointers
+	alignx	16,7
+.each_row:
+	push	esi				; keep the row-pointer cursors
+	push	edi
+
+	mov	eax, edx			; eax = output bytes left in this row
+	mov	edi, JSAMPROW [edi]		; edi = output samples
+	mov	esi, JSAMPROW [esi]		; esi = input samples
+	alignx	16,7
+.expand_cols:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]	; first input block
+
+	movq      mm1, mm0
+	punpckhbw mm1, mm1			; upper half, every sample doubled
+	punpcklbw mm0, mm0			; lower half, every sample doubled
+
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .row_done			; row finished after the first block
+
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; second input block
+
+	movq      mm3, mm2
+	punpckhbw mm3, mm3			; upper half, every sample doubled
+	punpcklbw mm2, mm2			; lower half, every sample doubled
+
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .row_done			; row finished after the second block
+
+	add	edi, byte 4*SIZEOF_MMWORD	; four output blocks written
+	add	esi, byte 2*SIZEOF_MMWORD	; two input blocks consumed
+	jmp	short .expand_cols
+	alignx	16,7
+
+.row_done:
+	pop	edi				; restore the row-pointer cursors
+	pop	esi
+
+	add	edi, byte SIZEOF_JSAMPROW	; next output row pointer
+	add	esi, byte SIZEOF_JSAMPROW	; next input row pointer
+	dec	ecx				; one row produced
+	jg	short .each_row
+
+	emms		; leave the MMX state clean for FPU code
+
+.epilogue:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
+;                          JDIMENSION output_width,
+;                          JSAMPARRAY input_data,
+;                          JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define output_width(b)	(b)+12		; JDIMENSION output_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v2_upsample_mmx)
+
+EXTN(jsimd_h2v2_upsample_mmx):
+	push	ebp
+	mov	ebp, esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, JDIMENSION [output_width(ebp)]
+	add	edx, byte (2*SIZEOF_MMWORD)-1
+	and	edx, byte -(2*SIZEOF_MMWORD)	; edx = width rounded up to 2*SIZEOF_MMWORD
+	jz	near .epilogue			; rounded width is zero: nothing to write
+
+	mov	ecx, INT [max_v_samp(ebp)]	; ecx = output rows left
+	test	ecx, ecx
+	jz	short .epilogue			; no rows requested
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; esi -> input row pointers
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; edi -> output row pointers
+	alignx	16,7
+.each_row_pair:
+	push	esi				; keep the row-pointer cursors
+	push	edi
+
+	mov	eax, edx				; eax = output bytes left per row
+	mov	esi, JSAMPROW [esi]			; esi = input samples
+	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; ebx = first output row
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; edi = second output row
+	alignx	16,7
+.expand_cols:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]	; first input block
+
+	movq      mm1, mm0
+	punpckhbw mm1, mm1			; upper half, every sample doubled
+	punpcklbw mm0, mm0			; lower half, every sample doubled
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0	; identical data goes to both rows
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
+	movq	MMWORD [ebx+0*SIZEOF_MMWORD], mm0
+	movq	MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .pair_done		; rows finished after the first block
+
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; second input block
+
+	movq      mm3, mm2
+	punpckhbw mm3, mm3			; upper half, every sample doubled
+	punpcklbw mm2, mm2			; lower half, every sample doubled
+
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2	; identical data goes to both rows
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
+	movq	MMWORD [ebx+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .pair_done		; rows finished after the second block
+
+	add	edi, byte 4*SIZEOF_MMWORD	; second output row advances four blocks
+	add	ebx, byte 4*SIZEOF_MMWORD	; first output row advances four blocks
+	add	esi, byte 2*SIZEOF_MMWORD	; two input blocks consumed
+	jmp	short .expand_cols
+	alignx	16,7
+
+.pair_done:
+	pop	edi				; restore the row-pointer cursors
+	pop	esi
+
+	add	edi, byte 2*SIZEOF_JSAMPROW	; skip the two output rows just written
+	add	esi, byte 1*SIZEOF_JSAMPROW	; next input row pointer
+	sub	ecx, byte 2			; two output rows produced
+	jg	short .each_row_pair
+
+	emms		; leave the MMX state clean for FPU code
+
+.epilogue:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jdsamss2-64.asm b/jpeg/simd/jdsamss2-64.asm
new file mode 100644
index 000000000000..f36c15622581
--- /dev/null
+++ b/jpeg/simd/jdsamss2-64.asm
@@ -0,0 +1,671 @@
+;
+; jdsamss2-64.asm - upsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+; Word constants (8 x 16-bit each) used by the fancy upsamplers in this file.
+PW_ONE		times 8 dw  1	; h2v1: bias added before >>2 for even outputs
+PW_TWO		times 8 dw  2	; h2v1: bias added before >>2 for odd outputs
+PW_THREE	times 8 dw  3	; weight of the nearer sample
+PW_SEVEN	times 8 dw  7	; h2v2: bias added before >>4 for odd outputs
+PW_EIGHT	times 8 dw  8	; h2v2: bias added before >>4 for even outputs
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor           (32-bit value; upper half of r10 is not trusted)
+; r11 = JDIMENSION downsampled_width    (32-bit value; upper half of r11 is not trusted)
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+	push	rbp
+	mov	rax,rsp
+	mov	rbp,rsp
+	collect_args
+
+	mov	eax, r11d	; colctr (zero-extend: the ABI leaves bits 63:32 of a 32-bit arg undefined)
+	test	rax,rax
+	jz	near .return
+
+	movsxd	rcx, r10d	; rowctr (sign-extend the int so the signed "jg" below sees its real value)
+	test	rcx,rcx
+	jz	near .return
+
+	mov	rsi, r12	; input_data
+	mov	rdi, r13
+	mov	rdi, JSAMPARRAY [rdi]			; output_data
+.rowloop:
+	push	rax			; colctr
+	push	rdi			; output_data cursor
+	push	rsi			; input_data cursor
+
+	mov	rsi, JSAMPROW [rsi]	; inptr
+	mov	rdi, JSAMPROW [rdi]	; outptr
+
+	test	rax, SIZEOF_XMMWORD-1
+	jz	short .skip		; width is a whole number of blocks: no padding sample
+	mov	dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+.skip:
+	pxor	xmm0,xmm0		; xmm0=(all 0's)
+	pcmpeqb	xmm7,xmm7
+	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; mask keeping only byte 0
+	pand	xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; xmm7 = left neighbour (-1): sample 0 itself
+
+	add	rax, byte SIZEOF_XMMWORD-1
+	and	rax, byte -SIZEOF_XMMWORD	; rax = colctr rounded up to a whole block
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .columnloop
+
+.columnloop_last:
+	pcmpeqb	xmm6,xmm6
+	pslldq	xmm6,(SIZEOF_XMMWORD-1)	; mask keeping only byte 15
+	pand	xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; xmm6 = right neighbour (16): sample 15 itself
+	jmp	short .upsample
+
+.columnloop:
+	movdqa	xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	pslldq	xmm6,(SIZEOF_XMMWORD-1)	; xmm6 = right neighbour (16): next block's sample 0
+
+.upsample:
+	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2,xmm1
+	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
+	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
+	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
+
+	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
+	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
+
+	movdqa	xmm7,xmm1
+	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
+
+	movdqa    xmm4,xmm1
+	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm2
+	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
+	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
+	movdqa    xmm6,xmm3
+	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
+	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
+
+	pmullw	xmm1,[rel PW_THREE]	; 3 * centre sample
+	pmullw	xmm4,[rel PW_THREE]
+	paddw	xmm2,[rel PW_ONE]	; even outputs: left neighbour + 1
+	paddw	xmm5,[rel PW_ONE]
+	paddw	xmm3,[rel PW_TWO]	; odd outputs: right neighbour + 2
+	paddw	xmm6,[rel PW_TWO]
+
+	paddw	xmm2,xmm1
+	paddw	xmm5,xmm4
+	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+	paddw	xmm3,xmm1
+	paddw	xmm6,xmm4
+	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm3,BYTE_BIT		; move odd samples into the high byte of each word
+	psllw	xmm6,BYTE_BIT
+	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
+	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
+
+	sub	rax, byte SIZEOF_XMMWORD	; one input block consumed
+	add	rsi, byte 1*SIZEOF_XMMWORD	; inptr
+	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	near .columnloop		; at least two blocks left
+	test	rax,rax
+	jnz	near .columnloop_last		; exactly one block left
+
+	pop	rsi
+	pop	rdi
+	pop	rax
+
+	add	rsi, byte SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte SIZEOF_JSAMPROW	; output_data
+	dec	rcx				; rowctr
+	jg	near .rowloop
+
+.return:
+	uncollect_args
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor           (32-bit value; upper half of r10 is not trusted)
+; r11 = JDIMENSION downsampled_width    (32-bit value; upper half of r11 is not trusted)
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		4
+
+	align	16
+	global	EXTN(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax			; saved here, reloaded by "pop rsp" on exit
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]			; rsp = &wk[0], i.e. wk[WK_NUM] sits just below rbp
+	collect_args
+	push	rbx
+
+	mov	eax, r11d	; colctr (zero-extend: the ABI leaves bits 63:32 of a 32-bit arg undefined)
+	test	rax,rax
+	jz	near .return
+
+	movsxd	rcx, r10d	; rowctr (sign-extend the int so the signed "jg" below sees its real value)
+	test	rcx,rcx
+	jz	near .return
+
+	mov	rsi, r12	; input_data
+	mov	rdi, r13
+	mov	rdi, JSAMPARRAY [rdi]			; output_data
+.rowloop:
+	push	rax					; colctr
+	push	rcx					; rowctr (rcx is reused as a row pointer below)
+	push	rdi					; output_data cursor
+	push	rsi					; input_data cursor
+
+	mov	rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	test	rax, SIZEOF_XMMWORD-1
+	jz	short .skip				; width is a whole number of blocks: no padding sample
+	push	rdx					; dl is used as scratch; keep outptr0
+	mov	dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+	pop	rdx
+.skip:
+	; -- process the first column block
+
+	movdqa	xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
+	movdqa	xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
+	movdqa	xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
+
+	pxor      xmm3,xmm3		; xmm3=(all 0's)
+	movdqa    xmm4,xmm0
+	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm1
+	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm2
+	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm0,[rel PW_THREE]	; 3 * row[0], low half
+	pmullw	xmm4,[rel PW_THREE]	; 3 * row[0], high half
+
+	pcmpeqb	xmm7,xmm7
+	psrldq	xmm7,(SIZEOF_XMMWORD-2)	; xmm7 = mask keeping only the lowest word
+
+	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+	movdqa	XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
+	movdqa	XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
+
+	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
+	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
+
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0) = left neighbour (-1) for the upper row: word 0 itself
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1) = left neighbour (-1) for the lower row: word 0 itself
+
+	add	rax, byte SIZEOF_XMMWORD-1
+	and	rax, byte -SIZEOF_XMMWORD	; rax = colctr rounded up to a whole block
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .columnloop
+
+.columnloop_last:
+	; -- process the last column block
+
+	pcmpeqb	xmm1,xmm1
+	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1 = mask keeping only the highest word
+	movdqa	xmm2,xmm1
+
+	pand	xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+	pand	xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
+	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
+
+	jmp	near .upsample
+
+.columnloop:
+	; -- process the next column block
+
+	movdqa	xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
+	movdqa	xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
+	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
+
+	pxor      xmm3,xmm3		; xmm3=(all 0's)
+	movdqa    xmm4,xmm0
+	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm1
+	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm2
+	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm0,[rel PW_THREE]	; 3 * row[0], low half
+	pmullw	xmm4,[rel PW_THREE]	; 3 * row[0], high half
+
+	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+	movdqa	XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
+	movdqa	XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
+	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
+
+	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
+	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
+
+	movdqa	XMMWORD [wk(2)], xmm1	; wk(2) = right neighbour (16) for the upper row
+	movdqa	XMMWORD [wk(3)], xmm2	; wk(3) = right neighbour (16) for the lower row
+
+.upsample:
+	; -- process the upper row: even = (3*Int[i]+Int[i-1]+8)>>4, odd = (3*Int[i]+Int[i+1]+7)>>4
+
+	movdqa	xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+
+	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
+	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
+	movdqa	xmm5,xmm7
+	movdqa	xmm6,xmm3
+	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
+	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
+
+	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
+	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
+
+	movdqa	xmm1,xmm7
+	movdqa	xmm2,xmm3
+	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
+	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
+	movdqa	xmm4,xmm3
+	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
+
+	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
+	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
+
+	movdqa	XMMWORD [wk(0)], xmm4	; word 15 becomes the next block's left neighbour
+
+	pmullw	xmm7,[rel PW_THREE]
+	pmullw	xmm3,[rel PW_THREE]
+	paddw	xmm1,[rel PW_EIGHT]
+	paddw	xmm5,[rel PW_EIGHT]
+	paddw	xmm0,[rel PW_SEVEN]
+	paddw	xmm2,[rel PW_SEVEN]
+
+	paddw	xmm1,xmm7
+	paddw	xmm5,xmm3
+	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+	paddw	xmm0,xmm7
+	paddw	xmm2,xmm3
+	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm0,BYTE_BIT		; move odd samples into the high byte of each word
+	psllw	xmm2,BYTE_BIT
+	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1	; final samples overwrite the saved intermediates
+	movdqa	XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
+
+	; -- process the lower row (same filter, applied to Int1)
+
+	movdqa	xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
+	movdqa	xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
+	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
+	movdqa	xmm0,xmm6
+	movdqa	xmm2,xmm4
+	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
+	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
+
+	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
+	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
+
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,xmm4
+	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
+	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
+	movdqa	xmm3,xmm4
+	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
+
+	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
+	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
+
+	movdqa	XMMWORD [wk(1)], xmm3	; word 15 becomes the next block's left neighbour
+
+	pmullw	xmm6,[rel PW_THREE]
+	pmullw	xmm4,[rel PW_THREE]
+	paddw	xmm1,[rel PW_EIGHT]
+	paddw	xmm0,[rel PW_EIGHT]
+	paddw	xmm7,[rel PW_SEVEN]
+	paddw	xmm5,[rel PW_SEVEN]
+
+	paddw	xmm1,xmm6
+	paddw	xmm0,xmm4
+	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+	paddw	xmm7,xmm6
+	paddw	xmm5,xmm4
+	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm7,BYTE_BIT		; move odd samples into the high byte of each word
+	psllw	xmm5,BYTE_BIT
+	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1	; final samples overwrite the saved intermediates
+	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
+
+	sub	rax, byte SIZEOF_XMMWORD	; one input block consumed
+	add	rcx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
+	add	rbx, byte 1*SIZEOF_XMMWORD	; inptr0
+	add	rsi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
+	add	rdx, byte 2*SIZEOF_XMMWORD	; outptr0
+	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr1
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	near .columnloop		; at least two blocks left
+	test	rax,rax
+	jnz	near .columnloop_last		; exactly one block left
+
+	pop	rsi				; input_data cursor
+	pop	rdi				; output_data cursor
+	pop	rcx				; rowctr
+	pop	rax				; colctr
+
+	add	rsi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	rcx, byte 2			; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	rbx
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor      (32-bit value; upper half of r10 is not trusted)
+; r11 = JDIMENSION output_width    (32-bit value; upper half of r11 is not trusted)
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+	push	rbp
+	mov	rax,rsp
+	mov	rbp,rsp
+	collect_args
+
+	mov	edx, r11d	; zero-extend: the ABI leaves bits 63:32 of a 32-bit arg undefined
+	add	rdx, byte (2*SIZEOF_XMMWORD)-1
+	and	rdx, byte -(2*SIZEOF_XMMWORD)	; rdx = width rounded up to 2*SIZEOF_XMMWORD
+	jz	near .return
+
+	movsxd	rcx, r10d	; rowctr (sign-extend the int so the signed "jg" below sees its real value)
+	test	rcx,rcx
+	jz	short .return
+
+	mov	rsi, r12 ; input_data
+	mov	rdi, r13
+	mov	rdi, JSAMPARRAY [rdi]			; output_data
+.rowloop:
+	push	rdi				; output_data cursor
+	push	rsi				; input_data cursor
+
+	mov	rsi, JSAMPROW [rsi]		; inptr
+	mov	rdi, JSAMPROW [rdi]		; outptr
+	mov	rax,rdx				; colctr
+.columnloop:
+
+	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; first input block
+
+	movdqa    xmm1,xmm0
+	punpcklbw xmm0,xmm0			; lower half, every sample doubled
+	punpckhbw xmm1,xmm1			; upper half, every sample doubled
+
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+	sub	rax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow			; row finished after the first block
+
+	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]	; second input block
+
+	movdqa    xmm3,xmm2
+	punpcklbw xmm2,xmm2			; lower half, every sample doubled
+	punpckhbw xmm3,xmm3			; upper half, every sample doubled
+
+	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+	sub	rax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow			; row finished after the second block
+
+	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	rdi, byte 4*SIZEOF_XMMWORD	; outptr
+	jmp	short .columnloop
+
+.nextrow:
+	pop	rsi
+	pop	rdi
+
+	add	rsi, byte SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte SIZEOF_JSAMPROW	; output_data
+	dec	rcx				; rowctr
+	jg	short .rowloop
+
+.return:
+	uncollect_args
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor      (32-bit value; upper half of r10 is not trusted)
+; r11 = JDIMENSION output_width    (32-bit value; upper half of r11 is not trusted)
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+	push	rbp
+	mov	rax,rsp
+	mov	rbp,rsp
+	collect_args
+	push	rbx
+
+	mov	edx, r11d	; zero-extend: the ABI leaves bits 63:32 of a 32-bit arg undefined
+	add	rdx, byte (2*SIZEOF_XMMWORD)-1
+	and	rdx, byte -(2*SIZEOF_XMMWORD)	; rdx = width rounded up to 2*SIZEOF_XMMWORD
+	jz	near .return
+
+	movsxd	rcx, r10d	; rowctr (sign-extend the int so the signed "jg" below sees its real value)
+	test	rcx,rcx
+	jz	near .return
+
+	mov	rsi, r12	; input_data
+	mov	rdi, r13
+	mov	rdi, JSAMPARRAY [rdi]			; output_data
+.rowloop:
+	push	rdi					; output_data cursor
+	push	rsi					; input_data cursor
+
+	mov	rsi, JSAMPROW [rsi]			; inptr
+	mov	rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]	; outptr1
+	mov	rax,rdx					; colctr
+.columnloop:
+
+	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; first input block
+
+	movdqa    xmm1,xmm0
+	punpcklbw xmm0,xmm0			; lower half, every sample doubled
+	punpckhbw xmm1,xmm1			; upper half, every sample doubled
+
+	movdqa	XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0	; identical data goes to both output rows
+	movdqa	XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+	sub	rax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow			; rows finished after the first block
+
+	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]	; second input block
+
+	movdqa    xmm3,xmm2
+	punpcklbw xmm2,xmm2			; lower half, every sample doubled
+	punpckhbw xmm3,xmm3			; upper half, every sample doubled
+
+	movdqa	XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2	; identical data goes to both output rows
+	movdqa	XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+	sub	rax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow			; rows finished after the second block
+
+	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	rbx, byte 4*SIZEOF_XMMWORD	; outptr0
+	add	rdi, byte 4*SIZEOF_XMMWORD	; outptr1
+	jmp	short .columnloop
+
+.nextrow:
+	pop	rsi
+	pop	rdi
+
+	add	rsi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	rcx, byte 2			; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	rbx
+	uncollect_args
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jdsamss2.asm b/jpeg/simd/jdsamss2.asm
new file mode 100644
index 000000000000..b5c863b4632d
--- /dev/null
+++ b/jpeg/simd/jdsamss2.asm
@@ -0,0 +1,729 @@
+;
+; jdsamss2.asm - upsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE		times 8 dw  1	; h2v1: rounding bias added to the left-neighbour sum (before >>2)
+PW_TWO		times 8 dw  2	; h2v1: rounding bias added to the right-neighbour sum (before >>2)
+PW_THREE	times 8 dw  3	; weight applied to the current sample / current row
+PW_SEVEN	times 8 dw  7	; h2v2: rounding bias added to the right-neighbour sum (before >>4)
+PW_EIGHT	times 8 dw  8	; h2v2: rounding bias added to the left-neighbour sum (before >>4)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+	test	eax,eax
+	jz	near .return		; zero-width input: nothing to do
+
+	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
+	test	ecx,ecx
+	jz	near .return		; no rows requested
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax			; colctr
+	push	edi			; output_data cursor (edi is reused as outptr)
+	push	esi			; input_data cursor (esi is reused as inptr)
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr
+
+	test	eax, SIZEOF_XMMWORD-1	; is the width a whole number of xmmwords?
+	jz	short .skip		; yes: no padding sample needed
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]	; dl = last real sample of the row
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+.skip:
+	pxor	xmm0,xmm0		; xmm0=(all 0's)
+	pcmpeqb	xmm7,xmm7		; xmm7=(all 1's)
+	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; keep only byte 0 set
+	pand	xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm7=( 0 -- ... --): sample 0 stands in for sample -1
+
+	add	eax, byte SIZEOF_XMMWORD-1
+	and	eax, byte -SIZEOF_XMMWORD	; colctr rounded up to whole xmmwords
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .columnloop	; more than one block left
+	alignx	16,7
+
+.columnloop_last:
+	pcmpeqb	xmm6,xmm6		; xmm6=(all 1's)
+	pslldq	xmm6,(SIZEOF_XMMWORD-1)	; keep only byte 15 set
+	pand	xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm6=(-- ... -- 15): sample 15 stands in for sample 16
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]	; peek at the next block
+	pslldq	xmm6,(SIZEOF_XMMWORD-1)	; xmm6=(-- ... -- 16): its first sample moved to byte 15
+
+.upsample:
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2,xmm1
+	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
+	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
+	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
+
+	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
+	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
+
+	movdqa	xmm7,xmm1		; carry sample 15 over as the next block's sample -1
+	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
+
+	movdqa    xmm4,xmm1
+	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm2
+	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
+	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
+	movdqa    xmm6,xmm3
+	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
+	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
+
+	pmullw	xmm1,[GOTOFF(ebx,PW_THREE)]	; 3 * current sample (low half)
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]	; 3 * current sample (high half)
+	paddw	xmm2,[GOTOFF(ebx,PW_ONE)]	; left neighbour + 1
+	paddw	xmm5,[GOTOFF(ebx,PW_ONE)]
+	paddw	xmm3,[GOTOFF(ebx,PW_TWO)]	; right neighbour + 2
+	paddw	xmm6,[GOTOFF(ebx,PW_TWO)]
+
+	paddw	xmm2,xmm1
+	paddw	xmm5,xmm4
+	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+	paddw	xmm3,xmm1
+	paddw	xmm6,xmm4
+	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm3,BYTE_BIT		; odd outputs go to the high byte of each word
+	psllw	xmm6,BYTE_BIT
+	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
+	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+	sub	eax, byte SIZEOF_XMMWORD
+	add	esi, byte 1*SIZEOF_XMMWORD	; inptr
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	near .columnloop	; at least two blocks remain
+	test	eax,eax
+	jnz	near .columnloop_last	; exactly one block remains
+
+	pop	esi
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define original_ebp	ebp+0	; slot where the prologue stores the unaligned frame pointer
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		4	; wk(0)/wk(1): left-edge carries, wk(2)/wk(3): right-edge carries
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; guarantee room for the saved pointer after aligning
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; saved here; reloaded by "pop esp" in the epilogue
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve the wk[] scratch area below ebp
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	edx,eax				; edx = original ebp
+	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, INT [max_v_samp(edx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
+	mov	edi, POINTER [output_data_ptr(edx)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx					; rowctr (ecx is reused as a row pointer)
+	push	edi
+	push	esi
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	test	eax, SIZEOF_XMMWORD-1	; is the width a whole number of xmmwords?
+	jz	short .skip
+	push	edx			; dl is needed as a byte scratch register
+	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+	pop	edx
+.skip:
+	; -- process the first column block
+
+	movdqa	xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
+	movdqa	xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
+	movdqa	xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
+
+	pushpic	ebx			; NOTE(review): pushpic/movpic are defined elsewhere; presumably PIC-only
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      xmm3,xmm3		; xmm3=(all 0's)
+	movdqa    xmm4,xmm0
+	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm1
+	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm2
+	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]	; 3 * row[0] (low half)
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]	; 3 * row[0] (high half)
+
+	pcmpeqb	xmm7,xmm7
+	psrldq	xmm7,(SIZEOF_XMMWORD-2)	; mask keeping only word 0
+
+	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
+	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+
+	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
+	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
+
+	movdqa	XMMWORD [wk(0)], xmm1	; column 0 stands in for column -1 (upper row)
+	movdqa	XMMWORD [wk(1)], xmm2	; column 0 stands in for column -1 (lower row)
+
+	poppic	ebx
+
+	add	eax, byte SIZEOF_XMMWORD-1
+	and	eax, byte -SIZEOF_XMMWORD	; colctr rounded up to whole xmmwords
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .columnloop	; more than one block left
+	alignx	16,7
+
+.columnloop_last:
+	; -- process the last column block
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pcmpeqb	xmm1,xmm1
+	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; mask keeping only word 7
+	movdqa	xmm2,xmm1
+
+	pand	xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
+	pand	xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
+	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
+
+	jmp	near .upsample
+	alignx	16,7
+
+.columnloop:
+	; -- process the next column block
+
+	movdqa	xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
+	movdqa	xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
+	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      xmm3,xmm3		; xmm3=(all 0's)
+	movdqa    xmm4,xmm0
+	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm1
+	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm2
+	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+
+	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+	movdqa	XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
+	movdqa	XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
+	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+
+	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
+	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
+
+	movdqa	XMMWORD [wk(2)], xmm1	; next block's column 0 = this block's column 16 (upper row)
+	movdqa	XMMWORD [wk(3)], xmm2	; next block's column 0 = this block's column 16 (lower row)
+
+.upsample:
+	; -- process the upper row
+
+	movdqa	xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
+	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
+	movdqa	xmm5,xmm7
+	movdqa	xmm6,xmm3
+	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
+	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
+
+	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
+	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
+
+	movdqa	xmm1,xmm7
+	movdqa	xmm2,xmm3
+	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
+	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
+	movdqa	xmm4,xmm3
+	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
+
+	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
+	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
+
+	movdqa	XMMWORD [wk(0)], xmm4	; column 15 becomes column -1 of the next block
+
+	pmullw	xmm7,[GOTOFF(ebx,PW_THREE)]	; 3 * current column (low half)
+	pmullw	xmm3,[GOTOFF(ebx,PW_THREE)]	; 3 * current column (high half)
+	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]	; left neighbour + 8
+	paddw	xmm5,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm0,[GOTOFF(ebx,PW_SEVEN)]	; right neighbour + 7
+	paddw	xmm2,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	xmm1,xmm7
+	paddw	xmm5,xmm3
+	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+	paddw	xmm0,xmm7
+	paddw	xmm2,xmm3
+	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm0,BYTE_BIT		; odd outputs go to the high byte of each word
+	psllw	xmm2,BYTE_BIT
+	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1	; final samples replace the intermediates
+	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+	; -- process the lower row
+
+	movdqa	xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
+	movdqa	xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
+	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
+	movdqa	xmm0,xmm6
+	movdqa	xmm2,xmm4
+	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
+	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
+
+	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
+	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
+
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,xmm4
+	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
+	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
+	movdqa	xmm3,xmm4
+	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
+
+	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
+	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
+
+	movdqa	XMMWORD [wk(1)], xmm3	; column 15 becomes column -1 of the next block
+
+	pmullw	xmm6,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm0,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm7,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	xmm5,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	xmm1,xmm6
+	paddw	xmm0,xmm4
+	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+	paddw	xmm7,xmm6
+	paddw	xmm5,xmm4
+	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm7,BYTE_BIT
+	psllw	xmm5,BYTE_BIT
+	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+	poppic	ebx
+
+	sub	eax, byte SIZEOF_XMMWORD
+	add	ecx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_XMMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
+	add	edx, byte 2*SIZEOF_XMMWORD	; outptr0
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr1
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	near .columnloop	; at least two blocks remain
+	test	eax,eax
+	jnz	near .columnloop_last	; exactly one block remains
+
+	pop	esi
+	pop	edi
+	pop	ecx
+	pop	eax
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define output_width(b)	(b)+12		; JDIMENSION output_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+	push	ebp
+	mov	ebp, esp
+;	Register roles: esi = input row list / inptr, edi = output row list / outptr,
+;	edx = padded output width, ecx = rows left, eax = columns left in this row.
+;	Only esi and edi are saved: ebx is unused and ecx/edx need not be preserved.
+	push	esi
+	push	edi
+
+	mov	edx, JDIMENSION [output_width(ebp)]
+	add	edx, byte (2*SIZEOF_XMMWORD)-1
+	and	edx, byte -(2*SIZEOF_XMMWORD)	; width rounded up to 2 xmmwords
+	jz	short .done			; zero width: nothing to emit
+
+	mov	ecx, INT [max_v_samp(ebp)]	; rows to produce
+	test	ecx, ecx
+	jz	short .done
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; -> first input row pointer
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; -> first output row pointer
+	alignx	16,7
+.each_row:
+	push	edi			; keep the row-list cursors while
+	push	esi			; esi/edi walk the samples
+
+	mov	eax, edx			; columns left in this row
+	mov	edi, JSAMPROW [edi]		; outptr
+	mov	esi, JSAMPROW [esi]		; inptr
+	alignx	16,7
+.each_block:
+
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]	; one xmmword of input
+
+	movdqa    xmm1, xmm0
+	punpcklbw xmm0, xmm0		; low half, every byte doubled
+	punpckhbw xmm1, xmm1		; high half, every byte doubled
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+	sub	eax, byte 2*SIZEOF_XMMWORD	; two xmmwords of output written
+	jz	short .row_done
+
+	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; following xmmword of input
+
+	movdqa    xmm3, xmm2
+	punpcklbw xmm2, xmm2
+	punpckhbw xmm3, xmm3
+
+	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+	sub	eax, byte 2*SIZEOF_XMMWORD
+	jz	short .row_done
+
+	add	edi, byte 4*SIZEOF_XMMWORD	; advance outptr past what was written
+	add	esi, byte 2*SIZEOF_XMMWORD	; advance inptr past what was read
+	jmp	short .each_block
+	alignx	16,7
+
+.row_done:
+	pop	esi
+	pop	edi
+
+	add	edi, byte SIZEOF_JSAMPROW	; next output row pointer
+	add	esi, byte SIZEOF_JSAMPROW	; next input row pointer
+	dec	ecx				; one row finished
+	jg	short .each_row
+
+.done:
+	pop	edi
+	pop	esi
+;	Nothing else to restore: ecx and edx need not be preserved and ebx
+;	was never used, matching the pushes skipped in the prologue.
+;
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define output_width(b)	(b)+12		; JDIMENSION output_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+	push	ebp
+	mov	ebp, esp
+	push	ebx			; ebx becomes outptr0 in the block loop
+;	Register roles: esi = input row list / inptr, edi = output row list / outptr1,
+;	edx = padded width, ecx = rows left, eax = columns left (ecx/edx need no saving).
+	push	esi
+	push	edi
+
+	mov	edx, JDIMENSION [output_width(ebp)]
+	add	edx, byte (2*SIZEOF_XMMWORD)-1
+	and	edx, byte -(2*SIZEOF_XMMWORD)	; width rounded up to 2 xmmwords
+	jz	near .done			; zero width: nothing to emit
+
+	mov	ecx, INT [max_v_samp(ebp)]	; output rows to produce
+	test	ecx, ecx
+	jz	near .done
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; -> first input row pointer
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; -> first output row pointer
+	alignx	16,7
+.each_row_pair:
+	push	edi			; keep the row-list cursors while
+	push	esi			; esi/edi walk the samples
+
+	mov	eax, edx				; columns left in this row
+	mov	esi, JSAMPROW [esi]			; inptr
+	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0 (must be read before edi is replaced)
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+	alignx	16,7
+.each_block:
+
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]	; one xmmword of input
+
+	movdqa    xmm1, xmm0
+	punpcklbw xmm0, xmm0		; low half, every byte doubled
+	punpckhbw xmm1, xmm1		; high half, every byte doubled
+
+	movdqa	XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0	; first output row
+	movdqa	XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0	; identical copy in the second output row
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+	sub	eax, byte 2*SIZEOF_XMMWORD	; two xmmwords of output written per row
+	jz	short .pair_done
+
+	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; following xmmword of input
+
+	movdqa    xmm3, xmm2
+	punpcklbw xmm2, xmm2
+	punpckhbw xmm3, xmm3
+
+	movdqa	XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+	sub	eax, byte 2*SIZEOF_XMMWORD
+	jz	short .pair_done
+
+	add	ebx, byte 4*SIZEOF_XMMWORD	; advance outptr0
+	add	edi, byte 4*SIZEOF_XMMWORD	; advance outptr1
+	add	esi, byte 2*SIZEOF_XMMWORD	; advance inptr
+	jmp	short .each_block
+	alignx	16,7
+
+.pair_done:
+	pop	esi
+	pop	edi
+
+	add	edi, byte 2*SIZEOF_JSAMPROW	; two output row pointers consumed
+	add	esi, byte 1*SIZEOF_JSAMPROW	; one input row pointer consumed
+	sub	ecx, byte 2			; two output rows finished
+	jg	short .each_row_pair
+
+.done:
+	pop	edi
+	pop	esi
+;	ecx and edx need not be preserved, so only the registers pushed in
+;	the prologue are restored.
+	pop	ebx
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jf3dnflt.asm b/jpeg/simd/jf3dnflt.asm
new file mode 100644
index 000000000000..542672dc5498
--- /dev/null
+++ b/jpeg/simd/jf3dnflt.asm
@@ -0,0 +1,320 @@
+;
+; jf3dnflt.asm - floating-point FDCT (3DNow!)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_float_3dnow)
+
+EXTN(jconst_fdct_float_3dnow):	; every entry is a pair of identical single-precision floats
+
+PD_0_382	times 2 dd  0.382683432365089771728460	; multiplier that yields z5
+PD_0_707	times 2 dd  0.707106781186547524400844	; multiplier that yields z1 and z3
+PD_0_541	times 2 dd  0.541196100146196984399723	; tmp10 multiplier in z2
+PD_1_306	times 2 dd  1.306562964876376527856643	; tmp12 multiplier in z4
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
+;
+
+%define data(b)		(b)+8		; FAST_FLOAT * data
+
+%define original_ebp	ebp+0	; slot where the prologue stores the unaligned frame pointer
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2	; wk(0)=tmp6 and wk(1)=tmp7 spill slots
+
+	align	16
+	global	EXTN(jsimd_fdct_float_3dnow)
+
+EXTN(jsimd_fdct_float_3dnow):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; guarantee room for the saved pointer after aligning
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; saved here; reloaded by "pop esp" in the epilogue
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve the wk[] scratch area below ebp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2			; two rows are handled per iteration
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+
+	movq      mm4,mm0		; transpose coefficients
+	punpckldq mm0,mm1		; mm0=(00 10)=data0
+	punpckhdq mm4,mm1		; mm4=(01 11)=data1
+	movq      mm5,mm2		; transpose coefficients
+	punpckldq mm2,mm3		; mm2=(06 16)=data6
+	punpckhdq mm5,mm3		; mm5=(07 17)=data7
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
+	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
+	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
+	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
+
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
+
+	movq      mm4,mm1		; transpose coefficients
+	punpckldq mm1,mm3		; mm1=(02 12)=data2
+	punpckhdq mm4,mm3		; mm4=(03 13)=data3
+	movq      mm0,mm2		; transpose coefficients
+	punpckldq mm2,mm5		; mm2=(04 14)=data4
+	punpckhdq mm0,mm5		; mm0=(05 15)=data5
+
+	movq	mm3,mm4
+	movq	mm5,mm1
+	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
+	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
+	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
+	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm2,mm7
+	movq	mm0,mm6
+	pfsub	mm7,mm4			; mm7=tmp13
+	pfsub	mm6,mm1			; mm6=tmp12
+	pfadd	mm2,mm4			; mm2=tmp10
+	pfadd	mm0,mm1			; mm0=tmp11
+
+	pfadd	mm6,mm7			; mm6=tmp12+tmp13
+	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+	movq	mm4,mm2
+	movq	mm1,mm7
+	pfsub	mm2,mm0			; mm2=data4
+	pfsub	mm7,mm6			; mm7=data6
+	pfadd	mm4,mm0			; mm4=data0
+	pfadd	mm1,mm6			; mm1=data2
+
+	movq	MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
+
+	pfadd	mm3,mm5			; mm3=tmp10
+	pfadd	mm5,mm0			; mm5=tmp11
+	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
+
+	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+	movq	mm2,mm3			; mm2=tmp10
+	pfsub	mm3,mm0			; mm3=tmp10-tmp12
+	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
+	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+	pfadd	mm2,mm3			; mm2=z2
+	pfadd	mm0,mm3			; mm0=z4
+
+	movq	mm7,mm6
+	pfsub	mm6,mm5			; mm6=z13
+	pfadd	mm7,mm5			; mm7=z11
+
+	movq	mm4,mm6
+	movq	mm1,mm7
+	pfsub	mm6,mm2			; mm6=data3
+	pfsub	mm7,mm0			; mm7=data7
+	pfadd	mm4,mm2			; mm4=data5
+	pfadd	mm1,mm0			; mm1=data1
+
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	add	edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; step to the next pair of rows
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2			; two columns are handled per iteration
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+
+	movq      mm4,mm0		; transpose coefficients
+	punpckldq mm0,mm1		; mm0=(00 01)=data0
+	punpckhdq mm4,mm1		; mm4=(10 11)=data1
+	movq      mm5,mm2		; transpose coefficients
+	punpckldq mm2,mm3		; mm2=(60 61)=data6
+	punpckhdq mm5,mm3		; mm5=(70 71)=data7
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
+	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
+	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
+	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
+
+	movq	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
+
+	movq      mm4,mm1		; transpose coefficients
+	punpckldq mm1,mm3		; mm1=(20 21)=data2
+	punpckhdq mm4,mm3		; mm4=(30 31)=data3
+	movq      mm0,mm2		; transpose coefficients
+	punpckldq mm2,mm5		; mm2=(40 41)=data4
+	punpckhdq mm0,mm5		; mm0=(50 51)=data5
+
+	movq	mm3,mm4
+	movq	mm5,mm1
+	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
+	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
+	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
+	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm2,mm7
+	movq	mm0,mm6
+	pfsub	mm7,mm4			; mm7=tmp13
+	pfsub	mm6,mm1			; mm6=tmp12
+	pfadd	mm2,mm4			; mm2=tmp10
+	pfadd	mm0,mm1			; mm0=tmp11
+
+	pfadd	mm6,mm7			; mm6=tmp12+tmp13
+	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+	movq	mm4,mm2
+	movq	mm1,mm7
+	pfsub	mm2,mm0			; mm2=data4
+	pfsub	mm7,mm6			; mm7=data6
+	pfadd	mm4,mm0			; mm4=data0
+	pfadd	mm1,mm6			; mm1=data2
+
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
+
+	pfadd	mm3,mm5			; mm3=tmp10
+	pfadd	mm5,mm0			; mm5=tmp11
+	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
+
+	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+	movq	mm2,mm3			; mm2=tmp10
+	pfsub	mm3,mm0			; mm3=tmp10-tmp12
+	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
+	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+	pfadd	mm2,mm3			; mm2=z2
+	pfadd	mm0,mm3			; mm0=z4
+
+	movq	mm7,mm6
+	pfsub	mm6,mm5			; mm6=z13
+	pfadd	mm7,mm5			; mm7=z11
+
+	movq	mm4,mm6
+	movq	mm1,mm7
+	pfsub	mm6,mm2			; mm6=data3
+	pfsub	mm7,mm0			; mm7=data7
+	pfadd	mm4,mm2			; mm4=data5
+	pfadd	mm1,mm0			; mm1=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	add	edx, byte 2*SIZEOF_FAST_FLOAT	; step to the next pair of columns
+	dec	ecx
+	jnz	near .columnloop
+
+	femms		; empty MMX/3DNow! state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jfmmxfst.asm b/jpeg/simd/jfmmxfst.asm
new file mode 100644
index 000000000000..0647242a92a8
--- /dev/null
+++ b/jpeg/simd/jfmmxfst.asm
@@ -0,0 +1,397 @@
+;
+; jfmmxfst.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; fraction bits of the F_* constants; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; x >> n after adding half an LSB
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433), given as value * 2^30
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100), given as value * 2^30
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781), given as value * 2^30
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965), given as value * 2^30
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2	; left shift applied to the data before pmulhw
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)	; left shift applied to F_*
+
+	alignz	16
+	global	EXTN(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+
+PW_F0707	times 4 dw  F_0_707 << CONST_SHIFT	; 4 words; multiplier for z1 and z3
+PW_F0382	times 4 dw  F_0_382 << CONST_SHIFT	; 4 words; multiplier for z5
+PW_F0541	times 4 dw  F_0_541 << CONST_SHIFT	; 4 words; multiplier for tmp10 (odd part)
+PW_F1306	times 4 dw  F_1_306 << CONST_SHIFT	; 4 words; multiplier for tmp12 (odd part)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples, in place: pass 1
+; transforms the rows and pass 2 the columns, both writing back to data.
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx (DCTELEM * data)
+; Uses eax, ecx, edx, mm0-mm7 and the wk[] stack scratch; ends with emms.
+
+%define data(b)		(b)+8		; DCTELEM * data (b = esp right after the push of ebp)
+
+%define original_ebp	ebp+0	; slot where the prologue stores eax
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2	; number of MMWORD scratch slots below ebp
+
+	align	16
+	global	EXTN(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+	push	ebp
+	mov	eax,esp				; eax = esp after the push; [eax] = caller's ebp
+	sub	esp, byte 4			; leave room for the eax save slot
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; keep the unaligned stack pointer for the epilogue
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[0..WK_NUM-1] below ebp
+	pushpic	ebx				; macro defined outside this file; paired with poppic below
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; edx = data (DCTELEM *)
+	mov	ecx, DCTSIZE/4			; loop count for .rowloop
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(20 21 22 23), mm2=(24 25 26 27)
+	; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
+	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 01 02 03), mm1=(04 05 06 07)
+	; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
+	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
+	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
+	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
+
+	movq	mm0,mm7			; mm0=data1
+	movq	mm5,mm6			; mm5=data0
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
+	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
+	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
+
+	movq	mm2,mm7			; mm2=data3
+	movq	mm3,mm4			; mm3=data2
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5			; mm1=tmp0
+	movq	mm6,mm0			; mm6=tmp1
+	psubw	mm5,mm7			; mm5=tmp13
+	psubw	mm0,mm4			; mm0=tmp12
+	paddw	mm1,mm7			; mm1=tmp10
+	paddw	mm6,mm4			; mm6=tmp11
+
+	paddw	mm0,mm5			; mm0=tmp12+tmp13
+	psllw	mm0,PRE_MULTIPLY_SCALE_BITS	; pre-scale so pmulhw keeps the useful bits
+	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+	movq	mm7,mm1			; mm7=tmp10
+	movq	mm4,mm5			; mm4=tmp13
+	psubw	mm1,mm6			; mm1=data4
+	psubw	mm5,mm0			; mm5=data6
+	paddw	mm7,mm6			; mm7=data0
+	paddw	mm4,mm0			; mm4=data2
+
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1	; store data4
+	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5	; store data6
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7	; store data0
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4	; store data2
+
+	; -- Odd part
+
+	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
+
+	paddw	mm2,mm3			; mm2=tmp10
+	paddw	mm3,mm6			; mm3=tmp11
+	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
+
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp10 for pmulhw
+	psllw	mm6,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp12 for pmulhw
+
+	psllw	mm3,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp11 for pmulhw
+	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+	movq	mm1,mm2			; mm1=tmp10
+	psubw	mm2,mm6			; mm2=tmp10-tmp12 (both pre-scaled)
+	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+	paddw	mm1,mm2			; mm1=z2
+	paddw	mm6,mm2			; mm6=z4
+
+	movq	mm5,mm0			; mm5=tmp7
+	psubw	mm0,mm3			; mm0=z13
+	paddw	mm5,mm3			; mm5=z11
+
+	movq	mm7,mm0			; mm7=z13
+	movq	mm4,mm5			; mm4=z11
+	psubw	mm0,mm1			; mm0=data3
+	psubw	mm5,mm6			; mm5=data7
+	paddw	mm7,mm1			; mm7=data5
+	paddw	mm4,mm6			; mm4=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0	; store data3
+	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5	; store data7
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7	; store data5
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4	; store data1
+
+	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM	; advance edx by 4*DCTSIZE elements
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; edx = data again (DCTELEM *)
+	mov	ecx, DCTSIZE/4			; loop count for .columnloop
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(02 12 22 32), mm2=(42 52 62 72)
+	; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
+	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
+	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 10 20 30), mm1=(40 50 60 70)
+	; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
+	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
+	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
+	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
+
+	movq	mm0,mm7			; mm0=data1
+	movq	mm5,mm6			; mm5=data0
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
+	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
+	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
+
+	movq	mm2,mm7			; mm2=data3
+	movq	mm3,mm4			; mm3=data2
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5			; mm1=tmp0
+	movq	mm6,mm0			; mm6=tmp1
+	psubw	mm5,mm7			; mm5=tmp13
+	psubw	mm0,mm4			; mm0=tmp12
+	paddw	mm1,mm7			; mm1=tmp10
+	paddw	mm6,mm4			; mm6=tmp11
+
+	paddw	mm0,mm5			; mm0=tmp12+tmp13
+	psllw	mm0,PRE_MULTIPLY_SCALE_BITS	; pre-scale so pmulhw keeps the useful bits
+	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+	movq	mm7,mm1			; mm7=tmp10
+	movq	mm4,mm5			; mm4=tmp13
+	psubw	mm1,mm6			; mm1=data4
+	psubw	mm5,mm0			; mm5=data6
+	paddw	mm7,mm6			; mm7=data0
+	paddw	mm4,mm0			; mm4=data2
+
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1	; store data4
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5	; store data6
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7	; store data0
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4	; store data2
+
+	; -- Odd part
+
+	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
+
+	paddw	mm2,mm3			; mm2=tmp10
+	paddw	mm3,mm6			; mm3=tmp11
+	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
+
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp10 for pmulhw
+	psllw	mm6,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp12 for pmulhw
+
+	psllw	mm3,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp11 for pmulhw
+	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+	movq	mm1,mm2			; mm1=tmp10
+	psubw	mm2,mm6			; mm2=tmp10-tmp12 (both pre-scaled)
+	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+	paddw	mm1,mm2			; mm1=z2
+	paddw	mm6,mm2			; mm6=z4
+
+	movq	mm5,mm0			; mm5=tmp7
+	psubw	mm0,mm3			; mm0=z13
+	paddw	mm5,mm3			; mm5=z11
+
+	movq	mm7,mm0			; mm7=z13
+	movq	mm4,mm5			; mm4=z11
+	psubw	mm0,mm1			; mm0=data3
+	psubw	mm5,mm6			; mm5=data7
+	paddw	mm7,mm1			; mm7=data5
+	paddw	mm4,mm6			; mm4=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0	; store data3
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5	; store data7
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7	; store data5
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4	; store data1
+
+	add	edx, byte 4*SIZEOF_DCTELEM	; advance edx by 4 elements
+	dec	ecx
+	jnz	near .columnloop
+
+	emms		; empty MMX state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx		; counterpart of pushpic in the prologue
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- value the prologue stored at [ebp] (points at saved ebp)
+	pop	ebp		; restore the caller's ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jfmmxint.asm b/jpeg/simd/jfmmxint.asm
new file mode 100644
index 000000000000..a7e73f73a204
--- /dev/null
+++ b/jpeg/simd/jfmmxint.asm
@@ -0,0 +1,622 @@
+;
+; jfmmxint.asm - accurate integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13	; fraction bits of the F_* constants
+%define PASS1_BITS	2	; pass 1 leaves its results scaled up by this many bits
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)	; right shift used in pass 1
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS)	; right shift used in pass 2
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; x >> n after adding half an LSB
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx):
+
+PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541	; pmaddwd pair (tmp13,tmp12) -> data2
+PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)	; pmaddwd pair (tmp13,tmp12) -> data6
+PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175	; pmaddwd pair (z3,z4) -> z3
+PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)	; pmaddwd pair (z3,z4) -> z4
+PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899	; pmaddwd pair (tmp4,tmp7) -> tmp4
+PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)	; pmaddwd pair (tmp4,tmp7) -> tmp7
+PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562	; pmaddwd pair (tmp5,tmp6) -> tmp5
+PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)	; pmaddwd pair (tmp5,tmp6) -> tmp6
+PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)	; rounding bias added before psrad DESCALE_P1
+PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)	; rounding bias added before psrad DESCALE_P2
+PW_DESCALE_P2X	times 4 dw  1 << (PASS1_BITS-1)	; rounding bias added before psraw PASS1_BITS
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples, in place. Pass 1 (rows)
+; leaves results scaled up by PASS1_BITS; pass 2 (columns) removes it.
+; GLOBAL(void)
+; jsimd_fdct_islow_mmx (DCTELEM * data)
+; Uses eax, ecx, edx, mm0-mm7 and the wk[] stack scratch; ends with emms.
+
+%define data(b)		(b)+8		; DCTELEM * data (b = esp right after the push of ebp)
+
+%define original_ebp	ebp+0	; slot where the prologue stores eax
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2	; number of MMWORD scratch slots below ebp
+
+	align	16
+	global	EXTN(jsimd_fdct_islow_mmx)
+
+EXTN(jsimd_fdct_islow_mmx):
+	push	ebp
+	mov	eax,esp				; eax = esp after the push; [eax] = caller's ebp
+	sub	esp, byte 4			; leave room for the eax save slot
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; keep the unaligned stack pointer for the epilogue
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[0..WK_NUM-1] below ebp
+	pushpic	ebx				; macro defined outside this file; paired with poppic below
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; edx = data (DCTELEM *)
+	mov	ecx, DCTSIZE/4			; loop count for .rowloop
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(20 21 22 23), mm2=(24 25 26 27)
+	; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
+	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 01 02 03), mm1=(04 05 06 07)
+	; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
+	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
+	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
+	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
+
+	movq	mm0,mm7			; mm0=data1
+	movq	mm5,mm6			; mm5=data0
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
+	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
+	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
+
+	movq	mm2,mm7			; mm2=data3
+	movq	mm3,mm4			; mm3=data2
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5			; mm1=tmp0
+	movq	mm6,mm0			; mm6=tmp1
+	paddw	mm5,mm7			; mm5=tmp10
+	paddw	mm0,mm4			; mm0=tmp11
+	psubw	mm1,mm7			; mm1=tmp13
+	psubw	mm6,mm4			; mm6=tmp12
+
+	movq	mm7,mm5			; mm7=tmp10
+	paddw	mm5,mm0			; mm5=tmp10+tmp11
+	psubw	mm7,mm0			; mm7=tmp10-tmp11
+
+	psllw	mm5,PASS1_BITS		; mm5=data0
+	psllw	mm7,PASS1_BITS		; mm7=data4
+
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5	; store data0
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7	; store data4
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movq      mm4,mm1		; mm1=tmp13
+	movq      mm0,mm1
+	punpcklwd mm4,mm6		; mm6=tmp12
+	punpckhwd mm0,mm6		; (tmp13,tmp12) word pairs, low in mm4 / high in mm0
+	movq      mm1,mm4		; copy the low pairs for data6
+	movq      mm6,mm0		; copy the high pairs for data6
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]	; add rounding bias
+	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm4,DESCALE_P1			; descale the 32-bit products
+	psrad	mm0,DESCALE_P1
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm1,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm4,mm0		; mm4=data2
+	packssdw  mm1,mm6		; mm1=data6
+
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4	; store data2
+	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1	; store data6
+
+	; -- Odd part
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
+
+	movq	mm0,mm2			; mm2=tmp4
+	movq	mm6,mm3			; mm3=tmp5
+	paddw	mm0,mm5			; mm0=z3
+	paddw	mm6,mm7			; mm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm4,mm0
+	movq      mm1,mm0
+	punpcklwd mm4,mm6		; (z3,z4) word pairs, low half
+	punpckhwd mm1,mm6		; (z3,z4) word pairs, high half
+	movq      mm0,mm4		; copy the low pairs for z4
+	movq      mm6,mm1		; copy the high pairs for z4
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
+	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movq      mm4,mm2
+	movq      mm1,mm2
+	punpcklwd mm4,mm7		; (tmp4,tmp7) word pairs, low half
+	punpckhwd mm1,mm7		; (tmp4,tmp7) word pairs, high half
+	movq      mm2,mm4		; copy the low pairs for tmp7
+	movq      mm7,mm1		; copy the high pairs for tmp7
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
+
+	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
+	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
+	paddd	mm2,mm0			; mm2=data1L
+	paddd	mm7,mm6			; mm7=data1H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]	; add rounding bias
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm4,DESCALE_P1			; descale the 32-bit sums
+	psrad	mm1,DESCALE_P1
+	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm2,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+
+	packssdw  mm4,mm1		; mm4=data7
+	packssdw  mm2,mm7		; mm2=data1
+
+	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4	; store data7
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2	; store data1
+
+	movq      mm1,mm3
+	movq      mm7,mm3
+	punpcklwd mm1,mm5		; (tmp5,tmp6) word pairs, low half
+	punpckhwd mm7,mm5		; (tmp5,tmp6) word pairs, high half
+	movq      mm3,mm1		; copy the low pairs for tmp6
+	movq      mm5,mm7		; copy the high pairs for tmp6
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
+
+	paddd	mm1,mm0			; mm1=data5L
+	paddd	mm7,mm6			; mm7=data5H
+	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
+	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
+
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; add rounding bias
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm1,DESCALE_P1			; descale the 32-bit sums
+	psrad	mm7,DESCALE_P1
+	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm3,DESCALE_P1
+	psrad	mm5,DESCALE_P1
+
+	packssdw  mm1,mm7		; mm1=data5
+	packssdw  mm3,mm5		; mm3=data3
+
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1	; store data5
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3	; store data3
+
+	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM	; advance edx by 4*DCTSIZE elements
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; edx = data again (DCTELEM *)
+	mov	ecx, DCTSIZE/4			; loop count for .columnloop
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(02 12 22 32), mm2=(42 52 62 72)
+	; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
+	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
+	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 10 20 30), mm1=(40 50 60 70)
+	; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
+	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
+	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
+	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
+
+	movq	mm0,mm7			; mm0=data1
+	movq	mm5,mm6			; mm5=data0
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
+	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
+	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
+
+	movq	mm2,mm7			; mm2=data3
+	movq	mm3,mm4			; mm3=data2
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5			; mm1=tmp0
+	movq	mm6,mm0			; mm6=tmp1
+	paddw	mm5,mm7			; mm5=tmp10
+	paddw	mm0,mm4			; mm0=tmp11
+	psubw	mm1,mm7			; mm1=tmp13
+	psubw	mm6,mm4			; mm6=tmp12
+
+	movq	mm7,mm5			; mm7=tmp10
+	paddw	mm5,mm0			; mm5=tmp10+tmp11
+	psubw	mm7,mm0			; mm7=tmp10-tmp11
+
+	paddw	mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]	; add rounding bias
+	paddw	mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]	; add rounding bias
+	psraw	mm5,PASS1_BITS		; mm5=data0
+	psraw	mm7,PASS1_BITS		; mm7=data4
+
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5	; store data0
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7	; store data4
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movq      mm4,mm1		; mm1=tmp13
+	movq      mm0,mm1
+	punpcklwd mm4,mm6		; mm6=tmp12
+	punpckhwd mm0,mm6		; (tmp13,tmp12) word pairs, low in mm4 / high in mm0
+	movq      mm1,mm4		; copy the low pairs for data6
+	movq      mm6,mm0		; copy the high pairs for data6
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]	; add rounding bias
+	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm4,DESCALE_P2			; descale the 32-bit products
+	psrad	mm0,DESCALE_P2
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm1,DESCALE_P2
+	psrad	mm6,DESCALE_P2
+
+	packssdw  mm4,mm0		; mm4=data2
+	packssdw  mm1,mm6		; mm1=data6
+
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4	; store data2
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1	; store data6
+
+	; -- Odd part
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
+
+	movq	mm0,mm2			; mm2=tmp4
+	movq	mm6,mm3			; mm3=tmp5
+	paddw	mm0,mm5			; mm0=z3
+	paddw	mm6,mm7			; mm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm4,mm0
+	movq      mm1,mm0
+	punpcklwd mm4,mm6		; (z3,z4) word pairs, low half
+	punpckhwd mm1,mm6		; (z3,z4) word pairs, high half
+	movq      mm0,mm4		; copy the low pairs for z4
+	movq      mm6,mm1		; copy the high pairs for z4
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
+	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movq      mm4,mm2
+	movq      mm1,mm2
+	punpcklwd mm4,mm7		; (tmp4,tmp7) word pairs, low half
+	punpckhwd mm1,mm7		; (tmp4,tmp7) word pairs, high half
+	movq      mm2,mm4		; copy the low pairs for tmp7
+	movq      mm7,mm1		; copy the high pairs for tmp7
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
+
+	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
+	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
+	paddd	mm2,mm0			; mm2=data1L
+	paddd	mm7,mm6			; mm7=data1H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]	; add rounding bias
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm4,DESCALE_P2			; descale the 32-bit sums
+	psrad	mm1,DESCALE_P2
+	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm2,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+
+	packssdw  mm4,mm1		; mm4=data7
+	packssdw  mm2,mm7		; mm2=data1
+
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4	; store data7
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2	; store data1
+
+	movq      mm1,mm3
+	movq      mm7,mm3
+	punpcklwd mm1,mm5		; (tmp5,tmp6) word pairs, low half
+	punpckhwd mm7,mm5		; (tmp5,tmp6) word pairs, high half
+	movq      mm3,mm1		; copy the low pairs for tmp6
+	movq      mm5,mm7		; copy the high pairs for tmp6
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
+
+	paddd	mm1,mm0			; mm1=data5L
+	paddd	mm7,mm6			; mm7=data5H
+	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
+	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
+
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; add rounding bias
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm1,DESCALE_P2			; descale the 32-bit sums
+	psrad	mm7,DESCALE_P2
+	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm3,DESCALE_P2
+	psrad	mm5,DESCALE_P2
+
+	packssdw  mm1,mm7		; mm1=data5
+	packssdw  mm3,mm5		; mm3=data3
+
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1	; store data5
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3	; store data3
+
+	add	edx, byte 4*SIZEOF_DCTELEM	; advance edx by 4 elements
+	dec	ecx
+	jnz	near .columnloop
+
+	emms		; empty MMX state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx		; counterpart of pushpic in the prologue
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- value the prologue stored at [ebp] (points at saved ebp)
+	pop	ebp		; restore the caller's ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jfss2fst-64.asm b/jpeg/simd/jfss2fst-64.asm
new file mode 100644
index 000000000000..6953caf3b4f3
--- /dev/null
+++ b/jpeg/simd/jfss2fst-64.asm
@@ -0,0 +1,392 @@
+;
+; jfss2fst-64.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; number of fraction bits in the F_* constants below; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so the literals below are FIX(x) pre-scaled by 2^30; DESCALE rounds them to CONST_BITS fraction bits.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow); the code shifts each pmulhw input left by this amount.
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw), so the high word of the product is (x*F_n)>>CONST_BITS.
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)	; = 6 when CONST_BITS == 8
+
+	alignz	16
+	global	EXTN(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707	times 8 dw  F_0_707 << CONST_SHIFT	; 8 identical words (one xmmword)
+PW_F0382	times 8 dw  F_0_382 << CONST_SHIFT	; 8 identical words (one xmmword)
+PW_F0541	times 8 dw  F_0_541 << CONST_SHIFT	; 8 identical words (one xmmword)
+PW_F1306	times 8 dw  F_1_306 << CONST_SHIFT	; 8 identical words (one xmmword)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform the forward DCT on one block of samples (fast, less accurate variant).
+; The block at "data" is transformed in place (pass 1: rows, pass 2: columns).
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2 (DCTELEM * data)
+; data is loaded and stored with movdqa, so it must be 16-byte aligned.
+
+; r10 = DCTELEM * data  (NOTE(review): presumably set up by collect_args; confirm in jsimdext.inc)
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM], located just below the aligned rbp
+%define WK_NUM		2	; number of xmmword spill slots
+
+	align	16
+	global	EXTN(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp (rsp after the push; points at the saved rbp)
+	sub	rsp, byte 4			; step below the saved rbp before aligning
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax			; keep original rbp at [aligned rsp]; reloaded by "pop rsp"
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]			; reserve the WK_NUM work slots below rbp
+	collect_args				; macro defined outside this file (see note on r10 above)
+
+	; ---- Pass 1: process rows.
+
+	mov	rdx, r10	; (DCTELEM *)
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+	; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+	; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
+	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
+
+	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
+	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(42 52 62 72 43 53 63 73)
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=(44 54 64 74 45 55 65 75)
+
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
+	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
+	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
+	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
+	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+	movdqa	xmm6,xmm1
+	movdqa	xmm3,xmm0
+	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
+	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
+	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
+	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm2, XMMWORD [wk(0)]	; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(44 54 64 74 45 55 65 75)
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
+
+	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
+	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
+	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
+	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+	movdqa	xmm2,xmm1
+	movdqa	xmm5,xmm7
+	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
+	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
+	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
+	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm0,xmm6
+	psubw	xmm3,xmm1		; xmm3=tmp13
+	psubw	xmm6,xmm7		; xmm6=tmp12
+	paddw	xmm4,xmm1		; xmm4=tmp10
+	paddw	xmm0,xmm7		; xmm0=tmp11
+
+	paddw	xmm6,xmm3		; xmm6=tmp12+tmp13
+	psllw	xmm6,PRE_MULTIPLY_SCALE_BITS	; pre-scale so pmulhw's high word is (x*F)>>CONST_BITS
+	pmulhw	xmm6,[rel PW_F0707] ; xmm6=z1
+
+	movdqa	xmm1,xmm4
+	movdqa	xmm7,xmm3
+	psubw	xmm4,xmm0		; xmm4=data4
+	psubw	xmm3,xmm6		; xmm3=data6
+	paddw	xmm1,xmm0		; xmm1=data0
+	paddw	xmm7,xmm6		; xmm7=data2
+
+	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=tmp6
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp7
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=data4
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=data6
+
+	; -- Odd part
+
+	paddw	xmm2,xmm5		; xmm2=tmp10
+	paddw	xmm5,xmm0		; xmm5=tmp11
+	paddw	xmm0,xmm6		; xmm0=tmp12, xmm6=tmp7
+
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp10 for pmulhw
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp12 for pmulhw
+
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp11 for pmulhw
+	pmulhw	xmm5,[rel PW_F0707] ; xmm5=z3
+
+	movdqa	xmm4,xmm2		; xmm4=tmp10
+	psubw	xmm2,xmm0		; xmm2=tmp10-tmp12 (both pre-scaled)
+	pmulhw	xmm2,[rel PW_F0382] ; xmm2=z5
+	pmulhw	xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+	pmulhw	xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+	paddw	xmm4,xmm2		; xmm4=z2
+	paddw	xmm0,xmm2		; xmm0=z4
+
+	movdqa	xmm3,xmm6
+	psubw	xmm6,xmm5		; xmm6=z13
+	paddw	xmm3,xmm5		; xmm3=z11
+
+	movdqa	xmm2,xmm6
+	movdqa	xmm5,xmm3
+	psubw	xmm6,xmm4		; xmm6=data3
+	psubw	xmm3,xmm0		; xmm3=data7
+	paddw	xmm2,xmm4		; xmm2=data5
+	paddw	xmm5,xmm0		; xmm5=data1
+
+	; ---- Pass 2: process columns.
+
+	; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+	; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+	movdqa    xmm4,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm5		; xmm1=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm4,xmm5		; xmm4=(40 41 50 51 60 61 70 71)
+	movdqa    xmm0,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm6		; xmm7=(02 03 12 13 22 23 32 33)
+	punpckhwd xmm0,xmm6		; xmm0=(42 43 52 53 62 63 72 73)
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=col4
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=col6
+
+	; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+	; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(02 03 12 13 22 23 32 33)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(42 43 52 53 62 63 72 73)
+
+	movdqa    xmm7,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm2		; xmm5=(04 05 14 15 24 25 34 35)
+	punpckhwd xmm7,xmm2		; xmm7=(44 45 54 55 64 65 74 75)
+	movdqa    xmm0,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm3		; xmm6=(06 07 16 17 26 27 36 37)
+	punpckhwd xmm0,xmm3		; xmm0=(46 47 56 57 66 67 76 77)
+
+	movdqa    xmm2,xmm5		; transpose coefficients(phase 2)
+	punpckldq xmm5,xmm6		; xmm5=(04 05 06 07 14 15 16 17)
+	punpckhdq xmm2,xmm6		; xmm2=(24 25 26 27 34 35 36 37)
+	movdqa    xmm3,xmm7		; transpose coefficients(phase 2)
+	punpckldq xmm7,xmm0		; xmm7=(44 45 46 47 54 55 56 57)
+	punpckhdq xmm3,xmm0		; xmm3=(64 65 66 67 74 75 76 77)
+
+	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=(02 03 12 13 22 23 32 33)
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(42 43 52 53 62 63 72 73)
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(24 25 26 27 34 35 36 37)
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(44 45 46 47 54 55 56 57)
+
+	movdqa    xmm2,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm6		; xmm1=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm2,xmm6		; xmm2=(20 21 22 23 30 31 32 33)
+	movdqa    xmm7,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm0		; xmm4=(40 41 42 43 50 51 52 53)
+	punpckhdq xmm7,xmm0		; xmm7=(60 61 62 63 70 71 72 73)
+
+	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm5		; xmm1=(00 01 02 03 04 05 06 07)=data0
+	punpckhqdq xmm6,xmm5		; xmm6=(10 11 12 13 14 15 16 17)=data1
+	movdqa     xmm0,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm3		; xmm7=(60 61 62 63 64 65 66 67)=data6
+	punpckhqdq xmm0,xmm3		; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm3,xmm1
+	psubw	xmm6,xmm7		; xmm6=data1-data6=tmp6
+	psubw	xmm1,xmm0		; xmm1=data0-data7=tmp7
+	paddw	xmm5,xmm7		; xmm5=data1+data6=tmp1
+	paddw	xmm3,xmm0		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(24 25 26 27 34 35 36 37)
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(44 45 46 47 54 55 56 57)
+	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=tmp7
+
+	movdqa     xmm6,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm7		; xmm2=(20 21 22 23 24 25 26 27)=data2
+	punpckhqdq xmm6,xmm7		; xmm6=(30 31 32 33 34 35 36 37)=data3
+	movdqa     xmm1,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm0		; xmm4=(40 41 42 43 44 45 46 47)=data4
+	punpckhqdq xmm1,xmm0		; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+	movdqa	xmm7,xmm6
+	movdqa	xmm0,xmm2
+	paddw	xmm6,xmm4		; xmm6=data3+data4=tmp3
+	paddw	xmm2,xmm1		; xmm2=data2+data5=tmp2
+	psubw	xmm7,xmm4		; xmm7=data3-data4=tmp4
+	psubw	xmm0,xmm1		; xmm0=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm1,xmm5
+	psubw	xmm3,xmm6		; xmm3=tmp13
+	psubw	xmm5,xmm2		; xmm5=tmp12
+	paddw	xmm4,xmm6		; xmm4=tmp10
+	paddw	xmm1,xmm2		; xmm1=tmp11
+
+	paddw	xmm5,xmm3		; xmm5=tmp12+tmp13
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS	; pre-scale so pmulhw's high word is (x*F)>>CONST_BITS
+	pmulhw	xmm5,[rel PW_F0707] ; xmm5=z1
+
+	movdqa	xmm6,xmm4
+	movdqa	xmm2,xmm3
+	psubw	xmm4,xmm1		; xmm4=data4
+	psubw	xmm3,xmm5		; xmm3=data6
+	paddw	xmm6,xmm1		; xmm6=data0
+	paddw	xmm2,xmm5		; xmm2=data2
+
+	movdqa	XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+	movdqa	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+	; -- Odd part
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=tmp6
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
+
+	paddw	xmm7,xmm0		; xmm7=tmp10
+	paddw	xmm0,xmm1		; xmm0=tmp11
+	paddw	xmm1,xmm5		; xmm1=tmp12, xmm5=tmp7
+
+	psllw	xmm7,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp10 for pmulhw
+	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp12 for pmulhw
+
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp11 for pmulhw
+	pmulhw	xmm0,[rel PW_F0707] ; xmm0=z3
+
+	movdqa	xmm4,xmm7		; xmm4=tmp10
+	psubw	xmm7,xmm1		; xmm7=tmp10-tmp12 (both pre-scaled)
+	pmulhw	xmm7,[rel PW_F0382] ; xmm7=z5
+	pmulhw	xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+	pmulhw	xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+	paddw	xmm4,xmm7		; xmm4=z2
+	paddw	xmm1,xmm7		; xmm1=z4
+
+	movdqa	xmm3,xmm5
+	psubw	xmm5,xmm0		; xmm5=z13
+	paddw	xmm3,xmm0		; xmm3=z11
+
+	movdqa	xmm6,xmm5
+	movdqa	xmm2,xmm3
+	psubw	xmm5,xmm4		; xmm5=data3
+	psubw	xmm3,xmm1		; xmm3=data7
+	paddw	xmm6,xmm4		; xmm6=data5
+	paddw	xmm2,xmm1		; xmm2=data1
+
+	movdqa	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+	movdqa	XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+	movdqa	XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+	uncollect_args			; macro defined outside this file; counterpart of collect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp (the value stored at [rbp] in the prologue)
+	pop	rbp		; restore the caller's rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jfss2fst.asm b/jpeg/simd/jfss2fst.asm
new file mode 100644
index 000000000000..73fc9e51a61b
--- /dev/null
+++ b/jpeg/simd/jfss2fst.asm
@@ -0,0 +1,404 @@
+;
+; jfss2fst.asm - fast integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; number of fraction bits in the F_* constants below; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so the literals below are FIX(x) pre-scaled by 2^30; DESCALE rounds them to CONST_BITS fraction bits.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow); the code shifts each pmulhw input left by this amount.
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw), so the high word of the product is (x*F_n)>>CONST_BITS.
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)	; = 6 when CONST_BITS == 8
+
+	alignz	16
+	global	EXTN(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707	times 8 dw  F_0_707 << CONST_SHIFT	; 8 identical words (one xmmword)
+PW_F0382	times 8 dw  F_0_382 << CONST_SHIFT	; 8 identical words (one xmmword)
+PW_F0541	times 8 dw  F_0_541 << CONST_SHIFT	; 8 identical words (one xmmword)
+PW_F1306	times 8 dw  F_1_306 << CONST_SHIFT	; 8 identical words (one xmmword)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples (fast, less accurate variant).
+; The block at "data" is transformed in place (pass 1: rows, pass 2: columns).
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2 (DCTELEM * data)
+; data is loaded and stored with movdqa, so it must be 16-byte aligned.
+
+%define data(b)		(b)+8		; DCTELEM * data; b = original ebp, +8 skips the saved ebp and the return address
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM], located just below the aligned ebp
+%define WK_NUM		2	; number of xmmword spill slots
+
+	align	16
+	global	EXTN(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (esp after the push; points at the saved ebp)
+	sub	esp, byte 4			; make room for the original-ebp slot before aligning
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; keep original ebp at [aligned esp]; reloaded by "pop esp"
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve the WK_NUM work slots below ebp
+	pushpic	ebx		; macro defined outside this file; NOTE(review): presumably saves ebx only for PIC builds
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *) read through eax = original ebp
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+	; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
+	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
+
+	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
+	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(42 52 62 72 43 53 63 73)
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=(44 54 64 74 45 55 65 75)
+
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
+	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
+	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
+	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
+	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+	movdqa	xmm6,xmm1
+	movdqa	xmm3,xmm0
+	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
+	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
+	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
+	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm2, XMMWORD [wk(0)]	; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(44 54 64 74 45 55 65 75)
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
+
+	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
+	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
+	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
+	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+	movdqa	xmm2,xmm1
+	movdqa	xmm5,xmm7
+	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
+	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
+	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
+	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm0,xmm6
+	psubw	xmm3,xmm1		; xmm3=tmp13
+	psubw	xmm6,xmm7		; xmm6=tmp12
+	paddw	xmm4,xmm1		; xmm4=tmp10
+	paddw	xmm0,xmm7		; xmm0=tmp11
+
+	paddw	xmm6,xmm3		; xmm6=tmp12+tmp13
+	psllw	xmm6,PRE_MULTIPLY_SCALE_BITS	; pre-scale so pmulhw's high word is (x*F)>>CONST_BITS
+	pmulhw	xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
+
+	movdqa	xmm1,xmm4
+	movdqa	xmm7,xmm3
+	psubw	xmm4,xmm0		; xmm4=data4
+	psubw	xmm3,xmm6		; xmm3=data6
+	paddw	xmm1,xmm0		; xmm1=data0
+	paddw	xmm7,xmm6		; xmm7=data2
+
+	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=tmp6
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp7
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=data4
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=data6
+
+	; -- Odd part
+
+	paddw	xmm2,xmm5		; xmm2=tmp10
+	paddw	xmm5,xmm0		; xmm5=tmp11
+	paddw	xmm0,xmm6		; xmm0=tmp12, xmm6=tmp7
+
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp10 for pmulhw
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp12 for pmulhw
+
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp11 for pmulhw
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
+
+	movdqa	xmm4,xmm2		; xmm4=tmp10
+	psubw	xmm2,xmm0		; xmm2=tmp10-tmp12 (both pre-scaled)
+	pmulhw	xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+	pmulhw	xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+	paddw	xmm4,xmm2		; xmm4=z2
+	paddw	xmm0,xmm2		; xmm0=z4
+
+	movdqa	xmm3,xmm6
+	psubw	xmm6,xmm5		; xmm6=z13
+	paddw	xmm3,xmm5		; xmm3=z11
+
+	movdqa	xmm2,xmm6
+	movdqa	xmm5,xmm3
+	psubw	xmm6,xmm4		; xmm6=data3
+	psubw	xmm3,xmm0		; xmm3=data7
+	paddw	xmm2,xmm4		; xmm2=data5
+	paddw	xmm5,xmm0		; xmm5=data1
+
+	; ---- Pass 2: process columns.
+
+;	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+	; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+	movdqa    xmm4,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm5		; xmm1=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm4,xmm5		; xmm4=(40 41 50 51 60 61 70 71)
+	movdqa    xmm0,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm6		; xmm7=(02 03 12 13 22 23 32 33)
+	punpckhwd xmm0,xmm6		; xmm0=(42 43 52 53 62 63 72 73)
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=col4
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=col6
+
+	; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+	; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(02 03 12 13 22 23 32 33)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(42 43 52 53 62 63 72 73)
+
+	movdqa    xmm7,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm2		; xmm5=(04 05 14 15 24 25 34 35)
+	punpckhwd xmm7,xmm2		; xmm7=(44 45 54 55 64 65 74 75)
+	movdqa    xmm0,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm3		; xmm6=(06 07 16 17 26 27 36 37)
+	punpckhwd xmm0,xmm3		; xmm0=(46 47 56 57 66 67 76 77)
+
+	movdqa    xmm2,xmm5		; transpose coefficients(phase 2)
+	punpckldq xmm5,xmm6		; xmm5=(04 05 06 07 14 15 16 17)
+	punpckhdq xmm2,xmm6		; xmm2=(24 25 26 27 34 35 36 37)
+	movdqa    xmm3,xmm7		; transpose coefficients(phase 2)
+	punpckldq xmm7,xmm0		; xmm7=(44 45 46 47 54 55 56 57)
+	punpckhdq xmm3,xmm0		; xmm3=(64 65 66 67 74 75 76 77)
+
+	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=(02 03 12 13 22 23 32 33)
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(42 43 52 53 62 63 72 73)
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(24 25 26 27 34 35 36 37)
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(44 45 46 47 54 55 56 57)
+
+	movdqa    xmm2,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm6		; xmm1=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm2,xmm6		; xmm2=(20 21 22 23 30 31 32 33)
+	movdqa    xmm7,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm0		; xmm4=(40 41 42 43 50 51 52 53)
+	punpckhdq xmm7,xmm0		; xmm7=(60 61 62 63 70 71 72 73)
+
+	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm5		; xmm1=(00 01 02 03 04 05 06 07)=data0
+	punpckhqdq xmm6,xmm5		; xmm6=(10 11 12 13 14 15 16 17)=data1
+	movdqa     xmm0,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm3		; xmm7=(60 61 62 63 64 65 66 67)=data6
+	punpckhqdq xmm0,xmm3		; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm3,xmm1
+	psubw	xmm6,xmm7		; xmm6=data1-data6=tmp6
+	psubw	xmm1,xmm0		; xmm1=data0-data7=tmp7
+	paddw	xmm5,xmm7		; xmm5=data1+data6=tmp1
+	paddw	xmm3,xmm0		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(24 25 26 27 34 35 36 37)
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(44 45 46 47 54 55 56 57)
+	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=tmp7
+
+	movdqa     xmm6,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm7		; xmm2=(20 21 22 23 24 25 26 27)=data2
+	punpckhqdq xmm6,xmm7		; xmm6=(30 31 32 33 34 35 36 37)=data3
+	movdqa     xmm1,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm0		; xmm4=(40 41 42 43 44 45 46 47)=data4
+	punpckhqdq xmm1,xmm0		; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+	movdqa	xmm7,xmm6
+	movdqa	xmm0,xmm2
+	paddw	xmm6,xmm4		; xmm6=data3+data4=tmp3
+	paddw	xmm2,xmm1		; xmm2=data2+data5=tmp2
+	psubw	xmm7,xmm4		; xmm7=data3-data4=tmp4
+	psubw	xmm0,xmm1		; xmm0=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm1,xmm5
+	psubw	xmm3,xmm6		; xmm3=tmp13
+	psubw	xmm5,xmm2		; xmm5=tmp12
+	paddw	xmm4,xmm6		; xmm4=tmp10
+	paddw	xmm1,xmm2		; xmm1=tmp11
+
+	paddw	xmm5,xmm3		; xmm5=tmp12+tmp13
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS	; pre-scale so pmulhw's high word is (x*F)>>CONST_BITS
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
+
+	movdqa	xmm6,xmm4
+	movdqa	xmm2,xmm3
+	psubw	xmm4,xmm1		; xmm4=data4
+	psubw	xmm3,xmm5		; xmm3=data6
+	paddw	xmm6,xmm1		; xmm6=data0
+	paddw	xmm2,xmm5		; xmm2=data2
+
+	movdqa	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
+	movdqa	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
+	movdqa	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
+
+	; -- Odd part
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=tmp6
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
+
+	paddw	xmm7,xmm0		; xmm7=tmp10
+	paddw	xmm0,xmm1		; xmm0=tmp11
+	paddw	xmm1,xmm5		; xmm1=tmp12, xmm5=tmp7
+
+	psllw	xmm7,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp10 for pmulhw
+	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp12 for pmulhw
+
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS	; pre-scale tmp11 for pmulhw
+	pmulhw	xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
+
+	movdqa	xmm4,xmm7		; xmm4=tmp10
+	psubw	xmm7,xmm1		; xmm7=tmp10-tmp12 (both pre-scaled)
+	pmulhw	xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+	pmulhw	xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+	paddw	xmm4,xmm7		; xmm4=z2
+	paddw	xmm1,xmm7		; xmm1=z4
+
+	movdqa	xmm3,xmm5
+	psubw	xmm5,xmm0		; xmm5=z13
+	paddw	xmm3,xmm0		; xmm3=z11
+
+	movdqa	xmm6,xmm5
+	movdqa	xmm2,xmm3
+	psubw	xmm5,xmm4		; xmm5=data3
+	psubw	xmm3,xmm1		; xmm3=data7
+	paddw	xmm6,xmm4		; xmm6=data5
+	paddw	xmm2,xmm1		; xmm2=data1
+
+	movdqa	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
+	movdqa	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
+	movdqa	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
+	movdqa	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx		; macro defined outside this file; counterpart of pushpic above
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp (the value stored at [ebp] in the prologue)
+	pop	ebp		; restore the caller's ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jfss2int-64.asm b/jpeg/simd/jfss2int-64.asm
new file mode 100644
index 000000000000..bd1bd45abe56
--- /dev/null
+++ b/jpeg/simd/jfss2int-64.asm
@@ -0,0 +1,622 @@
+;
+; jfss2int-64.asm - accurate integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13		; fractional bits of the F_* fixed-point constants
+%define PASS1_BITS	2		; extra scale applied in pass 1 (psllw), removed in pass 2 (psraw)
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)	; psrad count after the pass-1 multiplies
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS)	; psrad count after the pass-2 multiplies
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is the constant pre-scaled by 2^30 and then rounded down to CONST_BITS fractional bits.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):	; each PW_* row is a word pair (c0,c1) repeated 4 times, used as a pmaddwd operand on word-interleaved inputs (a,b): result dword = a*c0 + b*c1
+
+PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541	; data2 = tmp13*c0 + tmp12*c1
+PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)	; data6 = tmp13*c0 + tmp12*c1
+PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175	; new z3 = z3*c0 + z4*c1
+PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)	; new z4 = z3*c0 + z4*c1
+PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899	; new tmp4 = tmp4*c0 + tmp7*c1
+PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)	; new tmp7 = tmp4*c0 + tmp7*c1
+PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562	; new tmp5 = tmp5*c0 + tmp6*c1
+PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)	; new tmp6 = tmp5*c0 + tmp6*c1
+PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)	; rounding bias added before psrad DESCALE_P1 (pass 1)
+PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)	; rounding bias added before psrad DESCALE_P2 (pass 2)
+PW_DESCALE_P2X	times 8 dw  1 << (PASS1_BITS-1)	; rounding bias added before psraw PASS1_BITS (pass 2, data0/data4)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform the forward DCT on one block of samples, in place.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2 (DCTELEM * data)
+;
+
+; r10 = DCTELEM * data  (read and written with movdqa, so it must be 16-byte aligned)
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		6	; wk(0)..wk(5) are all used below
+
+	align	16
+	global	EXTN(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = rsp = address of the saved rbp
+	sub	rsp, byte 4			; step below the saved rbp; NOTE(review): 4 is the same value as in the 32-bit file, this relies on rsp being 8-byte aligned here so that the AND leaves >= 8 bytes
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax			; [aligned rsp] = address of the saved rbp
+	mov	rbp,rsp				; rbp = aligned frame base
+	lea	rsp, [wk(0)]			; reserve wk[WK_NUM] below rbp
+	collect_args				; macro from an included file; per the note above, data is in r10 afterwards
+
+	; ---- Pass 1: process rows.
+
+	mov	rdx, r10	; (DCTELEM *)
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+	; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+	; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
+	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
+
+	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
+	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
+	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=(42 52 62 72 43 53 63 73)
+	movdqa	XMMWORD [wk(3)], xmm2	; wk(3)=(44 54 64 74 45 55 65 75)
+
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
+	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
+	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
+	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
+	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+	movdqa	xmm6,xmm1
+	movdqa	xmm3,xmm0
+	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
+	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
+	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
+	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa	xmm5, XMMWORD [wk(3)]	; xmm5=(44 54 64 74 45 55 65 75)
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
+
+	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
+	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
+	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
+	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+	movdqa	xmm2,xmm1
+	movdqa	xmm5,xmm7
+	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
+	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
+	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
+	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm0,xmm6
+	paddw	xmm3,xmm1		; xmm3=tmp10
+	paddw	xmm6,xmm7		; xmm6=tmp11
+	psubw	xmm4,xmm1		; xmm4=tmp13
+	psubw	xmm0,xmm7		; xmm0=tmp12
+
+	movdqa	xmm1,xmm3
+	paddw	xmm3,xmm6		; xmm3=tmp10+tmp11
+	psubw	xmm1,xmm6		; xmm1=tmp10-tmp11
+
+	psllw	xmm3,PASS1_BITS		; xmm3=data0
+	psllw	xmm1,PASS1_BITS		; xmm1=data4
+
+	movdqa	XMMWORD [wk(2)], xmm3	; wk(2)=data0
+	movdqa	XMMWORD [wk(3)], xmm1	; wk(3)=data4
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movdqa    xmm7,xmm4		; xmm4=tmp13
+	movdqa    xmm6,xmm4
+	punpcklwd xmm7,xmm0		; xmm0=tmp12
+	punpckhwd xmm6,xmm0
+	movdqa    xmm4,xmm7
+	movdqa    xmm0,xmm6
+	pmaddwd   xmm7,[rel PW_F130_F054]	; xmm7=data2L
+	pmaddwd   xmm6,[rel PW_F130_F054]	; xmm6=data2H
+	pmaddwd   xmm4,[rel PW_F054_MF130]	; xmm4=data6L
+	pmaddwd   xmm0,[rel PW_F054_MF130]	; xmm0=data6H
+
+	paddd	xmm7,[rel PD_DESCALE_P1]	; add 1<<(DESCALE_P1-1) so the psrad below rounds
+	paddd	xmm6,[rel PD_DESCALE_P1]
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+	paddd	xmm4,[rel PD_DESCALE_P1]
+	paddd	xmm0,[rel PD_DESCALE_P1]
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm7,xmm6		; xmm7=data2
+	packssdw  xmm4,xmm0		; xmm4=data6
+
+	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=data2
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=data6
+
+	; -- Odd part
+
+	movdqa	xmm3, XMMWORD [wk(0)]	; xmm3=tmp6
+	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp7
+
+	movdqa	xmm6,xmm2		; xmm2=tmp4
+	movdqa	xmm0,xmm5		; xmm5=tmp5
+	paddw	xmm6,xmm3		; xmm6=z3
+	paddw	xmm0,xmm1		; xmm0=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm7,xmm6
+	movdqa    xmm4,xmm6
+	punpcklwd xmm7,xmm0
+	punpckhwd xmm4,xmm0
+	movdqa    xmm6,xmm7
+	movdqa    xmm0,xmm4
+	pmaddwd   xmm7,[rel PW_MF078_F117]	; xmm7=z3L
+	pmaddwd   xmm4,[rel PW_MF078_F117]	; xmm4=z3H
+	pmaddwd   xmm6,[rel PW_F117_F078]	; xmm6=z4L
+	pmaddwd   xmm0,[rel PW_F117_F078]	; xmm0=z4H
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=z3L
+	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movdqa    xmm7,xmm2
+	movdqa    xmm4,xmm2
+	punpcklwd xmm7,xmm1
+	punpckhwd xmm4,xmm1
+	movdqa    xmm2,xmm7
+	movdqa    xmm1,xmm4
+	pmaddwd   xmm7,[rel PW_MF060_MF089]	; xmm7=tmp4L
+	pmaddwd   xmm4,[rel PW_MF060_MF089]	; xmm4=tmp4H
+	pmaddwd   xmm2,[rel PW_MF089_F060]	; xmm2=tmp7L
+	pmaddwd   xmm1,[rel PW_MF089_F060]	; xmm1=tmp7H
+
+	paddd	xmm7, XMMWORD [wk(0)]	; xmm7=data7L
+	paddd	xmm4, XMMWORD [wk(1)]	; xmm4=data7H
+	paddd	xmm2,xmm6		; xmm2=data1L
+	paddd	xmm1,xmm0		; xmm1=data1H
+
+	paddd	xmm7,[rel PD_DESCALE_P1]
+	paddd	xmm4,[rel PD_DESCALE_P1]
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm4,DESCALE_P1
+	paddd	xmm2,[rel PD_DESCALE_P1]
+	paddd	xmm1,[rel PD_DESCALE_P1]
+	psrad	xmm2,DESCALE_P1
+	psrad	xmm1,DESCALE_P1
+
+	packssdw  xmm7,xmm4		; xmm7=data7
+	packssdw  xmm2,xmm1		; xmm2=data1
+
+	movdqa    xmm4,xmm5
+	movdqa    xmm1,xmm5
+	punpcklwd xmm4,xmm3
+	punpckhwd xmm1,xmm3
+	movdqa    xmm5,xmm4
+	movdqa    xmm3,xmm1
+	pmaddwd   xmm4,[rel PW_MF050_MF256]	; xmm4=tmp5L
+	pmaddwd   xmm1,[rel PW_MF050_MF256]	; xmm1=tmp5H
+	pmaddwd   xmm5,[rel PW_MF256_F050]	; xmm5=tmp6L
+	pmaddwd   xmm3,[rel PW_MF256_F050]	; xmm3=tmp6H
+
+	paddd	xmm4,xmm6		; xmm4=data5L
+	paddd	xmm1,xmm0		; xmm1=data5H
+	paddd	xmm5, XMMWORD [wk(0)]	; xmm5=data3L
+	paddd	xmm3, XMMWORD [wk(1)]	; xmm3=data3H
+
+	paddd	xmm4,[rel PD_DESCALE_P1]
+	paddd	xmm1,[rel PD_DESCALE_P1]
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm1,DESCALE_P1
+	paddd	xmm5,[rel PD_DESCALE_P1]
+	paddd	xmm3,[rel PD_DESCALE_P1]
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm3,DESCALE_P1
+
+	packssdw  xmm4,xmm1		; xmm4=data5
+	packssdw  xmm5,xmm3		; xmm5=data3
+
+	; ---- Pass 2: process columns.
+
+	movdqa	xmm6, XMMWORD [wk(2)]	; xmm6=col0
+	movdqa	xmm0, XMMWORD [wk(4)]	; xmm0=col2
+
+	; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+	; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+	movdqa    xmm1,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm2		; xmm6=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm1,xmm2		; xmm1=(40 41 50 51 60 61 70 71)
+	movdqa    xmm3,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm5		; xmm0=(02 03 12 13 22 23 32 33)
+	punpckhwd xmm3,xmm5		; xmm3=(42 43 52 53 62 63 72 73)
+
+	movdqa	xmm2, XMMWORD [wk(3)]	; xmm2=col4
+	movdqa	xmm5, XMMWORD [wk(5)]	; xmm5=col6
+
+	; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+	; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=(02 03 12 13 22 23 32 33)
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(42 43 52 53 62 63 72 73)
+
+	movdqa    xmm0,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm4		; xmm2=(04 05 14 15 24 25 34 35)
+	punpckhwd xmm0,xmm4		; xmm0=(44 45 54 55 64 65 74 75)
+	movdqa    xmm3,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm7		; xmm5=(06 07 16 17 26 27 36 37)
+	punpckhwd xmm3,xmm7		; xmm3=(46 47 56 57 66 67 76 77)
+
+	movdqa    xmm4,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(04 05 06 07 14 15 16 17)
+	punpckhdq xmm4,xmm5		; xmm4=(24 25 26 27 34 35 36 37)
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm3		; xmm0=(44 45 46 47 54 55 56 57)
+	punpckhdq xmm7,xmm3		; xmm7=(64 65 66 67 74 75 76 77)
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=(02 03 12 13 22 23 32 33)
+	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53 62 63 72 73)
+	movdqa	XMMWORD [wk(2)], xmm4	; wk(2)=(24 25 26 27 34 35 36 37)
+	movdqa	XMMWORD [wk(3)], xmm0	; wk(3)=(44 45 46 47 54 55 56 57)
+
+	movdqa    xmm4,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm5		; xmm6=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm4,xmm5		; xmm4=(20 21 22 23 30 31 32 33)
+	movdqa    xmm0,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm3		; xmm1=(40 41 42 43 50 51 52 53)
+	punpckhdq xmm0,xmm3		; xmm0=(60 61 62 63 70 71 72 73)
+
+	movdqa     xmm5,xmm6		; transpose coefficients(phase 3)
+	punpcklqdq xmm6,xmm2		; xmm6=(00 01 02 03 04 05 06 07)=data0
+	punpckhqdq xmm5,xmm2		; xmm5=(10 11 12 13 14 15 16 17)=data1
+	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm7		; xmm0=(60 61 62 63 64 65 66 67)=data6
+	punpckhqdq xmm3,xmm7		; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+	movdqa	xmm2,xmm5
+	movdqa	xmm7,xmm6
+	psubw	xmm5,xmm0		; xmm5=data1-data6=tmp6
+	psubw	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	paddw	xmm2,xmm0		; xmm2=data1+data6=tmp1
+	paddw	xmm7,xmm3		; xmm7=data0+data7=tmp0
+
+	movdqa	xmm0, XMMWORD [wk(2)]	; xmm0=(24 25 26 27 34 35 36 37)
+	movdqa	xmm3, XMMWORD [wk(3)]	; xmm3=(44 45 46 47 54 55 56 57)
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movdqa     xmm5,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm0		; xmm4=(20 21 22 23 24 25 26 27)=data2
+	punpckhqdq xmm5,xmm0		; xmm5=(30 31 32 33 34 35 36 37)=data3
+	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm3		; xmm1=(40 41 42 43 44 45 46 47)=data4
+	punpckhqdq xmm6,xmm3		; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm3,xmm4
+	paddw	xmm5,xmm1		; xmm5=data3+data4=tmp3
+	paddw	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	psubw	xmm0,xmm1		; xmm0=data3-data4=tmp4
+	psubw	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm1,xmm7
+	movdqa	xmm6,xmm2
+	paddw	xmm7,xmm5		; xmm7=tmp10
+	paddw	xmm2,xmm4		; xmm2=tmp11
+	psubw	xmm1,xmm5		; xmm1=tmp13
+	psubw	xmm6,xmm4		; xmm6=tmp12
+
+	movdqa	xmm5,xmm7
+	paddw	xmm7,xmm2		; xmm7=tmp10+tmp11
+	psubw	xmm5,xmm2		; xmm5=tmp10-tmp11
+
+	paddw	xmm7,[rel PW_DESCALE_P2X]	; add 1<<(PASS1_BITS-1) so the psraw below rounds
+	paddw	xmm5,[rel PW_DESCALE_P2X]
+	psraw	xmm7,PASS1_BITS		; xmm7=data0
+	psraw	xmm5,PASS1_BITS		; xmm5=data4
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
+	movdqa	XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movdqa    xmm4,xmm1		; xmm1=tmp13
+	movdqa    xmm2,xmm1
+	punpcklwd xmm4,xmm6		; xmm6=tmp12
+	punpckhwd xmm2,xmm6
+	movdqa    xmm1,xmm4
+	movdqa    xmm6,xmm2
+	pmaddwd   xmm4,[rel PW_F130_F054]	; xmm4=data2L
+	pmaddwd   xmm2,[rel PW_F130_F054]	; xmm2=data2H
+	pmaddwd   xmm1,[rel PW_F054_MF130]	; xmm1=data6L
+	pmaddwd   xmm6,[rel PW_F054_MF130]	; xmm6=data6H
+
+	paddd	xmm4,[rel PD_DESCALE_P2]	; add 1<<(DESCALE_P2-1) so the psrad below rounds
+	paddd	xmm2,[rel PD_DESCALE_P2]
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm2,DESCALE_P2
+	paddd	xmm1,[rel PD_DESCALE_P2]
+	paddd	xmm6,[rel PD_DESCALE_P2]
+	psrad	xmm1,DESCALE_P2
+	psrad	xmm6,DESCALE_P2
+
+	packssdw  xmm4,xmm2		; xmm4=data2
+	packssdw  xmm1,xmm6		; xmm1=data6
+
+	movdqa	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
+
+	; -- Odd part
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp6
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
+
+	movdqa	xmm2,xmm0		; xmm0=tmp4
+	movdqa	xmm6,xmm3		; xmm3=tmp5
+	paddw	xmm2,xmm7		; xmm2=z3
+	paddw	xmm6,xmm5		; xmm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm1,xmm2
+	punpcklwd xmm4,xmm6
+	punpckhwd xmm1,xmm6
+	movdqa    xmm2,xmm4
+	movdqa    xmm6,xmm1
+	pmaddwd   xmm4,[rel PW_MF078_F117]	; xmm4=z3L
+	pmaddwd   xmm1,[rel PW_MF078_F117]	; xmm1=z3H
+	pmaddwd   xmm2,[rel PW_F117_F078]	; xmm2=z4L
+	pmaddwd   xmm6,[rel PW_F117_F078]	; xmm6=z4H
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=z3L
+	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movdqa    xmm4,xmm0
+	movdqa    xmm1,xmm0
+	punpcklwd xmm4,xmm5
+	punpckhwd xmm1,xmm5
+	movdqa    xmm0,xmm4
+	movdqa    xmm5,xmm1
+	pmaddwd   xmm4,[rel PW_MF060_MF089]	; xmm4=tmp4L
+	pmaddwd   xmm1,[rel PW_MF060_MF089]	; xmm1=tmp4H
+	pmaddwd   xmm0,[rel PW_MF089_F060]	; xmm0=tmp7L
+	pmaddwd   xmm5,[rel PW_MF089_F060]	; xmm5=tmp7H
+
+	paddd	xmm4, XMMWORD [wk(0)]	; xmm4=data7L
+	paddd	xmm1, XMMWORD [wk(1)]	; xmm1=data7H
+	paddd	xmm0,xmm2		; xmm0=data1L
+	paddd	xmm5,xmm6		; xmm5=data1H
+
+	paddd	xmm4,[rel PD_DESCALE_P2]
+	paddd	xmm1,[rel PD_DESCALE_P2]
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm0,[rel PD_DESCALE_P2]
+	paddd	xmm5,[rel PD_DESCALE_P2]
+	psrad	xmm0,DESCALE_P2
+	psrad	xmm5,DESCALE_P2
+
+	packssdw  xmm4,xmm1		; xmm4=data7
+	packssdw  xmm0,xmm5		; xmm0=data1
+
+	movdqa	XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
+
+	movdqa    xmm1,xmm3
+	movdqa    xmm5,xmm3
+	punpcklwd xmm1,xmm7
+	punpckhwd xmm5,xmm7
+	movdqa    xmm3,xmm1
+	movdqa    xmm7,xmm5
+	pmaddwd   xmm1,[rel PW_MF050_MF256]	; xmm1=tmp5L
+	pmaddwd   xmm5,[rel PW_MF050_MF256]	; xmm5=tmp5H
+	pmaddwd   xmm3,[rel PW_MF256_F050]	; xmm3=tmp6L
+	pmaddwd   xmm7,[rel PW_MF256_F050]	; xmm7=tmp6H
+
+	paddd	xmm1,xmm2		; xmm1=data5L
+	paddd	xmm5,xmm6		; xmm5=data5H
+	paddd	xmm3, XMMWORD [wk(0)]	; xmm3=data3L
+	paddd	xmm7, XMMWORD [wk(1)]	; xmm7=data3H
+
+	paddd	xmm1,[rel PD_DESCALE_P2]
+	paddd	xmm5,[rel PD_DESCALE_P2]
+	psrad	xmm1,DESCALE_P2
+	psrad	xmm5,DESCALE_P2
+	paddd	xmm3,[rel PD_DESCALE_P2]
+	paddd	xmm7,[rel PD_DESCALE_P2]
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm1,xmm5		; xmm1=data5
+	packssdw  xmm3,xmm7		; xmm3=data3
+
+	movdqa	XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
+
+	uncollect_args		; macro counterpart of collect_args in the prologue
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- address of the saved rbp (the value stored at [rbp] in the prologue)
+	pop	rbp		; restore the caller's rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jfss2int.asm b/jpeg/simd/jfss2int.asm
new file mode 100644
index 000000000000..5e3f2aaa9549
--- /dev/null
+++ b/jpeg/simd/jfss2int.asm
@@ -0,0 +1,634 @@
+;
+; jfss2int.asm - accurate integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13		; fractional bits of the F_* fixed-point constants
+%define PASS1_BITS	2		; left-shift (psllw) applied to the pass-1 data0/data4 outputs
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)	; psrad count after the pass-1 multiplies
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS)	; presumably the pass-2 psrad count, as in jfss2int-64.asm -- pass 2 is not shown in this excerpt
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is the constant pre-scaled by 2^30 and then rounded down to CONST_BITS fractional bits.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):	; each PW_* row is a word pair (c0,c1) repeated 4 times, used as a pmaddwd operand on word-interleaved inputs (a,b): result dword = a*c0 + b*c1
+
+PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541	; data2 = tmp13*c0 + tmp12*c1
+PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)	; data6 = tmp13*c0 + tmp12*c1
+PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175	; new z3 = z3*c0 + z4*c1
+PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)	; new z4 = z3*c0 + z4*c1
+PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899	; new tmp4 = tmp4*c0 + tmp7*c1
+PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)	; new tmp7 = tmp4*c0 + tmp7*c1
+PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562	; new tmp5 = tmp5*c0 + tmp6*c1
+PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)	; new tmp6 = tmp5*c0 + tmp6*c1
+PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)	; rounding bias added before psrad DESCALE_P1 (pass 1)
+PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)	; half of 1<<DESCALE_P2; presumably the pass-2 rounding bias (pass 2 not shown in this excerpt)
+PW_DESCALE_P2X	times 8 dw  1 << (PASS1_BITS-1)	; half of 1<<PASS1_BITS; presumably the rounding bias for a word shift by PASS1_BITS in pass 2
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2 (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		6
+
+	align	16
+	global	EXTN(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
+	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
+
+	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
+	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
+	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=(42 52 62 72 43 53 63 73)
+	movdqa	XMMWORD [wk(3)], xmm2	; wk(3)=(44 54 64 74 45 55 65 75)
+
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
+	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
+	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
+	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
+	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+	movdqa	xmm6,xmm1
+	movdqa	xmm3,xmm0
+	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
+	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
+	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
+	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa	xmm5, XMMWORD [wk(3)]	; xmm5=(44 54 64 74 45 55 65 75)
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
+
+	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
+	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
+	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
+	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+	movdqa	xmm2,xmm1
+	movdqa	xmm5,xmm7
+	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
+	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
+	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
+	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm0,xmm6
+	paddw	xmm3,xmm1		; xmm3=tmp10
+	paddw	xmm6,xmm7		; xmm6=tmp11
+	psubw	xmm4,xmm1		; xmm4=tmp13
+	psubw	xmm0,xmm7		; xmm0=tmp12
+
+	movdqa	xmm1,xmm3
+	paddw	xmm3,xmm6		; xmm3=tmp10+tmp11
+	psubw	xmm1,xmm6		; xmm1=tmp10-tmp11
+
+	psllw	xmm3,PASS1_BITS		; xmm3=data0
+	psllw	xmm1,PASS1_BITS		; xmm1=data4
+
+	movdqa	XMMWORD [wk(2)], xmm3	; wk(2)=data0
+	movdqa	XMMWORD [wk(3)], xmm1	; wk(3)=data4
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movdqa    xmm7,xmm4		; xmm4=tmp13
+	movdqa    xmm6,xmm4
+	punpcklwd xmm7,xmm0		; xmm0=tmp12
+	punpckhwd xmm6,xmm0
+	movdqa    xmm4,xmm7
+	movdqa    xmm0,xmm6
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_F130_F054)]	; xmm7=data2L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]	; xmm6=data2H
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F054_MF130)]	; xmm4=data6L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F054_MF130)]	; xmm0=data6H
+
+	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm7,xmm6		; xmm7=data2
+	packssdw  xmm4,xmm0		; xmm4=data6
+
+	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=data2
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=data6
+
+	; -- Odd part
+
+	movdqa	xmm3, XMMWORD [wk(0)]	; xmm3=tmp6
+	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp7
+
+	movdqa	xmm6,xmm2		; xmm2=tmp4
+	movdqa	xmm0,xmm5		; xmm5=tmp5
+	paddw	xmm6,xmm3		; xmm6=z3
+	paddw	xmm0,xmm1		; xmm0=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm7,xmm6
+	movdqa    xmm4,xmm6
+	punpcklwd xmm7,xmm0
+	punpckhwd xmm4,xmm0
+	movdqa    xmm6,xmm7
+	movdqa    xmm0,xmm4
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]	; xmm7=z3L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]	; xmm4=z3H
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]	; xmm6=z4L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F117_F078)]	; xmm0=z4H
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=z3L
+	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movdqa    xmm7,xmm2
+	movdqa    xmm4,xmm2
+	punpcklwd xmm7,xmm1
+	punpckhwd xmm4,xmm1
+	movdqa    xmm2,xmm7
+	movdqa    xmm1,xmm4
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm7=tmp4L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm4=tmp4H
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF089_F060)]	; xmm2=tmp7L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]	; xmm1=tmp7H
+
+	paddd	xmm7, XMMWORD [wk(0)]	; xmm7=data7L
+	paddd	xmm4, XMMWORD [wk(1)]	; xmm4=data7H
+	paddd	xmm2,xmm6		; xmm2=data1L
+	paddd	xmm1,xmm0		; xmm1=data1H
+
+	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm4,DESCALE_P1
+	paddd	xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm2,DESCALE_P1
+	psrad	xmm1,DESCALE_P1
+
+	packssdw  xmm7,xmm4		; xmm7=data7
+	packssdw  xmm2,xmm1		; xmm2=data1
+
+	movdqa    xmm4,xmm5
+	movdqa    xmm1,xmm5
+	punpcklwd xmm4,xmm3
+	punpckhwd xmm1,xmm3
+	movdqa    xmm5,xmm4
+	movdqa    xmm3,xmm1
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm4=tmp5L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm1=tmp5H
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF256_F050)]	; xmm5=tmp6L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]	; xmm3=tmp6H
+
+	paddd	xmm4,xmm6		; xmm4=data5L
+	paddd	xmm1,xmm0		; xmm1=data5H
+	paddd	xmm5, XMMWORD [wk(0)]	; xmm5=data3L
+	paddd	xmm3, XMMWORD [wk(1)]	; xmm3=data3H
+
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm1,DESCALE_P1
+	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm3,DESCALE_P1
+
+	packssdw  xmm4,xmm1		; xmm4=data5
+	packssdw  xmm5,xmm3		; xmm5=data3
+
+	; ---- Pass 2: process columns.
+
+;	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	movdqa	xmm6, XMMWORD [wk(2)]	; xmm6=col0
+	movdqa	xmm0, XMMWORD [wk(4)]	; xmm0=col2
+
+	; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+	; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+	movdqa    xmm1,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm2		; xmm6=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm1,xmm2		; xmm1=(40 41 50 51 60 61 70 71)
+	movdqa    xmm3,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm5		; xmm0=(02 03 12 13 22 23 32 33)
+	punpckhwd xmm3,xmm5		; xmm3=(42 43 52 53 62 63 72 73)
+
+	movdqa	xmm2, XMMWORD [wk(3)]	; xmm2=col4
+	movdqa	xmm5, XMMWORD [wk(5)]	; xmm5=col6
+
+	; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+	; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=(02 03 12 13 22 23 32 33)
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(42 43 52 53 62 63 72 73)
+
+	movdqa    xmm0,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm4		; xmm2=(04 05 14 15 24 25 34 35)
+	punpckhwd xmm0,xmm4		; xmm0=(44 45 54 55 64 65 74 75)
+	movdqa    xmm3,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm7		; xmm5=(06 07 16 17 26 27 36 37)
+	punpckhwd xmm3,xmm7		; xmm3=(46 47 56 57 66 67 76 77)
+
+	movdqa    xmm4,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(04 05 06 07 14 15 16 17)
+	punpckhdq xmm4,xmm5		; xmm4=(24 25 26 27 34 35 36 37)
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm3		; xmm0=(44 45 46 47 54 55 56 57)
+	punpckhdq xmm7,xmm3		; xmm7=(64 65 66 67 74 75 76 77)
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=(02 03 12 13 22 23 32 33)
+	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53 62 63 72 73)
+	movdqa	XMMWORD [wk(2)], xmm4	; wk(2)=(24 25 26 27 34 35 36 37)
+	movdqa	XMMWORD [wk(3)], xmm0	; wk(3)=(44 45 46 47 54 55 56 57)
+
+	movdqa    xmm4,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm5		; xmm6=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm4,xmm5		; xmm4=(20 21 22 23 30 31 32 33)
+	movdqa    xmm0,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm3		; xmm1=(40 41 42 43 50 51 52 53)
+	punpckhdq xmm0,xmm3		; xmm0=(60 61 62 63 70 71 72 73)
+
+	movdqa     xmm5,xmm6		; transpose coefficients(phase 3)
+	punpcklqdq xmm6,xmm2		; xmm6=(00 01 02 03 04 05 06 07)=data0
+	punpckhqdq xmm5,xmm2		; xmm5=(10 11 12 13 14 15 16 17)=data1
+	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm7		; xmm0=(60 61 62 63 64 65 66 67)=data6
+	punpckhqdq xmm3,xmm7		; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+	movdqa	xmm2,xmm5
+	movdqa	xmm7,xmm6
+	psubw	xmm5,xmm0		; xmm5=data1-data6=tmp6
+	psubw	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	paddw	xmm2,xmm0		; xmm2=data1+data6=tmp1
+	paddw	xmm7,xmm3		; xmm7=data0+data7=tmp0
+
+	movdqa	xmm0, XMMWORD [wk(2)]	; xmm0=(24 25 26 27 34 35 36 37)
+	movdqa	xmm3, XMMWORD [wk(3)]	; xmm3=(44 45 46 47 54 55 56 57)
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movdqa     xmm5,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm0		; xmm4=(20 21 22 23 24 25 26 27)=data2
+	punpckhqdq xmm5,xmm0		; xmm5=(30 31 32 33 34 35 36 37)=data3
+	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm3		; xmm1=(40 41 42 43 44 45 46 47)=data4
+	punpckhqdq xmm6,xmm3		; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm3,xmm4
+	paddw	xmm5,xmm1		; xmm5=data3+data4=tmp3
+	paddw	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	psubw	xmm0,xmm1		; xmm0=data3-data4=tmp4
+	psubw	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm1,xmm7
+	movdqa	xmm6,xmm2
+	paddw	xmm7,xmm5		; xmm7=tmp10
+	paddw	xmm2,xmm4		; xmm2=tmp11
+	psubw	xmm1,xmm5		; xmm1=tmp13
+	psubw	xmm6,xmm4		; xmm6=tmp12
+
+	movdqa	xmm5,xmm7
+	paddw	xmm7,xmm2		; xmm7=tmp10+tmp11
+	psubw	xmm5,xmm2		; xmm5=tmp10-tmp11
+
+	paddw	xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	paddw	xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	psraw	xmm7,PASS1_BITS		; xmm7=data0
+	psraw	xmm5,PASS1_BITS		; xmm5=data4
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+	movdqa	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movdqa    xmm4,xmm1		; xmm1=tmp13
+	movdqa    xmm2,xmm1
+	punpcklwd xmm4,xmm6		; xmm6=tmp12
+	punpckhwd xmm2,xmm6
+	movdqa    xmm1,xmm4
+	movdqa    xmm6,xmm2
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]	; xmm4=data2L
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F130_F054)]	; xmm2=data2H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=data6L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F054_MF130)]	; xmm6=data6H
+
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm2,DESCALE_P2
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm1,DESCALE_P2
+	psrad	xmm6,DESCALE_P2
+
+	packssdw  xmm4,xmm2		; xmm4=data2
+	packssdw  xmm1,xmm6		; xmm1=data6
+
+	movdqa	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+	; -- Odd part
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp6
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
+
+	movdqa	xmm2,xmm0		; xmm0=tmp4
+	movdqa	xmm6,xmm3		; xmm3=tmp5
+	paddw	xmm2,xmm7		; xmm2=z3
+	paddw	xmm6,xmm5		; xmm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm1,xmm2
+	punpcklwd xmm4,xmm6
+	punpckhwd xmm1,xmm6
+	movdqa    xmm2,xmm4
+	movdqa    xmm6,xmm1
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]	; xmm4=z3L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF078_F117)]	; xmm1=z3H
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F117_F078)]	; xmm2=z4L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]	; xmm6=z4H
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=z3L
+	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movdqa    xmm4,xmm0
+	movdqa    xmm1,xmm0
+	punpcklwd xmm4,xmm5
+	punpckhwd xmm1,xmm5
+	movdqa    xmm0,xmm4
+	movdqa    xmm5,xmm1
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm4=tmp4L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm1=tmp4H
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF089_F060)]	; xmm0=tmp7L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF089_F060)]	; xmm5=tmp7H
+
+	paddd	xmm4, XMMWORD [wk(0)]	; xmm4=data7L
+	paddd	xmm1, XMMWORD [wk(1)]	; xmm1=data7H
+	paddd	xmm0,xmm2		; xmm0=data1L
+	paddd	xmm5,xmm6		; xmm5=data1H
+
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm0,DESCALE_P2
+	psrad	xmm5,DESCALE_P2
+
+	packssdw  xmm4,xmm1		; xmm4=data7
+	packssdw  xmm0,xmm5		; xmm0=data1
+
+	movdqa	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+	movdqa    xmm1,xmm3
+	movdqa    xmm5,xmm3
+	punpcklwd xmm1,xmm7
+	punpckhwd xmm5,xmm7
+	movdqa    xmm3,xmm1
+	movdqa    xmm7,xmm5
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm1=tmp5L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm5=tmp5H
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]	; xmm3=tmp6L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF256_F050)]	; xmm7=tmp6H
+
+	paddd	xmm1,xmm2		; xmm1=data5L
+	paddd	xmm5,xmm6		; xmm5=data5H
+	paddd	xmm3, XMMWORD [wk(0)]	; xmm3=data3L
+	paddd	xmm7, XMMWORD [wk(1)]	; xmm7=data3H
+
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm1,DESCALE_P2
+	psrad	xmm5,DESCALE_P2
+	paddd	xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm1,xmm5		; xmm1=data5
+	packssdw  xmm3,xmm7		; xmm3=data3
+
+	movdqa	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jfsseflt-64.asm b/jpeg/simd/jfsseflt-64.asm
new file mode 100644
index 000000000000..07245d2ddba4
--- /dev/null
+++ b/jpeg/simd/jfsseflt-64.asm
@@ -0,0 +1,358 @@
+;
+; jfsseflt-64.asm - floating-point FDCT (64-bit SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44	; imm8 0x44: result = (%1[0], %1[1], %2[0], %2[1]), i.e. the low halves of both operands
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE	; imm8 0xEE: result = (%1[2], %1[3], %2[2], %2[3]), i.e. the high halves of both operands
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):	; each constant below is replicated into all 4 float lanes of one xmmword
+
+PD_0_382	times 4 dd  0.382683432365089771728460	; c6, where cK = cos(K*pi/16)
+PD_0_707	times 4 dd  0.707106781186547524400844	; c4 = sqrt(2)/2
+PD_0_541	times 4 dd  0.541196100146196984399723	; c2-c6
+PD_1_306	times 4 dd  1.306562964876376527856643	; c2+c6
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse (FAST_FLOAT * data)
+;
+
+; r10 = FAST_FLOAT * data
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+	push	rbp				; save the caller's rbp
+	mov	rax,rsp				; rax = original rbp (rsp right after the push)
+	sub	rsp, byte 4			; make room below the saved rbp (NOTE(review): only 4 bytes although the store below is 8 -- safe while rsp is 8-byte aligned)
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax			; keep the unaligned value here for "pop rsp" in the epilogue
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]			; reserve the WK_NUM xmmword work area below rbp
+	collect_args				; macro not defined in this file; presumably what places the data argument in r10 -- confirm in jsimdext.inc
+
+	; ---- Pass 1: process rows.
+
+	mov	rdx, r10	; (FAST_FLOAT *)
+	mov	rcx, DCTSIZE/4	; loop count: 4 rows are handled per iteration
+.rowloop:
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+	; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+	; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+	unpcklps xmm0,xmm1		; xmm0=(20 30 21 31)
+	unpckhps xmm4,xmm1		; xmm4=(22 32 23 33)
+	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+	unpcklps xmm2,xmm3		; xmm2=(24 34 25 35)
+	unpckhps xmm5,xmm3		; xmm5=(26 36 27 37)
+
+	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+	; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+	; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 32 23 33)
+	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(24 34 25 35)
+
+	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+	unpcklps xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps xmm4,xmm7		; xmm4=(02 12 03 13)
+	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+	unpcklps xmm1,xmm3		; xmm1=(04 14 05 15)
+	unpckhps xmm2,xmm3		; xmm2=(06 16 07 17)
+
+	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm0		; xmm6=(00 10 20 30)=data0
+	unpckhps2 xmm7,xmm0		; xmm7=(01 11 21 31)=data1
+	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+	unpcklps2 xmm2,xmm5		; xmm2=(06 16 26 36)=data6
+	unpckhps2 xmm3,xmm5		; xmm3=(07 17 27 37)=data7
+
+	movaps	xmm0,xmm7		; xmm0=data1 (copy)
+	movaps	xmm5,xmm6		; xmm5=data0 (copy)
+	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+
+	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 32 23 33)
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(24 34 25 35)
+	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(02 12 22 32)=data2
+	unpckhps2 xmm7,xmm2		; xmm7=(03 13 23 33)=data3
+	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm3		; xmm1=(04 14 24 34)=data4
+	unpckhps2 xmm6,xmm3		; xmm6=(05 15 25 35)=data5
+
+	movaps	xmm2,xmm7		; xmm2=data3 (copy)
+	movaps	xmm3,xmm4		; xmm3=data2 (copy)
+	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movaps	xmm1,xmm5		; xmm1=tmp0 (copy)
+	movaps	xmm6,xmm0		; xmm6=tmp1 (copy)
+	subps	xmm5,xmm7		; xmm5=tmp13
+	subps	xmm0,xmm4		; xmm0=tmp12
+	addps	xmm1,xmm7		; xmm1=tmp10
+	addps	xmm6,xmm4		; xmm6=tmp11
+
+	addps	xmm0,xmm5		; xmm0=tmp12+tmp13
+	mulps	xmm0,[rel PD_0_707] ; xmm0=z1
+
+	movaps	xmm7,xmm1		; xmm7=tmp10 (copy)
+	movaps	xmm4,xmm5		; xmm4=tmp13 (copy)
+	subps	xmm1,xmm6		; xmm1=data4
+	subps	xmm5,xmm0		; xmm5=data6
+	addps	xmm7,xmm6		; xmm7=data0
+	addps	xmm4,xmm0		; xmm4=data2
+
+	movaps	XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1	; data4
+	movaps	XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5	; data6
+	movaps	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7	; data0
+	movaps	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4	; data2
+
+	; -- Odd part
+
+	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+
+	addps	xmm2,xmm3		; xmm2=tmp10
+	addps	xmm3,xmm6		; xmm3=tmp11
+	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+
+	mulps	xmm3,[rel PD_0_707] ; xmm3=z3
+
+	movaps	xmm1,xmm2		; xmm1=tmp10
+	subps	xmm2,xmm6		; xmm2=tmp10-tmp12
+	mulps	xmm2,[rel PD_0_382] ; xmm2=z5
+	mulps	xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+	mulps	xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+	addps	xmm1,xmm2		; xmm1=z2
+	addps	xmm6,xmm2		; xmm6=z4
+
+	movaps	xmm5,xmm0		; xmm5=tmp7 (copy)
+	subps	xmm0,xmm3		; xmm0=z13
+	addps	xmm5,xmm3		; xmm5=z11
+
+	movaps	xmm7,xmm0		; xmm7=z13 (copy)
+	movaps	xmm4,xmm5		; xmm4=z11 (copy)
+	subps	xmm0,xmm1		; xmm0=data3
+	subps	xmm5,xmm6		; xmm5=data7
+	addps	xmm7,xmm1		; xmm7=data5
+	addps	xmm4,xmm6		; xmm4=data1
+
+	movaps	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0	; data3
+	movaps	XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5	; data7
+	movaps	XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7	; data5
+	movaps	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4	; data1
+
+	add	rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT	; advance to the next 4 rows
+	dec	rcx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	rdx, r10	; (FAST_FLOAT *)
+	mov	rcx, DCTSIZE/4	; loop count: 4 columns are handled per iteration
+.columnloop:
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+	; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+	; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+	unpcklps xmm0,xmm1		; xmm0=(02 03 12 13)
+	unpckhps xmm4,xmm1		; xmm4=(22 23 32 33)
+	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+	unpcklps xmm2,xmm3		; xmm2=(42 43 52 53)
+	unpckhps xmm5,xmm3		; xmm5=(62 63 72 73)
+
+	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+	; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+	; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 23 32 33)
+	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(42 43 52 53)
+
+	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+	unpcklps xmm6,xmm7		; xmm6=(00 01 10 11)
+	unpckhps xmm4,xmm7		; xmm4=(20 21 30 31)
+	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+	unpcklps xmm1,xmm3		; xmm1=(40 41 50 51)
+	unpckhps xmm2,xmm3		; xmm2=(60 61 70 71)
+
+	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm0		; xmm6=(00 01 02 03)=data0
+	unpckhps2 xmm7,xmm0		; xmm7=(10 11 12 13)=data1
+	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+	unpcklps2 xmm2,xmm5		; xmm2=(60 61 62 63)=data6
+	unpckhps2 xmm3,xmm5		; xmm3=(70 71 72 73)=data7
+
+	movaps	xmm0,xmm7		; xmm0=data1 (copy)
+	movaps	xmm5,xmm6		; xmm5=data0 (copy)
+	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+
+	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 23 32 33)
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53)
+	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(20 21 22 23)=data2
+	unpckhps2 xmm7,xmm2		; xmm7=(30 31 32 33)=data3
+	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm3		; xmm1=(40 41 42 43)=data4
+	unpckhps2 xmm6,xmm3		; xmm6=(50 51 52 53)=data5
+
+	movaps	xmm2,xmm7		; xmm2=data3 (copy)
+	movaps	xmm3,xmm4		; xmm3=data2 (copy)
+	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movaps	xmm1,xmm5		; xmm1=tmp0 (copy)
+	movaps	xmm6,xmm0		; xmm6=tmp1 (copy)
+	subps	xmm5,xmm7		; xmm5=tmp13
+	subps	xmm0,xmm4		; xmm0=tmp12
+	addps	xmm1,xmm7		; xmm1=tmp10
+	addps	xmm6,xmm4		; xmm6=tmp11
+
+	addps	xmm0,xmm5		; xmm0=tmp12+tmp13
+	mulps	xmm0,[rel PD_0_707] ; xmm0=z1
+
+	movaps	xmm7,xmm1		; xmm7=tmp10 (copy)
+	movaps	xmm4,xmm5		; xmm4=tmp13 (copy)
+	subps	xmm1,xmm6		; xmm1=data4
+	subps	xmm5,xmm0		; xmm5=data6
+	addps	xmm7,xmm6		; xmm7=data0
+	addps	xmm4,xmm0		; xmm4=data2
+
+	movaps	XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1	; data4
+	movaps	XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5	; data6
+	movaps	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7	; data0
+	movaps	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4	; data2
+
+	; -- Odd part
+
+	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+
+	addps	xmm2,xmm3		; xmm2=tmp10
+	addps	xmm3,xmm6		; xmm3=tmp11
+	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+
+	mulps	xmm3,[rel PD_0_707] ; xmm3=z3
+
+	movaps	xmm1,xmm2		; xmm1=tmp10
+	subps	xmm2,xmm6		; xmm2=tmp10-tmp12
+	mulps	xmm2,[rel PD_0_382] ; xmm2=z5
+	mulps	xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+	mulps	xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+	addps	xmm1,xmm2		; xmm1=z2
+	addps	xmm6,xmm2		; xmm6=z4
+
+	movaps	xmm5,xmm0		; xmm5=tmp7 (copy)
+	subps	xmm0,xmm3		; xmm0=z13
+	addps	xmm5,xmm3		; xmm5=z11
+
+	movaps	xmm7,xmm0		; xmm7=z13 (copy)
+	movaps	xmm4,xmm5		; xmm4=z11 (copy)
+	subps	xmm0,xmm1		; xmm0=data3
+	subps	xmm5,xmm6		; xmm5=data7
+	addps	xmm7,xmm1		; xmm7=data5
+	addps	xmm4,xmm6		; xmm4=data1
+
+	movaps	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0	; data3
+	movaps	XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5	; data7
+	movaps	XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7	; data5
+	movaps	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4	; data1
+
+	add	rdx, byte 4*SIZEOF_FAST_FLOAT	; advance to the next 4 columns
+	dec	rcx
+	jnz	near .columnloop
+
+	uncollect_args	; macro not defined in this file; presumably the counterpart of collect_args -- confirm in jsimdext.inc
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp		; restore the caller's rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jfsseflt.asm b/jpeg/simd/jfsseflt.asm
new file mode 100644
index 000000000000..bc54cccdeedb
--- /dev/null
+++ b/jpeg/simd/jfsseflt.asm
@@ -0,0 +1,370 @@
+;
+; jfsseflt.asm - floating-point FDCT (SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44	; imm8 0x44: result = (%1[0], %1[1], %2[0], %2[1]), i.e. the low halves of both operands
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE	; imm8 0xEE: result = (%1[2], %1[3], %2[2], %2[3]), i.e. the high halves of both operands
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):	; each constant below is replicated into all 4 float lanes of one xmmword
+
+PD_0_382	times 4 dd  0.382683432365089771728460	; c6, where cK = cos(K*pi/16)
+PD_0_707	times 4 dd  0.707106781186547524400844	; c4 = sqrt(2)/2
+PD_0_541	times 4 dd  0.541196100146196984399723	; c2-c6
+PD_1_306	times 4 dd  1.306562964876376527856643	; c2+c6
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse (FAST_FLOAT * data)
+;
+
+%define data(b)		(b)+8		; FAST_FLOAT * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+	push	ebp				; save the caller's ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; make room below the saved ebp for the 4-byte slot written below
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; keep the unaligned value here for "pop esp" in the epilogue
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve the WK_NUM xmmword work area below ebp
+	pushpic	ebx				; macro not defined in this file; presumably saves ebx for PIC builds (paired with poppic below) -- confirm in jsimdext.inc
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/4			; loop count: 4 rows are handled per iteration
+	alignx	16,7				; macro not defined in this file; presumably aligns the loop entry -- confirm in jsimdext.inc
+.rowloop:
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+	; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+	unpcklps xmm0,xmm1		; xmm0=(20 30 21 31)
+	unpckhps xmm4,xmm1		; xmm4=(22 32 23 33)
+	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+	unpcklps xmm2,xmm3		; xmm2=(24 34 25 35)
+	unpckhps xmm5,xmm3		; xmm5=(26 36 27 37)
+
+	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+	; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 32 23 33)
+	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(24 34 25 35)
+
+	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+	unpcklps xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps xmm4,xmm7		; xmm4=(02 12 03 13)
+	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+	unpcklps xmm1,xmm3		; xmm1=(04 14 05 15)
+	unpckhps xmm2,xmm3		; xmm2=(06 16 07 17)
+
+	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm0		; xmm6=(00 10 20 30)=data0
+	unpckhps2 xmm7,xmm0		; xmm7=(01 11 21 31)=data1
+	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+	unpcklps2 xmm2,xmm5		; xmm2=(06 16 26 36)=data6
+	unpckhps2 xmm3,xmm5		; xmm3=(07 17 27 37)=data7
+
+	movaps	xmm0,xmm7		; xmm0=data1 (copy)
+	movaps	xmm5,xmm6		; xmm5=data0 (copy)
+	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+
+	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 32 23 33)
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(24 34 25 35)
+	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(02 12 22 32)=data2
+	unpckhps2 xmm7,xmm2		; xmm7=(03 13 23 33)=data3
+	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm3		; xmm1=(04 14 24 34)=data4
+	unpckhps2 xmm6,xmm3		; xmm6=(05 15 25 35)=data5
+
+	movaps	xmm2,xmm7		; xmm2=data3 (copy)
+	movaps	xmm3,xmm4		; xmm3=data2 (copy)
+	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movaps	xmm1,xmm5		; xmm1=tmp0 (copy)
+	movaps	xmm6,xmm0		; xmm6=tmp1 (copy)
+	subps	xmm5,xmm7		; xmm5=tmp13
+	subps	xmm0,xmm4		; xmm0=tmp12
+	addps	xmm1,xmm7		; xmm1=tmp10
+	addps	xmm6,xmm4		; xmm6=tmp11
+
+	addps	xmm0,xmm5		; xmm0=tmp12+tmp13
+	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+	movaps	xmm7,xmm1		; xmm7=tmp10 (copy)
+	movaps	xmm4,xmm5		; xmm4=tmp13 (copy)
+	subps	xmm1,xmm6		; xmm1=data4
+	subps	xmm5,xmm0		; xmm5=data6
+	addps	xmm7,xmm6		; xmm7=data0
+	addps	xmm4,xmm0		; xmm4=data2
+
+	movaps	XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1	; data4
+	movaps	XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5	; data6
+	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7	; data0
+	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4	; data2
+
+	; -- Odd part
+
+	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+
+	addps	xmm2,xmm3		; xmm2=tmp10
+	addps	xmm3,xmm6		; xmm3=tmp11
+	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+
+	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+	movaps	xmm1,xmm2		; xmm1=tmp10
+	subps	xmm2,xmm6		; xmm2=tmp10-tmp12
+	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+	addps	xmm1,xmm2		; xmm1=z2
+	addps	xmm6,xmm2		; xmm6=z4
+
+	movaps	xmm5,xmm0		; xmm5=tmp7 (copy)
+	subps	xmm0,xmm3		; xmm0=z13
+	addps	xmm5,xmm3		; xmm5=z11
+
+	movaps	xmm7,xmm0		; xmm7=z13 (copy)
+	movaps	xmm4,xmm5		; xmm4=z11 (copy)
+	subps	xmm0,xmm1		; xmm0=data3
+	subps	xmm5,xmm6		; xmm5=data7
+	addps	xmm7,xmm1		; xmm7=data5
+	addps	xmm4,xmm6		; xmm4=data1
+
+	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0	; data3
+	movaps	XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5	; data7
+	movaps	XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7	; data5
+	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4	; data1
+
+	add	edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT	; advance to the next 4 rows
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/4			; loop count: 4 columns are handled per iteration
+	alignx	16,7				; macro not defined in this file; presumably aligns the loop entry -- confirm in jsimdext.inc
+.columnloop:
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+	; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+	unpcklps xmm0,xmm1		; xmm0=(02 03 12 13)
+	unpckhps xmm4,xmm1		; xmm4=(22 23 32 33)
+	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+	unpcklps xmm2,xmm3		; xmm2=(42 43 52 53)
+	unpckhps xmm5,xmm3		; xmm5=(62 63 72 73)
+
+	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+	; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 23 32 33)
+	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(42 43 52 53)
+
+	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+	unpcklps xmm6,xmm7		; xmm6=(00 01 10 11)
+	unpckhps xmm4,xmm7		; xmm4=(20 21 30 31)
+	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+	unpcklps xmm1,xmm3		; xmm1=(40 41 50 51)
+	unpckhps xmm2,xmm3		; xmm2=(60 61 70 71)
+
+	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm0		; xmm6=(00 01 02 03)=data0
+	unpckhps2 xmm7,xmm0		; xmm7=(10 11 12 13)=data1
+	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+	unpcklps2 xmm2,xmm5		; xmm2=(60 61 62 63)=data6
+	unpckhps2 xmm3,xmm5		; xmm3=(70 71 72 73)=data7
+
+	movaps	xmm0,xmm7		; xmm0=data1 (copy)
+	movaps	xmm5,xmm6		; xmm5=data0 (copy)
+	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+
+	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 23 32 33)
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53)
+	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(20 21 22 23)=data2
+	unpckhps2 xmm7,xmm2		; xmm7=(30 31 32 33)=data3
+	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm3		; xmm1=(40 41 42 43)=data4
+	unpckhps2 xmm6,xmm3		; xmm6=(50 51 52 53)=data5
+
+	movaps	xmm2,xmm7		; xmm2=data3 (copy)
+	movaps	xmm3,xmm4		; xmm3=data2 (copy)
+	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movaps	xmm1,xmm5		; xmm1=tmp0 (copy)
+	movaps	xmm6,xmm0		; xmm6=tmp1 (copy)
+	subps	xmm5,xmm7		; xmm5=tmp13
+	subps	xmm0,xmm4		; xmm0=tmp12
+	addps	xmm1,xmm7		; xmm1=tmp10
+	addps	xmm6,xmm4		; xmm6=tmp11
+
+	addps	xmm0,xmm5		; xmm0=tmp12+tmp13
+	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+	movaps	xmm7,xmm1		; xmm7=tmp10 (copy)
+	movaps	xmm4,xmm5		; xmm4=tmp13 (copy)
+	subps	xmm1,xmm6		; xmm1=data4
+	subps	xmm5,xmm0		; xmm5=data6
+	addps	xmm7,xmm6		; xmm7=data0
+	addps	xmm4,xmm0		; xmm4=data2
+
+	movaps	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1	; data4
+	movaps	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5	; data6
+	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7	; data0
+	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4	; data2
+
+	; -- Odd part
+
+	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+
+	addps	xmm2,xmm3		; xmm2=tmp10
+	addps	xmm3,xmm6		; xmm3=tmp11
+	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+
+	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+	movaps	xmm1,xmm2		; xmm1=tmp10
+	subps	xmm2,xmm6		; xmm2=tmp10-tmp12
+	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+	addps	xmm1,xmm2		; xmm1=z2
+	addps	xmm6,xmm2		; xmm6=z4
+
+	movaps	xmm5,xmm0		; xmm5=tmp7 (copy)
+	subps	xmm0,xmm3		; xmm0=z13
+	addps	xmm5,xmm3		; xmm5=z11
+
+	movaps	xmm7,xmm0		; xmm7=z13 (copy)
+	movaps	xmm4,xmm5		; xmm4=z11 (copy)
+	subps	xmm0,xmm1		; xmm0=data3
+	subps	xmm5,xmm6		; xmm5=data7
+	addps	xmm7,xmm1		; xmm7=data5
+	addps	xmm4,xmm6		; xmm4=data1
+
+	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0	; data3
+	movaps	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5	; data7
+	movaps	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7	; data5
+	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4	; data1
+
+	add	edx, byte 4*SIZEOF_FAST_FLOAT	; advance to the next 4 columns
+	dec	ecx
+	jnz	near .columnloop
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx		; macro not defined in this file; presumably the counterpart of pushpic above -- confirm in jsimdext.inc
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp		; restore the caller's ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/ji3dnflt.asm b/jpeg/simd/ji3dnflt.asm
new file mode 100644
index 000000000000..dc2076f412ad
--- /dev/null
+++ b/jpeg/simd/ji3dnflt.asm
@@ -0,0 +1,452 @@
+;
+; ji3dnflt.asm - floating-point IDCT (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+; Constant pool for jsimd_idct_float_3dnow; each float is stored once per MMX lane.
+	alignz	16
+	global	EXTN(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):
+
+PD_1_414:	dd 1.414213562373095048801689, 1.414213562373095048801689
+PD_1_847:	dd 1.847759065022573512256366, 1.847759065022573512256366
+PD_1_082:	dd 1.082392200292393968799446, 1.082392200292393968799446
+PD_2_613:	dd 2.613125929752753055713286, 2.613125929752753055713286
+PD_RNDINT_MAGIC: dd 100663296.0, 100663296.0	; (float)(0x00C00000 << 3), added to round and scale by 1/8
+PB_CENTERJSAMP:	db CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+; Pass 1 handles two columns per iteration, pass 2 two rows per iteration.
+; GLOBAL(void)
+; jsimd_idct_float_3dnow (void * dct_table, JCOEFPTR coef_block,
+;                         JSAMPARRAY output_buf, JDIMENSION output_col)
+; Saves/restores ebx, esi, edi, ebp; uses eax, ecx, edx and mm0-mm7 as scratch.
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0		; slot written by the prologue with the unaligned frame base
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_float_3dnow)
+
+EXTN(jsimd_idct_float_3dnow):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (points at the saved ebp; args are at eax+8..)
+	sub	esp, byte 4			; make room for the [original_ebp] slot
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [original_ebp] = eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]		; reserve wk[] and workspace[] below ebp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]	; NOTE(review): omitted because eax is still set from the prologue; relies on get_GOT preserving eax
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE/2				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT		; nonzero AC term -> full IDCT
+
+	pushpic	ebx		; save GOT address
+	mov	ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	mov	eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	or	ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	or	ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	or	eax,ebx
+	poppic	ebx		; restore GOT address
+	jnz	short .columnDCT		; nonzero AC term -> full IDCT
+
+	; -- AC terms all zero
+
+	movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd mm0,mm0			; duplicate each word into a dword
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; arithmetic shift: sign-extend words to dwords
+	pi2fd     mm0,mm0			; packed int32 -> packed float
+
+	pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movd      mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movd      mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movd      mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd mm0,mm0
+	punpcklwd mm1,mm1
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; sign-extend words to dwords
+	psrad     mm1,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm0,mm0			; packed int32 -> packed float
+	pi2fd     mm1,mm1
+
+	pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; mm0=in0 (dequantized)
+	pfmul     mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; mm1=in2 (dequantized)
+
+	punpcklwd mm2,mm2
+	punpcklwd mm3,mm3
+	psrad     mm2,(DWORD_BIT-WORD_BIT)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm2,mm2
+	pi2fd     mm3,mm3
+
+	pfmul     mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; mm2=in4 (dequantized)
+	pfmul     mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; mm3=in6 (dequantized)
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pfsub	mm0,mm2			; mm0=tmp11
+	pfsub	mm1,mm3			; mm1=in2-in6
+	pfadd	mm4,mm2			; mm4=tmp10
+	pfadd	mm5,mm3			; mm5=tmp13
+
+	pfmul	mm1,[GOTOFF(ebx,PD_1_414)]	; mm1=(in2-in6)*1.414213562
+	pfsub	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm5			; mm4=tmp3
+	pfsub	mm0,mm1			; mm0=tmp2
+	pfadd	mm6,mm5			; mm6=tmp0
+	pfadd	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; tmp3
+	movq	MMWORD [wk(0)], mm0	; tmp2
+
+	; -- Odd part
+
+	movd      mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movd      mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movd      mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movd      mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd mm2,mm2
+	punpcklwd mm3,mm3
+	psrad     mm2,(DWORD_BIT-WORD_BIT)	; sign-extend words to dwords
+	psrad     mm3,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm2,mm2			; packed int32 -> packed float
+	pi2fd     mm3,mm3
+
+	pfmul     mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; mm2=in1 (dequantized)
+	pfmul     mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; mm3=in3 (dequantized)
+
+	punpcklwd mm5,mm5
+	punpcklwd mm1,mm1
+	psrad     mm5,(DWORD_BIT-WORD_BIT)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm5,mm5
+	pi2fd     mm1,mm1
+
+	pfmul     mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; mm5=in5 (dequantized)
+	pfmul     mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; mm1=in7 (dequantized)
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	pfadd	mm2,mm1			; mm2=z11
+	pfadd	mm5,mm3			; mm5=z13
+	pfsub	mm4,mm1			; mm4=z12
+	pfsub	mm0,mm3			; mm0=z10
+
+	movq	mm1,mm2
+	pfsub	mm2,mm5			; mm2=z11-z13
+	pfadd	mm1,mm5			; mm1=tmp7
+
+	pfmul	mm2,[GOTOFF(ebx,PD_1_414)]	; mm2=tmp11
+
+	movq	mm3,mm0
+	pfadd	mm0,mm4			; mm0=z10+z12
+	pfmul	mm0,[GOTOFF(ebx,PD_1_847)]	; mm0=z5
+	pfmul	mm3,[GOTOFF(ebx,PD_2_613)]	; mm3=(z10 * 2.613125930)
+	pfmul	mm4,[GOTOFF(ebx,PD_1_082)]	; mm4=(z12 * 1.082392200)
+	pfsubr	mm3,mm0			; mm3=tmp12
+	pfsub	mm4,mm0			; mm4=tmp10
+
+	; -- Final output stage
+
+	pfsub	mm3,mm1			; mm3=tmp6
+	movq	mm5,mm6
+	movq	mm0,mm7
+	pfadd	mm6,mm1			; mm6=data0=(00 01)
+	pfadd	mm7,mm3			; mm7=data1=(10 11)
+	pfsub	mm5,mm1			; mm5=data7=(70 71)
+	pfsub	mm0,mm3			; mm0=data6=(60 61)
+	pfsub	mm2,mm3			; mm2=tmp5
+
+	movq      mm1,mm6		; transpose coefficients
+	punpckldq mm6,mm7		; mm6=(00 10)
+	punpckhdq mm1,mm7		; mm1=(01 11)
+	movq      mm3,mm0		; transpose coefficients
+	punpckldq mm0,mm5		; mm0=(60 70)
+	punpckhdq mm3,mm5		; mm3=(61 71)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
+	movq	mm5, MMWORD [wk(1)]	; mm5=tmp3
+
+	pfadd	mm4,mm2			; mm4=tmp4
+	movq	mm6,mm7
+	movq	mm1,mm5
+	pfadd	mm7,mm2			; mm7=data2=(20 21)
+	pfadd	mm5,mm4			; mm5=data4=(40 41)
+	pfsub	mm6,mm2			; mm6=data5=(50 51)
+	pfsub	mm1,mm4			; mm1=data3=(30 31)
+
+	movq      mm0,mm7		; transpose coefficients
+	punpckldq mm7,mm1		; mm7=(20 30)
+	punpckhdq mm0,mm1		; mm0=(21 31)
+	movq      mm3,mm5		; transpose coefficients
+	punpckldq mm5,mm6		; mm5=(40 50)
+	punpckhdq mm3,mm6		; mm3=(41 51)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+
+.nextcolumn:
+	add	esi, byte 2*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 2*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+	; (esi has advanced by DCTSIZE/2 * 2 coefficients during pass 1)
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/2				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pfsub	mm0,mm2			; mm0=tmp11
+	pfsub	mm1,mm3			; mm1=(block 2)-(block 6)
+	pfadd	mm4,mm2			; mm4=tmp10
+	pfadd	mm5,mm3			; mm5=tmp13
+
+	pfmul	mm1,[GOTOFF(ebx,PD_1_414)]
+	pfsub	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm5			; mm4=tmp3
+	pfsub	mm0,mm1			; mm0=tmp2
+	pfadd	mm6,mm5			; mm6=tmp0
+	pfadd	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; tmp3
+	movq	MMWORD [wk(0)], mm0	; tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	pfadd	mm2,mm1			; mm2=z11
+	pfadd	mm5,mm3			; mm5=z13
+	pfsub	mm4,mm1			; mm4=z12
+	pfsub	mm0,mm3			; mm0=z10
+
+	movq	mm1,mm2
+	pfsub	mm2,mm5			; mm2=z11-z13
+	pfadd	mm1,mm5			; mm1=tmp7
+
+	pfmul	mm2,[GOTOFF(ebx,PD_1_414)]	; mm2=tmp11
+
+	movq	mm3,mm0
+	pfadd	mm0,mm4			; mm0=z10+z12
+	pfmul	mm0,[GOTOFF(ebx,PD_1_847)]	; mm0=z5
+	pfmul	mm3,[GOTOFF(ebx,PD_2_613)]	; mm3=(z10 * 2.613125930)
+	pfmul	mm4,[GOTOFF(ebx,PD_1_082)]	; mm4=(z12 * 1.082392200)
+	pfsubr	mm3,mm0			; mm3=tmp12
+	pfsub	mm4,mm0			; mm4=tmp10
+
+	; -- Final output stage
+
+	pfsub	mm3,mm1			; mm3=tmp6
+	movq	mm5,mm6
+	movq	mm0,mm7
+	pfadd	mm6,mm1			; mm6=data0=(00 10)
+	pfadd	mm7,mm3			; mm7=data1=(01 11)
+	pfsub	mm5,mm1			; mm5=data7=(07 17)
+	pfsub	mm0,mm3			; mm0=data6=(06 16)
+	pfsub	mm2,mm3			; mm2=tmp5
+
+	movq	mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; mm1=[PD_RNDINT_MAGIC]
+	pcmpeqd	mm3,mm3
+	psrld	mm3,WORD_BIT		; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+	pfadd	mm6,mm1			; mm6=roundint(data0/8)=(00 ** 10 **)
+	pfadd	mm7,mm1			; mm7=roundint(data1/8)=(01 ** 11 **)
+	pfadd	mm0,mm1			; mm0=roundint(data6/8)=(06 ** 16 **)
+	pfadd	mm5,mm1			; mm5=roundint(data7/8)=(07 ** 17 **)
+
+	pand	mm6,mm3			; mm6=(00 -- 10 --)
+	pslld	mm7,WORD_BIT		; mm7=(-- 01 -- 11)
+	pand	mm0,mm3			; mm0=(06 -- 16 --)
+	pslld	mm5,WORD_BIT		; mm5=(-- 07 -- 17)
+	por	mm6,mm7			; mm6=(00 01 10 11)
+	por	mm0,mm5			; mm0=(06 07 16 17)
+
+	movq	mm1, MMWORD [wk(0)]	; mm1=tmp2
+	movq	mm3, MMWORD [wk(1)]	; mm3=tmp3
+
+	pfadd	mm4,mm2			; mm4=tmp4
+	movq	mm7,mm1
+	movq	mm5,mm3
+	pfadd	mm1,mm2			; mm1=data2=(02 12)
+	pfadd	mm3,mm4			; mm3=data4=(04 14)
+	pfsub	mm7,mm2			; mm7=data5=(05 15)
+	pfsub	mm5,mm4			; mm5=data3=(03 13)
+
+	movq	mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; mm2=[PD_RNDINT_MAGIC]
+	pcmpeqd	mm4,mm4
+	psrld	mm4,WORD_BIT		; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+	pfadd	mm3,mm2			; mm3=roundint(data4/8)=(04 ** 14 **)
+	pfadd	mm7,mm2			; mm7=roundint(data5/8)=(05 ** 15 **)
+	pfadd	mm1,mm2			; mm1=roundint(data2/8)=(02 ** 12 **)
+	pfadd	mm5,mm2			; mm5=roundint(data3/8)=(03 ** 13 **)
+
+	pand	mm3,mm4			; mm3=(04 -- 14 --)
+	pslld	mm7,WORD_BIT		; mm7=(-- 05 -- 15)
+	pand	mm1,mm4			; mm1=(02 -- 12 --)
+	pslld	mm5,WORD_BIT		; mm5=(-- 03 -- 13)
+	por	mm3,mm7			; mm3=(04 05 14 15)
+	por	mm1,mm5			; mm1=(02 03 12 13)
+
+	movq      mm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm2=[PB_CENTERJSAMP]
+
+	packsswb  mm6,mm3		; mm6=(00 01 10 11 04 05 14 15)
+	packsswb  mm1,mm0		; mm1=(02 03 12 13 06 07 16 17)
+	paddb     mm6,mm2		; add CENTERJSAMPLE to every byte
+	paddb     mm1,mm2
+
+	movq      mm4,mm6		; transpose coefficients(phase 2)
+	punpcklwd mm6,mm1		; mm6=(00 01 02 03 10 11 12 13)
+	punpckhwd mm4,mm1		; mm4=(04 05 06 07 14 15 16 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 3)
+	punpckldq mm6,mm4		; mm6=(00 01 02 03 04 05 06 07)
+	punpckhdq mm7,mm4		; mm7=(10 11 12 13 14 15 16 17)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; edx = first output row pointer
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; ebx = second output row pointer
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6	; eax = output_col
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 2*SIZEOF_FAST_FLOAT	; wsptr
+	add	edi, byte 2*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jimmxfst.asm b/jpeg/simd/jimmxfst.asm
new file mode 100644
index 000000000000..3b055727d1f8
--- /dev/null
+++ b/jpeg/simd/jimmxfst.asm
@@ -0,0 +1,500 @@
+;
+; jimmxfst.asm - fast integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+%define PASS1_BITS	2	; must equal IFAST_SCALE_BITS (checked just below)
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082	equ	277		; FIX(1.082392200)
+F_1_414	equ	362		; FIX(1.414213562)
+F_1_847	equ	473		; FIX(1.847759065)
+F_2_613	equ	669		; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; right shift by n with rounding
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; Multiplier operands are shifted left by PRE_MULTIPLY_SCALE_BITS (<= 2, to avoid
+; overflow); pmulhw needs CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16.
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - CONST_BITS - PRE_MULTIPLY_SCALE_BITS)
+
+	alignz	16
+	global	EXTN(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414:	times 4 dw (F_1_414 << CONST_SHIFT)
+PW_F1847:	times 4 dw (F_1_847 << CONST_SHIFT)
+PW_MF1613:	times 4 dw (-F_1_613 << CONST_SHIFT)
+PW_F1082:	times 4 dw (F_1_082 << CONST_SHIFT)
+PB_CENTERJSAMP:	times 8 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+; Pass 1 handles four columns per iteration, pass 2 four rows per iteration.
+; GLOBAL(void)
+; jsimd_idct_ifast_mmx (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+; Saves/restores ebx, esi, edi, ebp; uses eax, ecx, edx and mm0-mm7 as scratch.
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0		; slot written by the prologue with the unaligned frame base
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_ifast_mmx)
+
+EXTN(jsimd_idct_ifast_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (points at the saved ebp; args are at eax+8..)
+	sub	esp, byte 4			; make room for the [original_ebp] slot
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [original_ebp] = eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]		; reserve wk[] and workspace[] below ebp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]	; NOTE(review): omitted because eax is still set from the prologue; relies on get_GOT preserving eax
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT		; quick reject: nonzero AC term -> full IDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0			; mm1 = OR of blocks 1..7
+	packsswb mm1,mm1		; saturating pack: a nonzero word stays a nonzero byte
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT		; nonzero AC term -> full IDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; dequantize
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; mm0=in0 (dequantized)
+	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; mm1=in2 (dequantized)
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; mm2=in4 (dequantized)
+	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; mm3=in6 (dequantized)
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	psubw	mm0,mm2			; mm0=tmp11
+	psubw	mm1,mm3			; mm1=in2-in6
+	paddw	mm4,mm2			; mm4=tmp10
+	paddw	mm5,mm3			; mm5=tmp13
+
+	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	psubw	mm4,mm5			; mm4=tmp3
+	psubw	mm0,mm1			; mm0=tmp2
+	paddw	mm6,mm5			; mm6=tmp0
+	paddw	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
+	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; mm2=in1 (dequantized)
+	pmullw	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; mm3=in3 (dequantized)
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; mm5=in5 (dequantized)
+	pmullw	mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; mm1=in7 (dequantized)
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	psubw	mm2,mm1			; mm2=z12
+	psubw	mm5,mm3			; mm5=z10
+	paddw	mm4,mm1			; mm4=z11
+	paddw	mm0,mm3			; mm0=z13
+
+	movq	mm1,mm5			; mm1=z10(unscaled)
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
+
+	movq	mm3,mm4
+	psubw	mm4,mm0			; mm4=z11-z13
+	paddw	mm3,mm0			; mm3=tmp7
+
+	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movq	mm0,mm5
+	paddw	mm5,mm2			; mm5=z10+z12 (both pre-scaled)
+	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
+	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]	; mm0=-1.613125930*z10
+	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]	; mm2=1.082392200*z12
+	psubw	mm0,mm1			; mm0=-1.613125930*z10-z10
+	psubw	mm2,mm5			; mm2=tmp10
+	paddw	mm0,mm5			; mm0=tmp12
+
+	; -- Final output stage
+
+	psubw	mm0,mm3			; mm0=tmp6
+	movq	mm1,mm6
+	movq	mm5,mm7
+	paddw	mm6,mm3			; mm6=data0=(00 01 02 03)
+	paddw	mm7,mm0			; mm7=data1=(10 11 12 13)
+	psubw	mm1,mm3			; mm1=data7=(70 71 72 73)
+	psubw	mm5,mm0			; mm5=data6=(60 61 62 63)
+	psubw	mm4,mm0			; mm4=tmp5
+
+	movq      mm3,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm3,mm7		; mm3=(02 12 03 13)
+	movq      mm0,mm5		; transpose coefficients(phase 1)
+	punpcklwd mm5,mm1		; mm5=(60 70 61 71)
+	punpckhwd mm0,mm1		; mm0=(62 72 63 73)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
+	movq	mm1, MMWORD [wk(1)]	; mm1=tmp3
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(60 70 61 71)
+	movq	MMWORD [wk(1)], mm0	; wk(1)=(62 72 63 73)
+
+	paddw	mm2,mm4			; mm2=tmp4
+	movq	mm5,mm7
+	movq	mm0,mm1
+	paddw	mm7,mm4			; mm7=data2=(20 21 22 23)
+	paddw	mm1,mm2			; mm1=data4=(40 41 42 43)
+	psubw	mm5,mm4			; mm5=data5=(50 51 52 53)
+	psubw	mm0,mm2			; mm0=data3=(30 31 32 33)
+
+	movq      mm4,mm7		; transpose coefficients(phase 1)
+	punpcklwd mm7,mm0		; mm7=(20 30 21 31)
+	punpckhwd mm4,mm0		; mm4=(22 32 23 33)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm5		; mm1=(40 50 41 51)
+	punpckhwd mm2,mm5		; mm2=(42 52 43 53)
+
+	movq      mm0,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm7		; mm6=(00 10 20 30)
+	punpckhdq mm0,mm7		; mm0=(01 11 21 31)
+	movq      mm5,mm3		; transpose coefficients(phase 2)
+	punpckldq mm3,mm4		; mm3=(02 12 22 32)
+	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=(60 70 61 71)
+	movq	mm4, MMWORD [wk(1)]	; mm4=(62 72 63 73)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm7		; mm1=(40 50 60 70)
+	punpckhdq mm6,mm7		; mm6=(41 51 61 71)
+	movq      mm0,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm4		; mm2=(42 52 62 72)
+	punpckhdq mm0,mm4		; mm0=(43 53 63 73)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_IFAST_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	psubw	mm0,mm2			; mm0=tmp11
+	psubw	mm1,mm3			; mm1=(block 2)-(block 6)
+	paddw	mm4,mm2			; mm4=tmp10
+	paddw	mm5,mm3			; mm5=tmp13
+
+	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	psubw	mm4,mm5			; mm4=tmp3
+	psubw	mm0,mm1			; mm0=tmp2
+	paddw	mm6,mm5			; mm6=tmp0
+	paddw	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
+	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	psubw	mm2,mm1			; mm2=z12
+	psubw	mm5,mm3			; mm5=z10
+	paddw	mm4,mm1			; mm4=z11
+	paddw	mm0,mm3			; mm0=z13
+
+	movq	mm1,mm5			; mm1=z10(unscaled)
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
+
+	movq	mm3,mm4
+	psubw	mm4,mm0			; mm4=z11-z13
+	paddw	mm3,mm0			; mm3=tmp7
+
+	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movq	mm0,mm5
+	paddw	mm5,mm2			; mm5=z10+z12 (both pre-scaled)
+	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
+	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]	; mm0=-1.613125930*z10
+	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]	; mm2=1.082392200*z12
+	psubw	mm0,mm1			; mm0=-1.613125930*z10-z10
+	psubw	mm2,mm5			; mm2=tmp10
+	paddw	mm0,mm5			; mm0=tmp12
+
+	; -- Final output stage
+
+	psubw	mm0,mm3			; mm0=tmp6
+	movq	mm1,mm6
+	movq	mm5,mm7
+	paddw	mm6,mm3			; mm6=data0=(00 10 20 30)
+	paddw	mm7,mm0			; mm7=data1=(01 11 21 31)
+	psraw	mm6,(PASS1_BITS+3)	; descale
+	psraw	mm7,(PASS1_BITS+3)	; descale
+	psubw	mm1,mm3			; mm1=data7=(07 17 27 37)
+	psubw	mm5,mm0			; mm5=data6=(06 16 26 36)
+	psraw	mm1,(PASS1_BITS+3)	; descale
+	psraw	mm5,(PASS1_BITS+3)	; descale
+	psubw	mm4,mm0			; mm4=tmp5
+
+	packsswb  mm6,mm5		; mm6=(00 10 20 30 06 16 26 36)
+	packsswb  mm7,mm1		; mm7=(01 11 21 31 07 17 27 37)
+
+	movq	mm3, MMWORD [wk(0)]	; mm3=tmp2
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp3
+
+	paddw	mm2,mm4			; mm2=tmp4
+	movq	mm5,mm3
+	movq	mm1,mm0
+	paddw	mm3,mm4			; mm3=data2=(02 12 22 32)
+	paddw	mm0,mm2			; mm0=data4=(04 14 24 34)
+	psraw	mm3,(PASS1_BITS+3)	; descale
+	psraw	mm0,(PASS1_BITS+3)	; descale
+	psubw	mm5,mm4			; mm5=data5=(05 15 25 35)
+	psubw	mm1,mm2			; mm1=data3=(03 13 23 33)
+	psraw	mm5,(PASS1_BITS+3)	; descale
+	psraw	mm1,(PASS1_BITS+3)	; descale
+
+	movq      mm4,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm4=[PB_CENTERJSAMP]
+
+	packsswb  mm3,mm0		; mm3=(02 12 22 32 04 14 24 34)
+	packsswb  mm1,mm5		; mm1=(03 13 23 33 05 15 25 35)
+
+	paddb     mm6,mm4		; add CENTERJSAMPLE to every byte
+	paddb     mm7,mm4
+	paddb     mm3,mm4
+	paddb     mm1,mm4
+
+	movq      mm2,mm6		; transpose coefficients(phase 1)
+	punpcklbw mm6,mm7		; mm6=(00 01 10 11 20 21 30 31)
+	punpckhbw mm2,mm7		; mm2=(06 07 16 17 26 27 36 37)
+	movq      mm0,mm3		; transpose coefficients(phase 1)
+	punpcklbw mm3,mm1		; mm3=(02 03 12 13 22 23 32 33)
+	punpckhbw mm0,mm1		; mm0=(04 05 14 15 24 25 34 35)
+
+	movq      mm5,mm6		; transpose coefficients(phase 2)
+	punpcklwd mm6,mm3		; mm6=(00 01 02 03 10 11 12 13)
+	punpckhwd mm5,mm3		; mm5=(20 21 22 23 30 31 32 33)
+	movq      mm4,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm2		; mm0=(04 05 06 07 14 15 16 17)
+	punpckhwd mm4,mm2		; mm4=(24 25 26 27 34 35 36 37)
+
+	movq      mm7,mm6		; transpose coefficients(phase 3)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03 04 05 06 07)
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13 14 15 16 17)
+	movq      mm1,mm5		; transpose coefficients(phase 3)
+	punpckldq mm5,mm4		; mm5=(20 21 22 23 24 25 26 27)
+	punpckhdq mm1,mm4		; mm1=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; edx = first output row pointer
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; ebx = second output row pointer
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6	; eax = output_col
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]	; edx = third output row pointer
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]	; ebx = fourth output row pointer
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jimmxint.asm b/jpeg/simd/jimmxint.asm
new file mode 100644
index 000000000000..7b52fae34eaa
--- /dev/null
+++ b/jpeg/simd/jimmxint.asm
@@ -0,0 +1,852 @@
+;
+; jimmxint.asm - accurate integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13		; fraction bits of the F_* constants
+%define PASS1_BITS	2		; extra precision carried from pass 1 into pass 2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)		; right shift ending pass 1
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)	; right shift ending pass 2
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so round integers holding x*2^30 instead.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+; PW_*: word-pair multipliers for pmaddwd; PD_*: rounding terms; PB_*: output level shift.
+PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)	; added before the pass-1 right shift
+PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)	; added before the pass-2 right shift
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE	; added to every packed output byte
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_mmx (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0		; slot where the prologue stores the unaligned frame pointer
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		12		; wk(0)..wk(11) are used as spill slots
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_islow_mmx)
+
+EXTN(jsimd_idct_islow_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (frame base before alignment)
+	sub	esp, byte 4			; make room to store original ebp
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; kept at [original_ebp] for pass 2 and the epilogue
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]		; esp = base of workspace[], which lies below wk[]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]	; left disabled: the code below uses eax as set in the prologue
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr: 4 columns per iteration
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]	; cheap pre-test before the full AC scan
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT		; nonzero: run the full column IDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0			; mm1 = OR of rows 1..7 of these 4 columns
+	packsswb mm1,mm1		; fold to 32 bits; a nonzero word stays nonzero
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm0=in0 (dequantized)
+
+	psllw	mm0,PASS1_BITS		; same scale as the full pass-1 output
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0	; each column's DC fills both halves of one workspace row
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm0=in0
+	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm1=in2
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm2=in4
+	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm3=in6
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movq      mm4,mm1		; mm1=in2=z2
+	movq      mm5,mm1
+	punpcklwd mm4,mm3		; mm3=in6=z3
+	punpckhwd mm5,mm3
+	movq      mm1,mm4
+	movq      mm3,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
+
+	movq      mm6,mm0
+	paddw     mm0,mm2		; mm0=in0+in4
+	psubw     mm6,mm2		; mm6=in0-in4
+
+	pxor      mm7,mm7		; zero first so the unpack leaves each word in the high half of a dword
+	pxor      mm2,mm2
+	punpcklwd mm7,mm0		; mm7=tmp0L
+	punpckhwd mm2,mm0		; mm2=tmp0H
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
+
+	movq	mm0,mm7
+	paddd	mm7,mm4			; mm7=tmp10L
+	psubd	mm0,mm4			; mm0=tmp13L
+	movq	mm4,mm2
+	paddd	mm2,mm5			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp13H
+
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
+	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
+	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
+	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
+
+	pxor      mm5,mm5
+	pxor      mm7,mm7
+	punpcklwd mm5,mm6		; mm5=tmp1L
+	punpckhwd mm7,mm6		; mm7=tmp1H
+	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+
+	movq	mm2,mm5
+	paddd	mm5,mm1			; mm5=tmp11L
+	psubd	mm2,mm1			; mm2=tmp12L
+	movq	mm0,mm7
+	paddd	mm7,mm3			; mm7=tmp11H
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
+	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
+	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
+	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm4=in1 (tmp3)
+	pmullw	mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm6=in3 (tmp2)
+	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm1=in5 (tmp1)
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm3=in7 (tmp0)
+
+	movq	mm5,mm6
+	movq	mm7,mm4
+	paddw	mm5,mm3			; mm5=z3
+	paddw	mm7,mm1			; mm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm2,mm5
+	movq      mm0,mm5
+	punpcklwd mm2,mm7
+	punpckhwd mm0,mm7
+	movq      mm5,mm2
+	movq      mm7,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movq      mm2,mm3
+	movq      mm0,mm3
+	punpcklwd mm2,mm4
+	punpckhwd mm0,mm4
+	movq      mm3,mm2
+	movq      mm4,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
+
+	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L (tmp0 += z3)
+	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H (tmp0 += z3)
+	paddd	mm3,mm5			; mm3=tmp3L (tmp3 += z4)
+	paddd	mm4,mm7			; mm4=tmp3H (tmp3 += z4)
+
+	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
+	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
+
+	movq      mm2,mm1
+	movq      mm0,mm1
+	punpcklwd mm2,mm6
+	punpckhwd mm0,mm6
+	movq      mm1,mm2
+	movq      mm6,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
+
+	paddd	mm2,mm5			; mm2=tmp1L (tmp1 += z4)
+	paddd	mm0,mm7			; mm0=tmp1H (tmp1 += z4)
+	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L (tmp2 += z3)
+	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H (tmp2 += z3)
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
+
+	movq	mm2,mm5
+	movq	mm0,mm7
+	paddd	mm5,mm3			; mm5=data0L
+	paddd	mm7,mm4			; mm7=data0H
+	psubd	mm2,mm3			; mm2=data7L
+	psubd	mm0,mm4			; mm0=data7H
+
+	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm3=[PD_DESCALE_P1] (rounding term)
+
+	paddd	mm5,mm3
+	paddd	mm7,mm3
+	psrad	mm5,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+	paddd	mm2,mm3
+	paddd	mm0,mm3
+	psrad	mm2,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+
+	packssdw  mm5,mm7		; mm5=data0=(00 01 02 03)
+	packssdw  mm2,mm0		; mm2=data7=(70 71 72 73)
+
+	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
+	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
+
+	movq	mm7,mm4
+	movq	mm0,mm3
+	paddd	mm4,mm1			; mm4=data1L
+	paddd	mm3,mm6			; mm3=data1H
+	psubd	mm7,mm1			; mm7=data6L
+	psubd	mm0,mm6			; mm0=data6H
+
+	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm1=[PD_DESCALE_P1]
+
+	paddd	mm4,mm1
+	paddd	mm3,mm1
+	psrad	mm4,DESCALE_P1
+	psrad	mm3,DESCALE_P1
+	paddd	mm7,mm1
+	paddd	mm0,mm1
+	psrad	mm7,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+
+	packssdw  mm4,mm3		; mm4=data1=(10 11 12 13)
+	packssdw  mm7,mm0		; mm7=data6=(60 61 62 63)
+
+	movq      mm6,mm5		; transpose coefficients(phase 1)
+	punpcklwd mm5,mm4		; mm5=(00 10 01 11)
+	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
+	movq      mm1,mm7		; transpose coefficients(phase 1)
+	punpcklwd mm7,mm2		; mm7=(60 70 61 71)
+	punpckhwd mm1,mm2		; mm1=(62 72 63 73)
+
+	movq	mm3, MMWORD [wk(6)]	; mm3=tmp12L
+	movq	mm0, MMWORD [wk(7)]	; mm0=tmp12H
+	movq	mm4, MMWORD [wk(10)]	; mm4=tmp1L
+	movq	mm2, MMWORD [wk(11)]	; mm2=tmp1H
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 01 11)
+	movq	MMWORD [wk(1)], mm6	; wk(1)=(02 12 03 13)
+	movq	MMWORD [wk(4)], mm7	; wk(4)=(60 70 61 71)
+	movq	MMWORD [wk(5)], mm1	; wk(5)=(62 72 63 73)
+
+	movq	mm5,mm3
+	movq	mm6,mm0
+	paddd	mm3,mm4			; mm3=data2L
+	paddd	mm0,mm2			; mm0=data2H
+	psubd	mm5,mm4			; mm5=data5L
+	psubd	mm6,mm2			; mm6=data5H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm7=[PD_DESCALE_P1]
+
+	paddd	mm3,mm7
+	paddd	mm0,mm7
+	psrad	mm3,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+	paddd	mm5,mm7
+	paddd	mm6,mm7
+	psrad	mm5,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm3,mm0		; mm3=data2=(20 21 22 23)
+	packssdw  mm5,mm6		; mm5=data5=(50 51 52 53)
+
+	movq	mm1, MMWORD [wk(2)]	; mm1=tmp13L
+	movq	mm4, MMWORD [wk(3)]	; mm4=tmp13H
+	movq	mm2, MMWORD [wk(8)]	; mm2=tmp0L
+	movq	mm7, MMWORD [wk(9)]	; mm7=tmp0H
+
+	movq	mm0,mm1
+	movq	mm6,mm4
+	paddd	mm1,mm2			; mm1=data3L
+	paddd	mm4,mm7			; mm4=data3H
+	psubd	mm0,mm2			; mm0=data4L
+	psubd	mm6,mm7			; mm6=data4H
+
+	movq	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm2=[PD_DESCALE_P1]
+
+	paddd	mm1,mm2
+	paddd	mm4,mm2
+	psrad	mm1,DESCALE_P1
+	psrad	mm4,DESCALE_P1
+	paddd	mm0,mm2
+	paddd	mm6,mm2
+	psrad	mm0,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm1,mm4		; mm1=data3=(30 31 32 33)
+	packssdw  mm0,mm6		; mm0=data4=(40 41 42 43)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=(00 10 01 11)
+	movq	mm2, MMWORD [wk(1)]	; mm2=(02 12 03 13)
+
+	movq      mm4,mm3		; transpose coefficients(phase 1)
+	punpcklwd mm3,mm1		; mm3=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm6,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm5		; mm0=(40 50 41 51)
+	punpckhwd mm6,mm5		; mm6=(42 52 43 53)
+
+	movq      mm1,mm7		; transpose coefficients(phase 2)
+	punpckldq mm7,mm3		; mm7=(00 10 20 30)
+	punpckhdq mm1,mm3		; mm1=(01 11 21 31)
+	movq      mm5,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm4		; mm2=(02 12 22 32)
+	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
+
+	movq	mm3, MMWORD [wk(4)]	; mm3=(60 70 61 71)
+	movq	mm4, MMWORD [wk(5)]	; mm4=(62 72 63 73)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7	; transposed store: outputs 0-3 of each column
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+	movq      mm7,mm0		; transpose coefficients(phase 2)
+	punpckldq mm0,mm3		; mm0=(40 50 60 70)
+	punpckhdq mm7,mm3		; mm7=(41 51 61 71)
+	movq      mm1,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm4		; mm6=(42 52 62 72)
+	punpckhdq mm1,mm4		; mm1=(43 53 63 73)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0	; transposed store: outputs 4-7 of each column
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block: next 4 columns
+	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr: next 4 columns
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr: skip the 4 workspace rows just written
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]			; reload the frame base saved by the prologue
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]	; eax = output_col from here on
+	mov	ecx, DCTSIZE/4				; ctr: 4 output rows per iteration
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]	; mm0=in0
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]	; mm1=in2
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]	; mm2=in4
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]	; mm3=in6
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movq      mm4,mm1		; mm1=in2=z2
+	movq      mm5,mm1
+	punpcklwd mm4,mm3		; mm3=in6=z3
+	punpckhwd mm5,mm3
+	movq      mm1,mm4
+	movq      mm3,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
+
+	movq      mm6,mm0
+	paddw     mm0,mm2		; mm0=in0+in4
+	psubw     mm6,mm2		; mm6=in0-in4
+
+	pxor      mm7,mm7		; zero first so the unpack leaves each word in the high half of a dword
+	pxor      mm2,mm2
+	punpcklwd mm7,mm0		; mm7=tmp0L
+	punpckhwd mm2,mm0		; mm2=tmp0H
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
+
+	movq	mm0,mm7
+	paddd	mm7,mm4			; mm7=tmp10L
+	psubd	mm0,mm4			; mm0=tmp13L
+	movq	mm4,mm2
+	paddd	mm2,mm5			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp13H
+
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
+	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
+	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
+	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
+
+	pxor      mm5,mm5
+	pxor      mm7,mm7
+	punpcklwd mm5,mm6		; mm5=tmp1L
+	punpckhwd mm7,mm6		; mm7=tmp1H
+	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+
+	movq	mm2,mm5
+	paddd	mm5,mm1			; mm5=tmp11L
+	psubd	mm2,mm1			; mm2=tmp12L
+	movq	mm0,mm7
+	paddd	mm7,mm3			; mm7=tmp11H
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
+	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
+	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
+	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]	; mm4=in1 (tmp3)
+	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]	; mm6=in3 (tmp2)
+	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]	; mm1=in5 (tmp1)
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]	; mm3=in7 (tmp0)
+
+	movq	mm5,mm6
+	movq	mm7,mm4
+	paddw	mm5,mm3			; mm5=z3
+	paddw	mm7,mm1			; mm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm2,mm5
+	movq      mm0,mm5
+	punpcklwd mm2,mm7
+	punpckhwd mm0,mm7
+	movq      mm5,mm2
+	movq      mm7,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movq      mm2,mm3
+	movq      mm0,mm3
+	punpcklwd mm2,mm4
+	punpckhwd mm0,mm4
+	movq      mm3,mm2
+	movq      mm4,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
+
+	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L (tmp0 += z3)
+	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H (tmp0 += z3)
+	paddd	mm3,mm5			; mm3=tmp3L (tmp3 += z4)
+	paddd	mm4,mm7			; mm4=tmp3H (tmp3 += z4)
+
+	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
+	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
+
+	movq      mm2,mm1
+	movq      mm0,mm1
+	punpcklwd mm2,mm6
+	punpckhwd mm0,mm6
+	movq      mm1,mm2
+	movq      mm6,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
+
+	paddd	mm2,mm5			; mm2=tmp1L (tmp1 += z4)
+	paddd	mm0,mm7			; mm0=tmp1H (tmp1 += z4)
+	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L (tmp2 += z3)
+	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H (tmp2 += z3)
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
+
+	movq	mm2,mm5
+	movq	mm0,mm7
+	paddd	mm5,mm3			; mm5=data0L
+	paddd	mm7,mm4			; mm7=data0H
+	psubd	mm2,mm3			; mm2=data7L
+	psubd	mm0,mm4			; mm0=data7H
+
+	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm3=[PD_DESCALE_P2] (rounding term)
+
+	paddd	mm5,mm3
+	paddd	mm7,mm3
+	psrad	mm5,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+	paddd	mm2,mm3
+	paddd	mm0,mm3
+	psrad	mm2,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+
+	packssdw  mm5,mm7		; mm5=data0=(00 10 20 30)
+	packssdw  mm2,mm0		; mm2=data7=(07 17 27 37)
+
+	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
+	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
+
+	movq	mm7,mm4
+	movq	mm0,mm3
+	paddd	mm4,mm1			; mm4=data1L
+	paddd	mm3,mm6			; mm3=data1H
+	psubd	mm7,mm1			; mm7=data6L
+	psubd	mm0,mm6			; mm0=data6H
+
+	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm1=[PD_DESCALE_P2]
+
+	paddd	mm4,mm1
+	paddd	mm3,mm1
+	psrad	mm4,DESCALE_P2
+	psrad	mm3,DESCALE_P2
+	paddd	mm7,mm1
+	paddd	mm0,mm1
+	psrad	mm7,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+
+	packssdw  mm4,mm3		; mm4=data1=(01 11 21 31)
+	packssdw  mm7,mm0		; mm7=data6=(06 16 26 36)
+
+	packsswb  mm5,mm7		; mm5=(00 10 20 30 06 16 26 36)
+	packsswb  mm4,mm2		; mm4=(01 11 21 31 07 17 27 37)
+
+	movq	mm6, MMWORD [wk(6)]	; mm6=tmp12L
+	movq	mm1, MMWORD [wk(7)]	; mm1=tmp12H
+	movq	mm3, MMWORD [wk(10)]	; mm3=tmp1L
+	movq	mm0, MMWORD [wk(11)]	; mm0=tmp1H
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 20 30 06 16 26 36)
+	movq	MMWORD [wk(1)], mm4	; wk(1)=(01 11 21 31 07 17 27 37)
+
+	movq	mm7,mm6
+	movq	mm2,mm1
+	paddd	mm6,mm3			; mm6=data2L
+	paddd	mm1,mm0			; mm1=data2H
+	psubd	mm7,mm3			; mm7=data5L
+	psubd	mm2,mm0			; mm2=data5H
+
+	movq	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm5=[PD_DESCALE_P2]
+
+	paddd	mm6,mm5
+	paddd	mm1,mm5
+	psrad	mm6,DESCALE_P2
+	psrad	mm1,DESCALE_P2
+	paddd	mm7,mm5
+	paddd	mm2,mm5
+	psrad	mm7,DESCALE_P2
+	psrad	mm2,DESCALE_P2
+
+	packssdw  mm6,mm1		; mm6=data2=(02 12 22 32)
+	packssdw  mm7,mm2		; mm7=data5=(05 15 25 35)
+
+	movq	mm4, MMWORD [wk(2)]	; mm4=tmp13L
+	movq	mm3, MMWORD [wk(3)]	; mm3=tmp13H
+	movq	mm0, MMWORD [wk(8)]	; mm0=tmp0L
+	movq	mm5, MMWORD [wk(9)]	; mm5=tmp0H
+
+	movq	mm1,mm4
+	movq	mm2,mm3
+	paddd	mm4,mm0			; mm4=data3L
+	paddd	mm3,mm5			; mm3=data3H
+	psubd	mm1,mm0			; mm1=data4L
+	psubd	mm2,mm5			; mm2=data4H
+
+	movq	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm0=[PD_DESCALE_P2]
+
+	paddd	mm4,mm0
+	paddd	mm3,mm0
+	psrad	mm4,DESCALE_P2
+	psrad	mm3,DESCALE_P2
+	paddd	mm1,mm0
+	paddd	mm2,mm0
+	psrad	mm1,DESCALE_P2
+	psrad	mm2,DESCALE_P2
+
+	movq      mm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm5=[PB_CENTERJSAMP]
+
+	packssdw  mm4,mm3		; mm4=data3=(03 13 23 33)
+	packssdw  mm1,mm2		; mm1=data4=(04 14 24 34)
+
+	movq      mm0, MMWORD [wk(0)]	; mm0=(00 10 20 30 06 16 26 36)
+	movq      mm3, MMWORD [wk(1)]	; mm3=(01 11 21 31 07 17 27 37)
+
+	packsswb  mm6,mm1		; mm6=(02 12 22 32 04 14 24 34)
+	packsswb  mm4,mm7		; mm4=(03 13 23 33 05 15 25 35)
+
+	paddb     mm0,mm5		; level shift: add CENTERJSAMPLE to every byte
+	paddb     mm3,mm5
+	paddb     mm6,mm5
+	paddb     mm4,mm5
+
+	movq      mm2,mm0		; transpose coefficients(phase 1)
+	punpcklbw mm0,mm3		; mm0=(00 01 10 11 20 21 30 31)
+	punpckhbw mm2,mm3		; mm2=(06 07 16 17 26 27 36 37)
+	movq      mm1,mm6		; transpose coefficients(phase 1)
+	punpcklbw mm6,mm4		; mm6=(02 03 12 13 22 23 32 33)
+	punpckhbw mm1,mm4		; mm1=(04 05 14 15 24 25 34 35)
+
+	movq      mm7,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm6		; mm0=(00 01 02 03 10 11 12 13)
+	punpckhwd mm7,mm6		; mm7=(20 21 22 23 30 31 32 33)
+	movq      mm5,mm1		; transpose coefficients(phase 2)
+	punpcklwd mm1,mm2		; mm1=(04 05 06 07 14 15 16 17)
+	punpckhwd mm5,mm2		; mm5=(24 25 26 27 34 35 36 37)
+
+	movq      mm3,mm0		; transpose coefficients(phase 3)
+	punpckldq mm0,mm1		; mm0=(00 01 02 03 04 05 06 07)
+	punpckhdq mm3,mm1		; mm3=(10 11 12 13 14 15 16 17)
+	movq      mm4,mm7		; transpose coefficients(phase 3)
+	punpckldq mm7,mm5		; mm7=(20 21 22 23 24 25 26 27)
+	punpckhdq mm4,mm5		; mm4=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; edx = row pointer 0 of this group
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; ebx = row pointer 1 (ebx reused as scratch)
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0	; 8 samples at output_col
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]	; edx = row pointer 2
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]	; ebx = row pointer 3
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW	; next 4 output row pointers
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jimmxred.asm b/jpeg/simd/jimmxred.asm
new file mode 100644
index 000000000000..a2b7103dffab
--- /dev/null
+++ b/jpeg/simd/jimmxred.asm
@@ -0,0 +1,706 @@
+;
+; jimmxred.asm - reduced-size IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13		; fraction bits of the F_x_xxx multipliers
+%define PASS1_BITS	2		; extra scale left in the pass-1 results
+
+%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)	; 4x4: pass-1 right shift
+%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)	; 4x4: pass-2 right shift
+%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)	; 2x2: pass-1 right shift
+%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)	; 2x2: pass-2 right shift
+
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants; rescale 2^30 fixed-point integers instead.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; right shift by n, rounding to nearest
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):
+
+PW_F184_MF076	times 2 dw  F_1_847,-F_0_765	; 4x4 even part: (in2,in6) word pairs -> tmp2
+PW_F256_F089	times 2 dw  F_2_562, F_0_899	; 4x4 odd part:  (in1,in3) word pairs -> tmp2
+PW_F106_MF217	times 2 dw  F_1_061,-F_2_172	; 4x4 odd part:  (in1,in3) word pairs -> tmp0
+PW_MF060_MF050	times 2 dw -F_0_601,-F_0_509	; 4x4 odd part:  (in5,in7) word pairs -> tmp2
+PW_F145_MF021	times 2 dw  F_1_451,-F_0_211	; 4x4 odd part:  (in5,in7) word pairs -> tmp0
+PW_F362_MF127	times 2 dw  F_3_624,-F_1_272	; 2x2 odd part:  (in1,in3) word pairs -> tmp0
+PW_F085_MF072	times 2 dw  F_0_850,-F_0_720	; 2x2 odd part:  (in5,in7) word pairs -> tmp0
+PD_DESCALE_P1_4	times 2 dd  1 << (DESCALE_P1_4-1)	; rounding term added before psrad DESCALE_P1_4
+PD_DESCALE_P2_4	times 2 dd  1 << (DESCALE_P2_4-1)	; rounding term added before psrad DESCALE_P2_4
+PD_DESCALE_P1_2	times 2 dd  1 << (DESCALE_P1_2-1)	; rounding term added before psrad DESCALE_P1_2
+PD_DESCALE_P2_2	times 2 dd  1 << (DESCALE_P2_2-1)	; rounding term added before psrad DESCALE_P2_2
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE	; byte bias added (paddb) to the packed output samples
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+; jsimd_idct_4x4_mmx -- reduced-size (4x4) inverse DCT, MMX version.
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+; Input row 4 is never read; one dword is stored to each of output_buf[0..3].
+; GLOBAL(void)
+; jsimd_idct_4x4_mmx (void * dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+; All four arguments are read from the stack (offsets defined below).
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0		; slot holding the unaligned frame base
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2		; two spill slots (tmp0L / tmp0H)
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_4x4_mmx)
+
+EXTN(jsimd_idct_4x4_mmx):
+	push	ebp
+	mov	eax,esp				; eax = unaligned frame base ([eax] = caller's ebp)
+	sub	esp, byte 4			; make room for the saved frame base
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [original_ebp] = unaligned frame base
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]		; reserve wk[] and workspace[] below ebp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]	; (not needed: eax still holds the frame base)
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr (4 columns per iteration)
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]	; cheap early-out: leading dword of row 1
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]	; ... OR-ed with leading dword of row 2
+	jnz	short .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]	; OR together rows 1,2,3,5,6,7
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]	; (row 4 is not used by this IDCT)
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm0,mm1
+	packsswb mm0,mm0		; squeeze 4 words into 4 bytes (nonzero stays nonzero)
+	movd	eax,mm0
+	test	eax,eax			; any AC term nonzero in these 4 columns?
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize DC terms
+
+	psllw	mm0,PASS1_BITS		; keep PASS1_BITS of extra scale
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0	; each workspace row = one
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1	; input column, DC replicated
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm0=in1 (dequantized)
+	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm1=in3 (dequantized)
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm2=in5 (dequantized)
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm3=in7 (dequantized)
+
+	movq      mm4,mm0
+	movq      mm5,mm0
+	punpcklwd mm4,mm1		; interleave in1/in3 words for pmaddwd
+	punpckhwd mm5,mm1
+	movq      mm0,mm4
+	movq      mm1,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L): in1,in3 terms
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H): in1,in3 terms
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L): in1,in3 terms
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H): in1,in3 terms
+
+	movq      mm6,mm2
+	movq      mm7,mm2
+	punpcklwd mm6,mm3		; interleave in5/in7 words for pmaddwd
+	punpckhwd mm7,mm3
+	movq      mm2,mm6
+	movq      mm3,mm7
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L): in5,in7 terms
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H): in5,in7 terms
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L): in5,in7 terms
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H): in5,in7 terms
+
+	paddd	mm6,mm4			; mm6=tmp2L
+	paddd	mm7,mm5			; mm7=tmp2H
+	paddd	mm2,mm0			; mm2=tmp0L
+	paddd	mm3,mm1			; mm3=tmp0H
+
+	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
+	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm4=in0 (dequantized)
+	pmullw	mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm5=in2 (dequantized)
+	pmullw	mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; mm0=in6 (dequantized)
+
+	pxor      mm1,mm1
+	pxor      mm2,mm2
+	punpcklwd mm1,mm4		; mm1=tmp0L
+	punpckhwd mm2,mm4		; mm2=tmp0H
+	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
+	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+	movq      mm3,mm5		; mm5=in2=z2
+	punpcklwd mm5,mm0		; mm0=in6=z3
+	punpckhwd mm3,mm0
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
+
+	movq	mm4,mm1
+	movq	mm0,mm2
+	paddd	mm1,mm5			; mm1=tmp10L
+	paddd	mm2,mm3			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp12L
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	; -- Final output stage
+
+	movq	mm5,mm1
+	movq	mm3,mm2
+	paddd	mm1,mm6			; mm1=data0L
+	paddd	mm2,mm7			; mm2=data0H
+	psubd	mm5,mm6			; mm5=data3L
+	psubd	mm3,mm7			; mm3=data3H
+
+	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm6=[PD_DESCALE_P1_4]
+
+	paddd	mm1,mm6			; add rounding term, then shift
+	paddd	mm2,mm6
+	psrad	mm1,DESCALE_P1_4
+	psrad	mm2,DESCALE_P1_4
+	paddd	mm5,mm6
+	paddd	mm3,mm6
+	psrad	mm5,DESCALE_P1_4
+	psrad	mm3,DESCALE_P1_4
+
+	packssdw  mm1,mm2		; mm1=data0=(00 01 02 03)
+	packssdw  mm5,mm3		; mm5=data3=(30 31 32 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
+
+	movq	mm2,mm4
+	movq	mm3,mm0
+	paddd	mm4,mm7			; mm4=data1L
+	paddd	mm0,mm6			; mm0=data1H
+	psubd	mm2,mm7			; mm2=data2L
+	psubd	mm3,mm6			; mm3=data2H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm7=[PD_DESCALE_P1_4]
+
+	paddd	mm4,mm7			; add rounding term, then shift
+	paddd	mm0,mm7
+	psrad	mm4,DESCALE_P1_4
+	psrad	mm0,DESCALE_P1_4
+	paddd	mm2,mm7
+	paddd	mm3,mm7
+	psrad	mm2,DESCALE_P1_4
+	psrad	mm3,DESCALE_P1_4
+
+	packssdw  mm4,mm0		; mm4=data1=(10 11 12 13)
+	packssdw  mm2,mm3		; mm2=data2=(20 21 22 23)
+
+	movq      mm6,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm4		; mm1=(00 10 01 11)
+	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
+	movq      mm7,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm5		; mm2=(20 30 21 31)
+	punpckhwd mm7,mm5		; mm7=(22 32 23 33)
+
+	movq      mm0,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm2		; mm1=(00 10 20 30)
+	punpckhdq mm0,mm2		; mm0=(01 11 21 31)
+	movq      mm3,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm7		; mm6=(02 12 22 32)
+	punpckhdq mm3,mm7		; mm3=(03 13 23 33)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1	; one workspace row per input column
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]			; eax = unaligned frame base (for the args)
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]	; eax = output_col from here on
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]	; workspace rows 1,3,5,7
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq      mm4,mm0
+	movq      mm5,mm0
+	punpcklwd mm4,mm1		; interleave rows 1/3 for pmaddwd
+	punpckhwd mm5,mm1
+	movq      mm0,mm4
+	movq      mm1,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L): rows 1,3 terms
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H): rows 1,3 terms
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L): rows 1,3 terms
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H): rows 1,3 terms
+
+	movq      mm6,mm2
+	movq      mm7,mm2
+	punpcklwd mm6,mm3		; interleave rows 5/7 for pmaddwd
+	punpckhwd mm7,mm3
+	movq      mm2,mm6
+	movq      mm3,mm7
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L): rows 5,7 terms
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H): rows 5,7 terms
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L): rows 5,7 terms
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H): rows 5,7 terms
+
+	paddd	mm6,mm4			; mm6=tmp2L
+	paddd	mm7,mm5			; mm7=tmp2H
+	paddd	mm2,mm0			; mm2=tmp0L
+	paddd	mm3,mm1			; mm3=tmp0H
+
+	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
+	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]	; workspace rows 0,2,6
+	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	pxor      mm1,mm1
+	pxor      mm2,mm2
+	punpcklwd mm1,mm4		; mm1=tmp0L
+	punpckhwd mm2,mm4		; mm2=tmp0H
+	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
+	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+	movq      mm3,mm5		; mm5=in2=z2
+	punpcklwd mm5,mm0		; mm0=in6=z3
+	punpckhwd mm3,mm0
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
+
+	movq	mm4,mm1
+	movq	mm0,mm2
+	paddd	mm1,mm5			; mm1=tmp10L
+	paddd	mm2,mm3			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp12L
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	; -- Final output stage
+
+	movq	mm5,mm1
+	movq	mm3,mm2
+	paddd	mm1,mm6			; mm1=data0L
+	paddd	mm2,mm7			; mm2=data0H
+	psubd	mm5,mm6			; mm5=data3L
+	psubd	mm3,mm7			; mm3=data3H
+
+	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm6=[PD_DESCALE_P2_4]
+
+	paddd	mm1,mm6			; add rounding term, then shift
+	paddd	mm2,mm6
+	psrad	mm1,DESCALE_P2_4
+	psrad	mm2,DESCALE_P2_4
+	paddd	mm5,mm6
+	paddd	mm3,mm6
+	psrad	mm5,DESCALE_P2_4
+	psrad	mm3,DESCALE_P2_4
+
+	packssdw  mm1,mm2		; mm1=data0=(00 10 20 30)
+	packssdw  mm5,mm3		; mm5=data3=(03 13 23 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
+
+	movq	mm2,mm4
+	movq	mm3,mm0
+	paddd	mm4,mm7			; mm4=data1L
+	paddd	mm0,mm6			; mm0=data1H
+	psubd	mm2,mm7			; mm2=data2L
+	psubd	mm3,mm6			; mm3=data2H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm7=[PD_DESCALE_P2_4]
+
+	paddd	mm4,mm7			; add rounding term, then shift
+	paddd	mm0,mm7
+	psrad	mm4,DESCALE_P2_4
+	psrad	mm0,DESCALE_P2_4
+	paddd	mm2,mm7
+	paddd	mm3,mm7
+	psrad	mm2,DESCALE_P2_4
+	psrad	mm3,DESCALE_P2_4
+
+	packssdw  mm4,mm0		; mm4=data1=(01 11 21 31)
+	packssdw  mm2,mm3		; mm2=data2=(02 12 22 32)
+
+	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
+
+	packsswb  mm1,mm2		; mm1=(00 10 20 30 02 12 22 32)
+	packsswb  mm4,mm5		; mm4=(01 11 21 31 03 13 23 33)
+	paddb     mm1,mm6		; bias the signed-saturated bytes by CENTERJSAMPLE
+	paddb     mm4,mm6
+
+	movq      mm7,mm1		; transpose coefficients(phase 1)
+	punpcklbw mm1,mm4		; mm1=(00 01 10 11 20 21 30 31)
+	punpckhbw mm7,mm4		; mm7=(02 03 12 13 22 23 32 33)
+
+	movq      mm0,mm1		; transpose coefficients(phase 2)
+	punpcklwd mm1,mm7		; mm1=(00 01 02 03 10 11 12 13)
+	punpckhwd mm0,mm7		; mm0=(20 21 22 23 30 31 32 33)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1	; output row 0 <- (00 01 02 03)
+	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0	; output row 2 <- (20 21 22 23)
+
+	psrlq	mm1,4*BYTE_BIT		; bring the upper dword down
+	psrlq	mm0,4*BYTE_BIT
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1	; output row 1 <- (10 11 12 13)
+	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0	; output row 3 <- (30 31 32 33)
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- unaligned frame base saved in the prologue
+	pop	ebp		; restore the caller's ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+; jsimd_idct_2x2_mmx -- reduced-size (2x2) inverse DCT, MMX version.
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+; Only input rows and columns 0, 1, 3, 5 and 7 contribute to the result.
+; GLOBAL(void)
+; jsimd_idct_2x2_mmx (void * dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+; All four arguments are read from the stack (offsets defined below).
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+	align	16
+	global	EXTN(jsimd_idct_2x2_mmx)
+
+EXTN(jsimd_idct_2x2_mmx):
+	push	ebp
+	mov	ebp,esp		; plain frame: everything is kept in registers
+	push	ebx		; always saved: reused as scratch at the end
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+	mov	edx, POINTER [dct_table(ebp)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+
+	; | input:                  | result:        |
+	; | 00 01 ** 03 ** 05 ** 07 |                |
+	; | 10 11 ** 13 ** 15 ** 17 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+	; | 50 51 ** 53 ** 55 ** 57 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 70 71 ** 73 ** 75 ** 77 |                |
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize row 1
+	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize row 3
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize row 5
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize row 7
+
+	; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+	; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+	pcmpeqd   mm7,mm7		; all ones
+	pslld     mm7,WORD_BIT		; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+	movq      mm4,mm0		; mm4=(10 11 ** 13)
+	movq      mm5,mm2		; mm5=(50 51 ** 53)
+	punpcklwd mm4,mm1		; mm4=(10 30 11 31)
+	punpcklwd mm5,mm3		; mm5=(50 70 51 71)
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]	; in1*3.624 - in3*1.272 for col0, col1
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]	; in5*0.850 - in7*0.720 for col0, col1
+
+	psrld	mm0,WORD_BIT		; mm0=(11 -- 13 --)
+	pand	mm1,mm7			; mm1=(-- 31 -- 33)
+	psrld	mm2,WORD_BIT		; mm2=(51 -- 53 --)
+	pand	mm3,mm7			; mm3=(-- 71 -- 73)
+	por	mm0,mm1			; mm0=(11 31 13 33)
+	por	mm2,mm3			; mm2=(51 71 53 73)
+	pmaddwd	mm0,[GOTOFF(ebx,PW_F362_MF127)]	; same products for col1, col3
+	pmaddwd	mm2,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	mm4,mm5			; mm4=tmp0[col0 col1]
+
+	movq	mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]	; second half of rows 1,3,5,7
+	pmullw	mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+	; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+	psrld	mm6,WORD_BIT		; mm6=(15 -- 17 --)
+	pand	mm1,mm7			; mm1=(-- 35 -- 37)
+	psrld	mm3,WORD_BIT		; mm3=(55 -- 57 --)
+	pand	mm5,mm7			; mm5=(-- 75 -- 77)
+	por	mm6,mm1			; mm6=(15 35 17 37)
+	por	mm3,mm5			; mm3=(55 75 57 77)
+	pmaddwd	mm6,[GOTOFF(ebx,PW_F362_MF127)]	; same products for col5, col7
+	pmaddwd	mm3,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	mm0,mm2			; mm0=tmp0[col1 col3]
+	paddd	mm6,mm3			; mm6=tmp0[col5 col7]
+
+	; -- Even part
+
+	movq	mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize row 0
+	pmullw	mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+	movq	mm2,mm1				; mm2=(00 01 ** 03)
+	pslld	mm1,WORD_BIT			; mm1=(-- 00 -- **)
+	psrad	mm1,(WORD_BIT-CONST_BITS-2)	; mm1=tmp10[col0 ****]
+
+	pand	mm2,mm7				; mm2=(-- 01 -- 03)
+	pand	mm5,mm7				; mm5=(-- 05 -- 07)
+	psrad	mm2,(WORD_BIT-CONST_BITS-2)	; mm2=tmp10[col1 col3]
+	psrad	mm5,(WORD_BIT-CONST_BITS-2)	; mm5=tmp10[col5 col7]
+
+	; -- Final output stage
+
+	movq      mm3,mm1
+	paddd     mm1,mm4		; mm1=data0[col0 ****]=(A0 **)
+	psubd     mm3,mm4		; mm3=data1[col0 ****]=(B0 **)
+	punpckldq mm1,mm3		; mm1=(A0 B0)
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; mm7=[PD_DESCALE_P1_2]
+
+	movq	mm4,mm2
+	movq	mm3,mm5
+	paddd	mm2,mm0			; mm2=data0[col1 col3]=(A1 A3)
+	paddd	mm5,mm6			; mm5=data0[col5 col7]=(A5 A7)
+	psubd	mm4,mm0			; mm4=data1[col1 col3]=(B1 B3)
+	psubd	mm3,mm6			; mm3=data1[col5 col7]=(B5 B7)
+
+	paddd	mm1,mm7			; add rounding term, then shift
+	psrad	mm1,DESCALE_P1_2
+
+	paddd	mm2,mm7
+	paddd	mm5,mm7
+	psrad	mm2,DESCALE_P1_2
+	psrad	mm5,DESCALE_P1_2
+	paddd	mm4,mm7
+	paddd	mm3,mm7
+	psrad	mm4,DESCALE_P1_2
+	psrad	mm3,DESCALE_P1_2
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(ebp)]
+
+	; | input:| result:|
+	; | A0 B0 |        |
+	; | A1 B1 | C0 C1  |
+	; | A3 B3 | D0 D1  |
+	; | A5 B5 |        |
+	; | A7 B7 |        |
+
+	; -- Odd part
+
+	packssdw  mm2,mm4		; mm2=(A1 A3 B1 B3)
+	packssdw  mm5,mm3		; mm5=(A5 A7 B5 B7)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]	; same odd-part products as pass 1
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd     mm2,mm5		; mm2=tmp0[row0 row1]
+
+	; -- Even part
+
+	pslld     mm1,(CONST_BITS+2)	; mm1=tmp10[row0 row1]
+
+	; -- Final output stage
+
+	movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]	; mm0=[PD_DESCALE_P2_2]
+
+	movq      mm6,mm1
+	paddd     mm1,mm2		; mm1=data0[row0 row1]=(C0 C1)
+	psubd     mm6,mm2		; mm6=data1[row0 row1]=(D0 D1)
+
+	paddd     mm1,mm0		; add rounding term, then shift
+	paddd     mm6,mm0
+	psrad     mm1,DESCALE_P2_2
+	psrad     mm6,DESCALE_P2_2
+
+	movq      mm7,mm1		; transpose coefficients
+	punpckldq mm1,mm6		; mm1=(C0 D0)
+	punpckhdq mm7,mm6		; mm7=(C1 D1)
+
+	packssdw  mm1,mm7		; mm1=(C0 D0 C1 D1)
+	packsswb  mm1,mm1		; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+	paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]	; last use of the GOT pointer in ebx
+
+	movd	ecx,mm1
+	movd	ebx,mm1			; ebx=(C0 D0 C1 D1)
+	shr	ecx,2*BYTE_BIT		; ecx=(C1 D1 -- --)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx	; output row 0 <- (C0 D0)
+	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx	; output row 1 <- (C1 D1)
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jiss2flt-64.asm b/jpeg/simd/jiss2flt-64.asm
new file mode 100644
index 000000000000..6e7e6d425ec3
--- /dev/null
+++ b/jpeg/simd/jiss2flt-64.asm
@@ -0,0 +1,483 @@
+;
+; jiss2flt-64.asm - floating-point IDCT (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44		; imm 0x44: dst[0], dst[1], src[0], src[1]
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE		; imm 0xEE: dst[2], dst[3], src[2], src[3]
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414	times 4 dd  1.414213562373095048801689	; 4 packed floats, mulps operand
+PD_1_847	times 4 dd  1.847759065022573512256366	; 4 packed floats, mulps operand (z5)
+PD_1_082	times 4 dd  1.082392200292393968799446	; 4 packed floats, mulps operand (z12)
+PD_M2_613	times 4 dd -2.613125929752753055713286	; 4 packed floats, mulps operand (z10)
+PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; 16 bytes, each CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp	rbp+0
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [workspace]
+	collect_args
+	push	rbx
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+	mov	rdx, r10	; quantptr
+	mov	rsi, r11		; inptr
+	lea	rdi, [workspace]			; FAST_FLOAT * wsptr
+	mov	rcx, DCTSIZE/4				; ctr
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1,xmm2
+	por	xmm3,xmm4
+	por	xmm5,xmm6
+	por	xmm1,xmm3
+	por	xmm5,xmm7
+	por	xmm1,xmm5
+	packsswb xmm1,xmm1
+	movd	eax,xmm1
+	test	rax,rax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
+	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
+
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm1,xmm0
+	movaps	xmm2,xmm0
+	movaps	xmm3,xmm0
+
+	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
+	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
+	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
+	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+	jmp	near .nextcolumn
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
+	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
+	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
+	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
+	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
+
+	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
+	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
+	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
+	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
+	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
+	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
+
+	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[rel PD_1_414]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
+	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
+	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
+	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
+	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
+	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
+
+	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
+	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
+	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
+	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
+	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
+	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
+
+	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[rel PD_1_414]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[rel PD_1_847]	; xmm0=z5
+	mulps	xmm3,[rel PD_M2_613]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[rel PD_1_082]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
+	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
+	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
+	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
+	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
+	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
+	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
+	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
+
+	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
+	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm0,xmm7
+	movaps	xmm3,xmm5
+	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
+	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
+	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
+	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
+
+	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
+	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
+	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
+	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
+	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
+	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
+
+	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
+	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
+	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
+	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
+	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+
+	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
+	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
+	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
+	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
+	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+	add	rsi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	rcx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	rax, [original_rbp]
+	lea	rsi, [workspace]			; FAST_FLOAT * wsptr
+	mov	rdi, r12	; (JSAMPROW *)
+	mov	rax, r13
+	mov	rcx, DCTSIZE/4				; ctr
+.rowloop:
+
+	; -- Even part
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[rel PD_1_414]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[rel PD_1_414]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[rel PD_1_847]	; xmm0=z5
+	mulps	xmm3,[rel PD_M2_613]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[rel PD_1_082]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
+	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
+	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
+	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps	xmm1,[rel PD_RNDINT_MAGIC]	; xmm1=[rel PD_RNDINT_MAGIC]
+	pcmpeqd	xmm3,xmm3
+	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
+	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
+	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
+	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
+	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
+	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
+
+	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm7,xmm1
+	movaps	xmm5,xmm3
+	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
+	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
+	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
+	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
+
+	movaps	xmm2,[rel PD_RNDINT_MAGIC]	; xmm2=[rel PD_RNDINT_MAGIC]
+	pcmpeqd	xmm4,xmm4
+	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
+	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
+	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
+	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
+	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
+	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
+
+	movdqa    xmm2,[rel PB_CENTERJSAMP]	; xmm2=[rel PB_CENTERJSAMP]
+
+	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+	paddb     xmm6,xmm2
+	paddb     xmm1,xmm2
+
+	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
+	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
+	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+	mov	rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
+	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+	mov	rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
+
+	add	rsi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
+	add	rdi, byte 4*SIZEOF_JSAMPROW
+	dec	rcx				; ctr
+	jnz	near .rowloop
+
+	pop	rbx
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jiss2flt.asm b/jpeg/simd/jiss2flt.asm
new file mode 100644
index 000000000000..17bc3633e7f4
--- /dev/null
+++ b/jpeg/simd/jiss2flt.asm
@@ -0,0 +1,498 @@
+;
+; jiss2flt.asm - floating-point IDCT (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44	; 0x44 picks dst[0],dst[1],src[0],src[1] (low halves of both)
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE	; 0xEE picks dst[2],dst[3],src[2],src[3] (high halves of both)
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414	times 4 dd  1.414213562373095048801689	; sqrt(2)
+PD_1_847	times 4 dd  1.847759065022573512256366	; 2*cos(pi/8)
+PD_1_082	times 4 dd  1.082392200292393968799446	; 2*(cos(pi/8)-cos(3*pi/8))
+PD_M2_613	times 4 dd -2.613125929752753055713286	; -2*(cos(pi/8)+cos(3*pi/8))
+PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3) = 1.5*2^26; float spacing there is 8, so x+magic leaves round(x/8) in the low mantissa bits
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; per-byte bias added (paddb) after the signed pack
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0			; slot holding the unaligned frame base
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (points at the saved ebp)
+	sub	esp, byte 4			; room for the original-ebp slot
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; [original_ebp] = unaligned frame base
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]		; reserve wk[] and workspace[] below ebp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT		; tested dwords not both zero -> full IDCT
+
+	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm1,xmm2
+	por	xmm3,xmm4
+	por	xmm5,xmm6
+	por	xmm1,xmm3
+	por	xmm5,xmm7
+	por	xmm1,xmm5
+	packsswb xmm1,xmm1		; 4 OR-ed words -> 4 bytes (saturation keeps nonzero-ness)
+	movd	eax,xmm1
+	test	eax,eax
+	jnz	short .columnDCT	; some term in rows 1-7 of these 4 columns is nonzero
+
+	; -- AC terms all zero
+
+	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
+	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
+
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm1,xmm0
+	movaps	xmm2,xmm0
+	movaps	xmm3,xmm0
+
+	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
+	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
+	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
+	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
+	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
+	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
+	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
+	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
+
+	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
+	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
+	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
+	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
+	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
+	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
+
+	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
+	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
+	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
+	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
+	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
+	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
+
+	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
+	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
+	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
+	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
+	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
+	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
+
+	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
+	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
+	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
+	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
+	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
+	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
+	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
+	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
+
+	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
+	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm0,xmm7
+	movaps	xmm3,xmm5
+	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
+	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
+	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
+	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
+
+	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
+	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
+	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
+	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
+	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
+	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
+
+	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
+	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
+	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
+	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
+	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
+	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
+	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
+	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
+	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]			; eax = unaligned frame base (for args)
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]	; eax = output_col from here on
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
+	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
+	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
+	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps	xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm1=[PD_RNDINT_MAGIC]
+	pcmpeqd	xmm3,xmm3
+	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
+	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
+	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
+	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
+	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
+	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
+
+	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm7,xmm1
+	movaps	xmm5,xmm3
+	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
+	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
+	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
+	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
+
+	movaps	xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm2=[PD_RNDINT_MAGIC]
+	pcmpeqd	xmm4,xmm4
+	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
+	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
+	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
+	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
+	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
+	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
+
+	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
+
+	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+	paddb     xmm6,xmm2
+	paddb     xmm1,xmm2
+
+	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
+	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
+	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6	; row 0: low 8 bytes
+	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7	; row 2: low 8 bytes
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5	; row 1: low 8 bytes
+	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3	; row 3: low 8 bytes
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jiss2fst-64.asm b/jpeg/simd/jiss2fst-64.asm
new file mode 100644
index 000000000000..088750583b40
--- /dev/null
+++ b/jpeg/simd/jiss2fst-64.asm
@@ -0,0 +1,492 @@
+;
+; jiss2fst-64.asm - fast integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+%define PASS1_BITS	2	; must equal IFAST_SCALE_BITS (checked just below)
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082	equ	277		; FIX(1.082392200) = round(1.082392200 * 256)
+F_1_414	equ	362		; FIX(1.414213562) = round(1.414213562 * 256)
+F_1_847	equ	473		; FIX(1.847759065) = round(1.847759065 * 256)
+F_2_613	equ	669		; FIX(2.613125930) = round(2.613125930 * 256)
+F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants,
+%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; so round 2^30-scaled integers down to n fewer bits
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2	; left shift applied to the data operand before pmulhw
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)	; left shift baked into each constant
+
+	alignz	16
+	global	EXTN(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414	times 8 dw  F_1_414 << CONST_SHIFT
+PW_F1847	times 8 dw  F_1_847 << CONST_SHIFT
+PW_MF1613	times 8 dw -F_1_613 << CONST_SHIFT	; -(2.613 - 1); with CONST_BITS == 8 the full 2.613 pre-shifted would exceed a signed word
+PW_F1082	times 8 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp	rbp+0
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	collect_args
+
+	; ---- Pass 1: process columns from input.
+
+	mov	rdx, r10	; quantptr
+	mov	rsi, r11		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1,xmm0
+	packsswb xmm1,xmm1
+	packsswb xmm1,xmm1
+	movd	eax,xmm1
+	test	rax,rax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa    xmm7,xmm0		; xmm0=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm7,xmm7		; xmm7=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm6,xmm0,0x00		; xmm6=col0=(00 00 00 00 00 00 00 00)
+	pshufd	xmm2,xmm0,0x55		; xmm2=col1=(01 01 01 01 01 01 01 01)
+	pshufd	xmm5,xmm0,0xAA		; xmm5=col2=(02 02 02 02 02 02 02 02)
+	pshufd	xmm0,xmm0,0xFF		; xmm0=col3=(03 03 03 03 03 03 03 03)
+	pshufd	xmm1,xmm7,0x00		; xmm1=col4=(04 04 04 04 04 04 04 04)
+	pshufd	xmm4,xmm7,0x55		; xmm4=col5=(05 05 05 05 05 05 05 05)
+	pshufd	xmm3,xmm7,0xAA		; xmm3=col6=(06 06 06 06 06 06 06 06)
+	pshufd	xmm7,xmm7,0xFF		; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=col1
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=col3
+	jmp	near .column_end
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	psubw	xmm0,xmm2		; xmm0=tmp11
+	psubw	xmm1,xmm3
+	paddw	xmm4,xmm2		; xmm4=tmp10
+	paddw	xmm5,xmm3		; xmm5=tmp13
+
+	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm1,[rel PW_F1414]
+	psubw	xmm1,xmm5		; xmm1=tmp12
+
+	movdqa	xmm6,xmm4
+	movdqa	xmm7,xmm0
+	psubw	xmm4,xmm5		; xmm4=tmp3
+	psubw	xmm0,xmm1		; xmm0=tmp2
+	paddw	xmm6,xmm5		; xmm6=tmp0
+	paddw	xmm7,xmm1		; xmm7=tmp1
+
+	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=tmp3
+	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movdqa	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm0,xmm5
+	psubw	xmm2,xmm1		; xmm2=z12
+	psubw	xmm5,xmm3		; xmm5=z10
+	paddw	xmm4,xmm1		; xmm4=z11
+	paddw	xmm0,xmm3		; xmm0=z13
+
+	movdqa	xmm1,xmm5		; xmm1=z10(unscaled)
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+
+	movdqa	xmm3,xmm4
+	psubw	xmm4,xmm0
+	paddw	xmm3,xmm0		; xmm3=tmp7
+
+	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm4,[rel PW_F1414]	; xmm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movdqa	xmm0,xmm5
+	paddw	xmm5,xmm2
+	pmulhw	xmm5,[rel PW_F1847]	; xmm5=z5
+	pmulhw	xmm0,[rel PW_MF1613]
+	pmulhw	xmm2,[rel PW_F1082]
+	psubw	xmm0,xmm1
+	psubw	xmm2,xmm5		; xmm2=tmp10
+	paddw	xmm0,xmm5		; xmm0=tmp12
+
+	; -- Final output stage
+
+	psubw	xmm0,xmm3		; xmm0=tmp6
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,xmm7
+	paddw	xmm6,xmm3		; xmm6=data0=(00 01 02 03 04 05 06 07)
+	paddw	xmm7,xmm0		; xmm7=data1=(10 11 12 13 14 15 16 17)
+	psubw	xmm1,xmm3		; xmm1=data7=(70 71 72 73 74 75 76 77)
+	psubw	xmm5,xmm0		; xmm5=data6=(60 61 62 63 64 65 66 67)
+	psubw	xmm4,xmm0		; xmm4=tmp5
+
+	movdqa    xmm3,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm3,xmm7		; xmm3=(04 14 05 15 06 16 07 17)
+	movdqa    xmm0,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm1		; xmm5=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm0,xmm1		; xmm0=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(60 70 61 71 62 72 63 73)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(64 74 65 75 66 76 67 77)
+
+	paddw	xmm2,xmm4		; xmm2=tmp4
+	movdqa	xmm5,xmm7
+	movdqa	xmm0,xmm1
+	paddw	xmm7,xmm4		; xmm7=data2=(20 21 22 23 24 25 26 27)
+	paddw	xmm1,xmm2		; xmm1=data4=(40 41 42 43 44 45 46 47)
+	psubw	xmm5,xmm4		; xmm5=data5=(50 51 52 53 54 55 56 57)
+	psubw	xmm0,xmm2		; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm0		; xmm7=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm4,xmm0		; xmm4=(24 34 25 35 26 36 27 37)
+	movdqa    xmm2,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm5		; xmm1=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm5		; xmm2=(44 54 45 55 46 56 47 57)
+
+	movdqa    xmm0,xmm3		; transpose coefficients(phase 2)
+	punpckldq xmm3,xmm4		; xmm3=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm0,xmm4		; xmm0=(06 16 26 36 07 17 27 37)
+	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm7		; xmm6=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm5,xmm7		; xmm5=(02 12 22 32 03 13 23 33)
+
+	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(60 70 61 71 62 72 63 73)
+	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=(64 74 65 75 66 76 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm3	; wk(0)=(04 14 24 34 05 15 25 35)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(06 16 26 36 07 17 27 37)
+
+	movdqa    xmm3,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm4		; xmm1=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm3,xmm4		; xmm3=(42 52 62 72 43 53 63 73)
+	movdqa    xmm0,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm7		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm0,xmm7		; xmm0=(46 56 66 76 47 57 67 77)
+
+	movdqa     xmm4,xmm6		; transpose coefficients(phase 3)
+	punpcklqdq xmm6,xmm1		; xmm6=col0=(00 10 20 30 40 50 60 70)
+	punpckhqdq xmm4,xmm1		; xmm4=col1=(01 11 21 31 41 51 61 71)
+	movdqa     xmm7,xmm5		; transpose coefficients(phase 3)
+	punpcklqdq xmm5,xmm3		; xmm5=col2=(02 12 22 32 42 52 62 72)
+	punpckhqdq xmm7,xmm3		; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(04 14 24 34 05 15 25 35)
+	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(06 16 26 36 07 17 27 37)
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=col1
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=col3
+
+	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm2		; xmm1=col4=(04 14 24 34 44 54 64 74)
+	punpckhqdq xmm4,xmm2		; xmm4=col5=(05 15 25 35 45 55 65 75)
+	movdqa     xmm7,xmm3		; transpose coefficients(phase 3)
+	punpcklqdq xmm3,xmm0		; xmm3=col6=(06 16 26 36 46 56 66 76)
+	punpckhqdq xmm7,xmm0		; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	rax, [original_rbp]
+	mov	rdi, r12	; (JSAMPROW *)
+	mov	rax, r13
+
+	; -- Even part
+
+	; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+	movdqa	xmm2,xmm6
+	movdqa	xmm0,xmm5
+	psubw	xmm6,xmm1		; xmm6=tmp11
+	psubw	xmm5,xmm3
+	paddw	xmm2,xmm1		; xmm2=tmp10
+	paddw	xmm0,xmm3		; xmm0=tmp13
+
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm5,[rel PW_F1414]
+	psubw	xmm5,xmm0		; xmm5=tmp12
+
+	movdqa	xmm1,xmm2
+	movdqa	xmm3,xmm6
+	psubw	xmm2,xmm0		; xmm2=tmp3
+	psubw	xmm6,xmm5		; xmm6=tmp2
+	paddw	xmm1,xmm0		; xmm1=tmp0
+	paddw	xmm3,xmm5		; xmm3=tmp1
+
+	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=col1
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=col3
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp3
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp2
+
+	; -- Odd part
+
+	; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	psubw	xmm0,xmm7		; xmm0=z12
+	psubw	xmm4,xmm5		; xmm4=z10
+	paddw	xmm2,xmm7		; xmm2=z11
+	paddw	xmm6,xmm5		; xmm6=z13
+
+	movdqa	xmm7,xmm4		; xmm7=z10(unscaled)
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
+
+	movdqa	xmm5,xmm2
+	psubw	xmm2,xmm6
+	paddw	xmm5,xmm6		; xmm5=tmp7
+
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm2,[rel PW_F1414]	; xmm2=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movdqa	xmm6,xmm4
+	paddw	xmm4,xmm0
+	pmulhw	xmm4,[rel PW_F1847]	; xmm4=z5
+	pmulhw	xmm6,[rel PW_MF1613]
+	pmulhw	xmm0,[rel PW_F1082]
+	psubw	xmm6,xmm7
+	psubw	xmm0,xmm4		; xmm0=tmp10
+	paddw	xmm6,xmm4		; xmm6=tmp12
+
+	; -- Final output stage
+
+	psubw	xmm6,xmm5		; xmm6=tmp6
+	movdqa	xmm7,xmm1
+	movdqa	xmm4,xmm3
+	paddw	xmm1,xmm5		; xmm1=data0=(00 10 20 30 40 50 60 70)
+	paddw	xmm3,xmm6		; xmm3=data1=(01 11 21 31 41 51 61 71)
+	psraw	xmm1,(PASS1_BITS+3)	; descale
+	psraw	xmm3,(PASS1_BITS+3)	; descale
+	psubw	xmm7,xmm5		; xmm7=data7=(07 17 27 37 47 57 67 77)
+	psubw	xmm4,xmm6		; xmm4=data6=(06 16 26 36 46 56 66 76)
+	psraw	xmm7,(PASS1_BITS+3)	; descale
+	psraw	xmm4,(PASS1_BITS+3)	; descale
+	psubw	xmm2,xmm6		; xmm2=tmp5
+
+	packsswb  xmm1,xmm4	; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	packsswb  xmm3,xmm7	; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp2
+	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=tmp3
+
+	paddw	xmm0,xmm2		; xmm0=tmp4
+	movdqa	xmm4,xmm5
+	movdqa	xmm7,xmm6
+	paddw	xmm5,xmm2		; xmm5=data2=(02 12 22 32 42 52 62 72)
+	paddw	xmm6,xmm0		; xmm6=data4=(04 14 24 34 44 54 64 74)
+	psraw	xmm5,(PASS1_BITS+3)	; descale
+	psraw	xmm6,(PASS1_BITS+3)	; descale
+	psubw	xmm4,xmm2		; xmm4=data5=(05 15 25 35 45 55 65 75)
+	psubw	xmm7,xmm0		; xmm7=data3=(03 13 23 33 43 53 63 73)
+	psraw	xmm4,(PASS1_BITS+3)	; descale
+	psraw	xmm7,(PASS1_BITS+3)	; descale
+
+	movdqa    xmm2,[rel PB_CENTERJSAMP]	; xmm2=[rel PB_CENTERJSAMP]
+
+	packsswb  xmm5,xmm6	; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+	packsswb  xmm7,xmm4	; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+	paddb     xmm1,xmm2
+	paddb     xmm3,xmm2
+	paddb     xmm5,xmm2
+	paddb     xmm7,xmm2
+
+	movdqa    xmm0,xmm1	; transpose coefficients(phase 1)
+	punpcklbw xmm1,xmm3	; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+	punpckhbw xmm0,xmm3	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+	movdqa    xmm6,xmm5	; transpose coefficients(phase 1)
+	punpcklbw xmm5,xmm7	; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+	punpckhbw xmm6,xmm7	; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+	movdqa    xmm4,xmm1	; transpose coefficients(phase 2)
+	punpcklwd xmm1,xmm5	; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm5	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+	movdqa    xmm2,xmm6	; transpose coefficients(phase 2)
+	punpcklwd xmm6,xmm0	; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+	punpckhwd xmm2,xmm0	; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+	movdqa    xmm3,xmm1	; transpose coefficients(phase 3)
+	punpckldq xmm1,xmm6	; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm3,xmm6	; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+	movdqa    xmm7,xmm4	; transpose coefficients(phase 3)
+	punpckldq xmm4,xmm2	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+	punpckhdq xmm7,xmm2	; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+	pshufd	xmm5,xmm1,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm0,xmm3,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+	pshufd	xmm6,xmm4,0x4E	; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+	pshufd	xmm2,xmm7,0x4E	; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+	mov	rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+	mov	rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jiss2fst.asm b/jpeg/simd/jiss2fst.asm
new file mode 100644
index 000000000000..b53664d7ca58
--- /dev/null
+++ b/jpeg/simd/jiss2fst.asm
@@ -0,0 +1,502 @@
+;
+; jiss2fst.asm - fast integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; fractional bits of the FIX() constants below (14 is also OK)
+%define PASS1_BITS	2	; has to agree with IFAST_SCALE_BITS (checked right below)
+
+%if PASS1_BITS != IFAST_SCALE_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082	equ	277				; FIX(1.082392200)
+F_1_414	equ	362				; FIX(1.414213562)
+F_1_847	equ	473				; FIX(1.847759065)
+F_2_613	equ	669				; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
+%else
+; No floating-point math at assembly time: round 2^30-scaled literals down to CONST_BITS.
+%define	DESCALE(v,s)  (((v) + (1 << ((s) - 1))) >> (s))
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)		; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)		; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)		; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)		; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - (1 << CONST_BITS))		; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; Constraints: PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow), and
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS	2	; psllw count applied to inputs ahead of pmulhw
+%define CONST_SHIFT		(16 - CONST_BITS - PRE_MULTIPLY_SCALE_BITS)
+
+	alignz	16
+	global	EXTN(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414	times 8 dw	(F_1_414 << CONST_SHIFT)	; 8 x FIX(1.414213562), pre-shifted
+PW_F1847	times 8 dw	(F_1_847 << CONST_SHIFT)	; 8 x FIX(1.847759065), pre-shifted
+PW_MF1613	times 8 dw	(-F_1_613 << CONST_SHIFT)	; 8 x -(FIX(2.613125930) - FIX(1)), pre-shifted
+PW_F1082	times 8 dw	(F_1_082 << CONST_SHIFT)	; 8 x FIX(1.082392200), pre-shifted
+PB_CENTERJSAMP	times 16 db	CENTERJSAMPLE		; 16 bytes of CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+; Argument offsets: b is the value of esp taken right after "push ebp".
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+; Locals: [ebp+0] keeps that b value; wk[0..WK_NUM-1] sit directly below the aligned ebp.
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_idct_ifast_sse2)
+; 32-bit SSE2 fast IDCT: dequantizes coef_block with dct_table, stores 8 rows of 8 samples via output_buf/output_col.
+EXTN(jsimd_idct_ifast_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (frame base used to reach the args)
+	sub	esp, byte 4			; room for the saved frame base
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; [original_ebp] = frame base
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[] below the aligned ebp
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+;	mov	eax, [original_ebp]		; not needed: eax still holds it from the prologue
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+	; Quick dword pre-check passed; now OR blocks 1..7 together to test all AC terms.
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm1,xmm0
+	packsswb xmm1,xmm1		; 8 words -> 8 bytes (saturation keeps nonzero nonzero)
+	packsswb xmm1,xmm1		; fold again so the low dword covers all 8 words
+	movd	eax,xmm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; same table type as .columnDCT
+
+	movdqa    xmm7,xmm0		; xmm0=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm7,xmm7		; xmm7=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm6,xmm0,0x00		; xmm6=col0=(00 00 00 00 00 00 00 00)
+	pshufd	xmm2,xmm0,0x55		; xmm2=col1=(01 01 01 01 01 01 01 01)
+	pshufd	xmm5,xmm0,0xAA		; xmm5=col2=(02 02 02 02 02 02 02 02)
+	pshufd	xmm0,xmm0,0xFF		; xmm0=col3=(03 03 03 03 03 03 03 03)
+	pshufd	xmm1,xmm7,0x00		; xmm1=col4=(04 04 04 04 04 04 04 04)
+	pshufd	xmm4,xmm7,0x55		; xmm4=col5=(05 05 05 05 05 05 05 05)
+	pshufd	xmm3,xmm7,0xAA		; xmm3=col6=(06 06 06 06 06 06 06 06)
+	pshufd	xmm7,xmm7,0xFF		; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=col1
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=col3
+	jmp	near .column_end	; same register/wk layout as the full pass 1 leaves
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	psubw	xmm0,xmm2		; xmm0=tmp11
+	psubw	xmm1,xmm3
+	paddw	xmm4,xmm2		; xmm4=tmp10
+	paddw	xmm5,xmm3		; xmm5=tmp13
+
+	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	xmm1,xmm5		; xmm1=tmp12
+
+	movdqa	xmm6,xmm4
+	movdqa	xmm7,xmm0
+	psubw	xmm4,xmm5		; xmm4=tmp3
+	psubw	xmm0,xmm1		; xmm0=tmp2
+	paddw	xmm6,xmm5		; xmm6=tmp0
+	paddw	xmm7,xmm1		; xmm7=tmp1
+
+	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=tmp3
+	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movdqa	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm0,xmm5
+	psubw	xmm2,xmm1		; xmm2=z12
+	psubw	xmm5,xmm3		; xmm5=z10
+	paddw	xmm4,xmm1		; xmm4=z11
+	paddw	xmm0,xmm3		; xmm0=z13
+
+	movdqa	xmm1,xmm5		; xmm1=z10(unscaled)
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+
+	movdqa	xmm3,xmm4
+	psubw	xmm4,xmm0
+	paddw	xmm3,xmm0		; xmm3=tmp7
+
+	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F1414)]	; xmm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movdqa	xmm0,xmm5
+	paddw	xmm5,xmm2
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F1847)]	; xmm5=z5
+	pmulhw	xmm0,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	xmm2,[GOTOFF(ebx,PW_F1082)]
+	psubw	xmm0,xmm1
+	psubw	xmm2,xmm5		; xmm2=tmp10
+	paddw	xmm0,xmm5		; xmm0=tmp12
+
+	; -- Final output stage
+
+	psubw	xmm0,xmm3		; xmm0=tmp6
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,xmm7
+	paddw	xmm6,xmm3		; xmm6=data0=(00 01 02 03 04 05 06 07)
+	paddw	xmm7,xmm0		; xmm7=data1=(10 11 12 13 14 15 16 17)
+	psubw	xmm1,xmm3		; xmm1=data7=(70 71 72 73 74 75 76 77)
+	psubw	xmm5,xmm0		; xmm5=data6=(60 61 62 63 64 65 66 67)
+	psubw	xmm4,xmm0		; xmm4=tmp5
+
+	movdqa    xmm3,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm3,xmm7		; xmm3=(04 14 05 15 06 16 07 17)
+	movdqa    xmm0,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm1		; xmm5=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm0,xmm1		; xmm0=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(60 70 61 71 62 72 63 73)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(64 74 65 75 66 76 67 77)
+
+	paddw	xmm2,xmm4		; xmm2=tmp4
+	movdqa	xmm5,xmm7
+	movdqa	xmm0,xmm1
+	paddw	xmm7,xmm4		; xmm7=data2=(20 21 22 23 24 25 26 27)
+	paddw	xmm1,xmm2		; xmm1=data4=(40 41 42 43 44 45 46 47)
+	psubw	xmm5,xmm4		; xmm5=data5=(50 51 52 53 54 55 56 57)
+	psubw	xmm0,xmm2		; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm0		; xmm7=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm4,xmm0		; xmm4=(24 34 25 35 26 36 27 37)
+	movdqa    xmm2,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm5		; xmm1=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm5		; xmm2=(44 54 45 55 46 56 47 57)
+
+	movdqa    xmm0,xmm3		; transpose coefficients(phase 2)
+	punpckldq xmm3,xmm4		; xmm3=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm0,xmm4		; xmm0=(06 16 26 36 07 17 27 37)
+	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm7		; xmm6=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm5,xmm7		; xmm5=(02 12 22 32 03 13 23 33)
+
+	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(60 70 61 71 62 72 63 73)
+	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=(64 74 65 75 66 76 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm3	; wk(0)=(04 14 24 34 05 15 25 35)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(06 16 26 36 07 17 27 37)
+
+	movdqa    xmm3,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm4		; xmm1=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm3,xmm4		; xmm3=(42 52 62 72 43 53 63 73)
+	movdqa    xmm0,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm7		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm0,xmm7		; xmm0=(46 56 66 76 47 57 67 77)
+
+	movdqa     xmm4,xmm6		; transpose coefficients(phase 3)
+	punpcklqdq xmm6,xmm1		; xmm6=col0=(00 10 20 30 40 50 60 70)
+	punpckhqdq xmm4,xmm1		; xmm4=col1=(01 11 21 31 41 51 61 71)
+	movdqa     xmm7,xmm5		; transpose coefficients(phase 3)
+	punpcklqdq xmm5,xmm3		; xmm5=col2=(02 12 22 32 42 52 62 72)
+	punpckhqdq xmm7,xmm3		; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(04 14 24 34 05 15 25 35)
+	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(06 16 26 36 07 17 27 37)
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=col1
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=col3
+
+	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm2		; xmm1=col4=(04 14 24 34 44 54 64 74)
+	punpckhqdq xmm4,xmm2		; xmm4=col5=(05 15 25 35 45 55 65 75)
+	movdqa     xmm7,xmm3		; transpose coefficients(phase 3)
+	punpcklqdq xmm3,xmm0		; xmm3=col6=(06 16 26 36 46 56 66 76)
+	punpckhqdq xmm7,xmm0		; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]			; reload the frame base saved in the prologue
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]	; eax = output_col from here on
+
+	; -- Even part
+
+	; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+	movdqa	xmm2,xmm6
+	movdqa	xmm0,xmm5
+	psubw	xmm6,xmm1		; xmm6=tmp11
+	psubw	xmm5,xmm3
+	paddw	xmm2,xmm1		; xmm2=tmp10
+	paddw	xmm0,xmm3		; xmm0=tmp13
+
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F1414)]
+	psubw	xmm5,xmm0		; xmm5=tmp12
+
+	movdqa	xmm1,xmm2
+	movdqa	xmm3,xmm6
+	psubw	xmm2,xmm0		; xmm2=tmp3
+	psubw	xmm6,xmm5		; xmm6=tmp2
+	paddw	xmm1,xmm0		; xmm1=tmp0
+	paddw	xmm3,xmm5		; xmm3=tmp1
+
+	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=col1
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=col3
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp3
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp2
+
+	; -- Odd part
+
+	; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	psubw	xmm0,xmm7		; xmm0=z12
+	psubw	xmm4,xmm5		; xmm4=z10
+	paddw	xmm2,xmm7		; xmm2=z11
+	paddw	xmm6,xmm5		; xmm6=z13
+
+	movdqa	xmm7,xmm4		; xmm7=z10(unscaled)
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
+
+	movdqa	xmm5,xmm2
+	psubw	xmm2,xmm6
+	paddw	xmm5,xmm6		; xmm5=tmp7
+
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm2,[GOTOFF(ebx,PW_F1414)]	; xmm2=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movdqa	xmm6,xmm4
+	paddw	xmm4,xmm0
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F1847)]	; xmm4=z5
+	pmulhw	xmm6,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	xmm0,[GOTOFF(ebx,PW_F1082)]
+	psubw	xmm6,xmm7
+	psubw	xmm0,xmm4		; xmm0=tmp10
+	paddw	xmm6,xmm4		; xmm6=tmp12
+
+	; -- Final output stage
+
+	psubw	xmm6,xmm5		; xmm6=tmp6
+	movdqa	xmm7,xmm1
+	movdqa	xmm4,xmm3
+	paddw	xmm1,xmm5		; xmm1=data0=(00 10 20 30 40 50 60 70)
+	paddw	xmm3,xmm6		; xmm3=data1=(01 11 21 31 41 51 61 71)
+	psraw	xmm1,(PASS1_BITS+3)	; descale
+	psraw	xmm3,(PASS1_BITS+3)	; descale
+	psubw	xmm7,xmm5		; xmm7=data7=(07 17 27 37 47 57 67 77)
+	psubw	xmm4,xmm6		; xmm4=data6=(06 16 26 36 46 56 66 76)
+	psraw	xmm7,(PASS1_BITS+3)	; descale
+	psraw	xmm4,(PASS1_BITS+3)	; descale
+	psubw	xmm2,xmm6		; xmm2=tmp5
+
+	packsswb  xmm1,xmm4	; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	packsswb  xmm3,xmm7	; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp2
+	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=tmp3
+
+	paddw	xmm0,xmm2		; xmm0=tmp4
+	movdqa	xmm4,xmm5
+	movdqa	xmm7,xmm6
+	paddw	xmm5,xmm2		; xmm5=data2=(02 12 22 32 42 52 62 72)
+	paddw	xmm6,xmm0		; xmm6=data4=(04 14 24 34 44 54 64 74)
+	psraw	xmm5,(PASS1_BITS+3)	; descale
+	psraw	xmm6,(PASS1_BITS+3)	; descale
+	psubw	xmm4,xmm2		; xmm4=data5=(05 15 25 35 45 55 65 75)
+	psubw	xmm7,xmm0		; xmm7=data3=(03 13 23 33 43 53 63 73)
+	psraw	xmm4,(PASS1_BITS+3)	; descale
+	psraw	xmm7,(PASS1_BITS+3)	; descale
+
+	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
+
+	packsswb  xmm5,xmm6	; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+	packsswb  xmm7,xmm4	; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+	paddb     xmm1,xmm2	; add CENTERJSAMPLE to every packed (signed-saturated) byte
+	paddb     xmm3,xmm2
+	paddb     xmm5,xmm2
+	paddb     xmm7,xmm2
+
+	movdqa    xmm0,xmm1	; transpose coefficients(phase 1)
+	punpcklbw xmm1,xmm3	; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+	punpckhbw xmm0,xmm3	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+	movdqa    xmm6,xmm5	; transpose coefficients(phase 1)
+	punpcklbw xmm5,xmm7	; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+	punpckhbw xmm6,xmm7	; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+	movdqa    xmm4,xmm1	; transpose coefficients(phase 2)
+	punpcklwd xmm1,xmm5	; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm5	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+	movdqa    xmm2,xmm6	; transpose coefficients(phase 2)
+	punpcklwd xmm6,xmm0	; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+	punpckhwd xmm2,xmm0	; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+	movdqa    xmm3,xmm1	; transpose coefficients(phase 3)
+	punpckldq xmm1,xmm6	; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm3,xmm6	; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+	movdqa    xmm7,xmm4	; transpose coefficients(phase 3)
+	punpckldq xmm4,xmm2	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+	punpckhdq xmm7,xmm2	; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+	pshufd	xmm5,xmm1,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm0,xmm3,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+	pshufd	xmm6,xmm4,0x4E	; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+	pshufd	xmm2,xmm7,0x4E	; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+	; movq stores the low 8 bytes: rows 0/2/4/6 from xmm1/3/4/7, rows 1/3/5/7 from the swapped copies.
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+	mov	edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+	mov	edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jiss2int-64.asm b/jpeg/simd/jiss2int-64.asm
new file mode 100644
index 000000000000..13764d6ae823
--- /dev/null
+++ b/jpeg/simd/jiss2int-64.asm
@@ -0,0 +1,848 @@
+;
+; jiss2int-64.asm - accurate integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
+; F_x_yyy = FIX(x.yyy): the multiplier x.yyy scaled by 2^CONST_BITS and rounded.
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value is given pre-scaled by 2^30 and rounded down to CONST_BITS fractional bits.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+; PW_*: a word pair (a,b) repeated 4x; pmaddwd against interleaved words (x,y) yields a*x+b*y in each dword.
+PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)	; rounding bias added before the pass 1 right shift
+PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)	; rounding bias added before the pass 2 right shift
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; added to every packed output byte at the end of pass 2
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table  (used below as quantptr)
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp	rbp+0		; slot that holds the pre-alignment rsp
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		12
+
+	align	16
+	global	EXTN(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = rsp after saving caller's rbp
+	sub	rsp, byte 8			; room for the 8-byte saved rsp
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax			; [original_rbp] = unaligned rsp
+	mov	rbp,rsp				; rbp = aligned frame base
+	lea	rsp, [wk(0)]			; allocate wk[WK_NUM] below rbp
+	collect_args				; arguments -> r10..r13 as listed above
+
+	; ---- Pass 1: process columns from input.
+
+	mov	rdx, r10	; quantptr
+	mov	rsi, r11		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]	; cheap early-out on the
+	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]	; start of rows 1 and 2
+	jnz	near .columnDCT
+	; OR rows 1..7 together; the result is zero only if every AC term is zero.
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1,xmm0
+	packsswb xmm1,xmm1		; saturating packs keep nonzero lanes nonzero,
+	packsswb xmm1,xmm1		; so all 8 words fold into the low 32 bits
+	movd	eax,xmm1
+	test	rax,rax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	xmm5,PASS1_BITS
+
+	movdqa    xmm4,xmm5		; xmm5=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm5,xmm5		; xmm5=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm4,xmm4		; xmm4=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm7,xmm5,0x00		; xmm7=col0=(00 00 00 00 00 00 00 00)
+	pshufd	xmm6,xmm5,0x55		; xmm6=col1=(01 01 01 01 01 01 01 01)
+	pshufd	xmm1,xmm5,0xAA		; xmm1=col2=(02 02 02 02 02 02 02 02)
+	pshufd	xmm5,xmm5,0xFF		; xmm5=col3=(03 03 03 03 03 03 03 03)
+	pshufd	xmm0,xmm4,0x00		; xmm0=col4=(04 04 04 04 04 04 04 04)
+	pshufd	xmm3,xmm4,0x55		; xmm3=col5=(05 05 05 05 05 05 05 05)
+	pshufd	xmm2,xmm4,0xAA		; xmm2=col6=(06 06 06 06 06 06 06 06)
+	pshufd	xmm4,xmm4,0xFF		; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+	movdqa	XMMWORD [wk(8)], xmm6	; wk(8)=col1
+	movdqa	XMMWORD [wk(9)], xmm5	; wk(9)=col3
+	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
+	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
+	jmp	near .column_end
+%endif
+.columnDCT:
+	; General case: dequantize and run the 1-D IDCT on all eight columns at once.
+	; -- Even part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movdqa    xmm4,xmm1		; xmm1=in2=z2
+	movdqa    xmm5,xmm1
+	punpcklwd xmm4,xmm3		; xmm3=in6=z3
+	punpckhwd xmm5,xmm3
+	movdqa    xmm1,xmm4
+	movdqa    xmm3,xmm5
+	pmaddwd   xmm4,[rel PW_F130_F054]	; xmm4=tmp3L
+	pmaddwd   xmm5,[rel PW_F130_F054]	; xmm5=tmp3H
+	pmaddwd   xmm1,[rel PW_F054_MF130]	; xmm1=tmp2L
+	pmaddwd   xmm3,[rel PW_F054_MF130]	; xmm3=tmp2H
+
+	movdqa    xmm6,xmm0
+	paddw     xmm0,xmm2		; xmm0=in0+in4
+	psubw     xmm6,xmm2		; xmm6=in0-in4
+
+	pxor      xmm7,xmm7
+	pxor      xmm2,xmm2
+	punpcklwd xmm7,xmm0		; xmm7=tmp0L
+	punpckhwd xmm2,xmm0		; xmm2=tmp0H
+	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
+	psrad     xmm2,(16-CONST_BITS)	; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+	movdqa	xmm0,xmm7
+	paddd	xmm7,xmm4		; xmm7=tmp10L
+	psubd	xmm0,xmm4		; xmm0=tmp13L
+	movdqa	xmm4,xmm2
+	paddd	xmm2,xmm5		; xmm2=tmp10H
+	psubd	xmm4,xmm5		; xmm4=tmp13H
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=tmp10L
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=tmp10H
+	movdqa	XMMWORD [wk(2)], xmm0	; wk(2)=tmp13L
+	movdqa	XMMWORD [wk(3)], xmm4	; wk(3)=tmp13H
+
+	pxor      xmm5,xmm5
+	pxor      xmm7,xmm7
+	punpcklwd xmm5,xmm6		; xmm5=tmp1L
+	punpckhwd xmm7,xmm6		; xmm7=tmp1H
+	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
+	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+	movdqa	xmm2,xmm5
+	paddd	xmm5,xmm1		; xmm5=tmp11L
+	psubd	xmm2,xmm1		; xmm2=tmp12L
+	movdqa	xmm0,xmm7
+	paddd	xmm7,xmm3		; xmm7=tmp11H
+	psubd	xmm0,xmm3		; xmm0=tmp12H
+
+	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
+	movdqa	XMMWORD [wk(5)], xmm7	; wk(5)=tmp11H
+	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=tmp12L
+	movdqa	XMMWORD [wk(7)], xmm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movdqa	xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm7,xmm4
+	paddw	xmm5,xmm3		; xmm5=z3
+	paddw	xmm7,xmm1		; xmm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm2,xmm5
+	movdqa    xmm0,xmm5
+	punpcklwd xmm2,xmm7
+	punpckhwd xmm0,xmm7
+	movdqa    xmm5,xmm2
+	movdqa    xmm7,xmm0
+	pmaddwd   xmm2,[rel PW_MF078_F117]	; xmm2=z3L
+	pmaddwd   xmm0,[rel PW_MF078_F117]	; xmm0=z3H
+	pmaddwd   xmm5,[rel PW_F117_F078]	; xmm5=z4L
+	pmaddwd   xmm7,[rel PW_F117_F078]	; xmm7=z4H
+
+	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=z3L
+	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movdqa    xmm2,xmm3
+	movdqa    xmm0,xmm3
+	punpcklwd xmm2,xmm4
+	punpckhwd xmm0,xmm4
+	movdqa    xmm3,xmm2
+	movdqa    xmm4,xmm0
+	pmaddwd   xmm2,[rel PW_MF060_MF089]	; xmm2=tmp0L
+	pmaddwd   xmm0,[rel PW_MF060_MF089]	; xmm0=tmp0H
+	pmaddwd   xmm3,[rel PW_MF089_F060]	; xmm3=tmp3L
+	pmaddwd   xmm4,[rel PW_MF089_F060]	; xmm4=tmp3H
+
+	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp0L
+	paddd	xmm0, XMMWORD [wk(11)]	; xmm0=tmp0H
+	paddd	xmm3,xmm5		; xmm3=tmp3L
+	paddd	xmm4,xmm7		; xmm4=tmp3H
+
+	movdqa	XMMWORD [wk(8)], xmm2	; wk(8)=tmp0L
+	movdqa	XMMWORD [wk(9)], xmm0	; wk(9)=tmp0H
+
+	movdqa    xmm2,xmm1
+	movdqa    xmm0,xmm1
+	punpcklwd xmm2,xmm6
+	punpckhwd xmm0,xmm6
+	movdqa    xmm1,xmm2
+	movdqa    xmm6,xmm0
+	pmaddwd   xmm2,[rel PW_MF050_MF256]	; xmm2=tmp1L
+	pmaddwd   xmm0,[rel PW_MF050_MF256]	; xmm0=tmp1H
+	pmaddwd   xmm1,[rel PW_MF256_F050]	; xmm1=tmp2L
+	pmaddwd   xmm6,[rel PW_MF256_F050]	; xmm6=tmp2H
+
+	paddd	xmm2,xmm5		; xmm2=tmp1L
+	paddd	xmm0,xmm7		; xmm0=tmp1H
+	paddd	xmm1, XMMWORD [wk(10)]	; xmm1=tmp2L
+	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
+
+	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=tmp1L
+	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
+	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=tmp10H
+
+	movdqa	xmm2,xmm5
+	movdqa	xmm0,xmm7
+	paddd	xmm5,xmm3		; xmm5=data0L
+	paddd	xmm7,xmm4		; xmm7=data0H
+	psubd	xmm2,xmm3		; xmm2=data7L
+	psubd	xmm0,xmm4		; xmm0=data7H
+
+	movdqa	xmm3,[rel PD_DESCALE_P1]	; xmm3=rounding bias 1<<(DESCALE_P1-1)
+
+	paddd	xmm5,xmm3
+	paddd	xmm7,xmm3
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm7,DESCALE_P1
+	paddd	xmm2,xmm3
+	paddd	xmm0,xmm3
+	psrad	xmm2,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm5,xmm7		; xmm5=data0=(00 01 02 03 04 05 06 07)
+	packssdw  xmm2,xmm0		; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+	movdqa	xmm4, XMMWORD [wk(4)]	; xmm4=tmp11L
+	movdqa	xmm3, XMMWORD [wk(5)]	; xmm3=tmp11H
+
+	movdqa	xmm7,xmm4
+	movdqa	xmm0,xmm3
+	paddd	xmm4,xmm1		; xmm4=data1L
+	paddd	xmm3,xmm6		; xmm3=data1H
+	psubd	xmm7,xmm1		; xmm7=data6L
+	psubd	xmm0,xmm6		; xmm0=data6H
+
+	movdqa	xmm1,[rel PD_DESCALE_P1]	; xmm1=rounding bias 1<<(DESCALE_P1-1)
+
+	paddd	xmm4,xmm1
+	paddd	xmm3,xmm1
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm3,DESCALE_P1
+	paddd	xmm7,xmm1
+	paddd	xmm0,xmm1
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm4,xmm3		; xmm4=data1=(10 11 12 13 14 15 16 17)
+	packssdw  xmm7,xmm0		; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+	movdqa    xmm6,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm4		; xmm5=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm6,xmm4		; xmm6=(04 14 05 15 06 16 07 17)
+	movdqa    xmm1,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm2		; xmm7=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm1,xmm2		; xmm1=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm3, XMMWORD [wk(6)]	; xmm3=tmp12L
+	movdqa	xmm0, XMMWORD [wk(7)]	; xmm0=tmp12H
+	movdqa	xmm4, XMMWORD [wk(10)]	; xmm4=tmp1L
+	movdqa	xmm2, XMMWORD [wk(11)]	; xmm2=tmp1H
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 01 11 02 12 03 13)
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=(04 14 05 15 06 16 07 17)
+	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=(60 70 61 71 62 72 63 73)
+	movdqa	XMMWORD [wk(5)], xmm1	; wk(5)=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm5,xmm3
+	movdqa	xmm6,xmm0
+	paddd	xmm3,xmm4		; xmm3=data2L
+	paddd	xmm0,xmm2		; xmm0=data2H
+	psubd	xmm5,xmm4		; xmm5=data5L
+	psubd	xmm6,xmm2		; xmm6=data5H
+
+	movdqa	xmm7,[rel PD_DESCALE_P1]	; xmm7=rounding bias 1<<(DESCALE_P1-1)
+
+	paddd	xmm3,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm3,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+	paddd	xmm5,xmm7
+	paddd	xmm6,xmm7
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+
+	packssdw  xmm3,xmm0		; xmm3=data2=(20 21 22 23 24 25 26 27)
+	packssdw  xmm5,xmm6		; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+	movdqa	xmm1, XMMWORD [wk(2)]	; xmm1=tmp13L
+	movdqa	xmm4, XMMWORD [wk(3)]	; xmm4=tmp13H
+	movdqa	xmm2, XMMWORD [wk(8)]	; xmm2=tmp0L
+	movdqa	xmm7, XMMWORD [wk(9)]	; xmm7=tmp0H
+
+	movdqa	xmm0,xmm1
+	movdqa	xmm6,xmm4
+	paddd	xmm1,xmm2		; xmm1=data3L
+	paddd	xmm4,xmm7		; xmm4=data3H
+	psubd	xmm0,xmm2		; xmm0=data4L
+	psubd	xmm6,xmm7		; xmm6=data4H
+
+	movdqa	xmm2,[rel PD_DESCALE_P1]	; xmm2=rounding bias 1<<(DESCALE_P1-1)
+
+	paddd	xmm1,xmm2
+	paddd	xmm4,xmm2
+	psrad	xmm1,DESCALE_P1
+	psrad	xmm4,DESCALE_P1
+	paddd	xmm0,xmm2
+	paddd	xmm6,xmm2
+	psrad	xmm0,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+
+	packssdw  xmm1,xmm4		; xmm1=data3=(30 31 32 33 34 35 36 37)
+	packssdw  xmm0,xmm6		; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 01 11 02 12 03 13)
+	movdqa	xmm2, XMMWORD [wk(1)]	; xmm2=(04 14 05 15 06 16 07 17)
+
+	movdqa    xmm4,xmm3		; transpose coefficients(phase 1)
+	punpcklwd xmm3,xmm1		; xmm3=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm4,xmm1		; xmm4=(24 34 25 35 26 36 27 37)
+	movdqa    xmm6,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm5		; xmm0=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm6,xmm5		; xmm6=(44 54 45 55 46 56 47 57)
+
+	movdqa    xmm1,xmm7		; transpose coefficients(phase 2)
+	punpckldq xmm7,xmm3		; xmm7=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm1,xmm3		; xmm1=(02 12 22 32 03 13 23 33)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm4		; xmm2=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm5,xmm4		; xmm5=(06 16 26 36 07 17 27 37)
+
+	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=(60 70 61 71 62 72 63 73)
+	movdqa	xmm4, XMMWORD [wk(5)]	; xmm4=(64 74 65 75 66 76 67 77)
+
+	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=(04 14 24 34 05 15 25 35)
+	movdqa	XMMWORD [wk(7)], xmm5	; wk(7)=(06 16 26 36 07 17 27 37)
+
+	movdqa    xmm2,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm3		; xmm0=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm2,xmm3		; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm4		; xmm6=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm5,xmm4		; xmm5=(46 56 66 76 47 57 67 77)
+
+	movdqa     xmm3,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm0		; xmm7=col0=(00 10 20 30 40 50 60 70)
+	punpckhqdq xmm3,xmm0		; xmm3=col1=(01 11 21 31 41 51 61 71)
+	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm2		; xmm1=col2=(02 12 22 32 42 52 62 72)
+	punpckhqdq xmm4,xmm2		; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+	movdqa	xmm0, XMMWORD [wk(6)]	; xmm0=(04 14 24 34 05 15 25 35)
+	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa	XMMWORD [wk(8)], xmm3	; wk(8)=col1
+	movdqa	XMMWORD [wk(9)], xmm4	; wk(9)=col3
+
+	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=col4=(04 14 24 34 44 54 64 74)
+	punpckhqdq xmm3,xmm6		; xmm3=col5=(05 15 25 35 45 55 65 75)
+	movdqa     xmm4,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm5		; xmm2=col6=(06 16 26 36 46 56 66 76)
+	punpckhqdq xmm4,xmm5		; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
+	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	; output_col is a JDIMENSION (32-bit in libjpeg): r13's upper half may hold garbage.
+	mov	rdi, r12	; (JSAMPROW *)
+	mov	eax, r13d	; rax = output_col, zero-extended for use as an index
+
+	; -- Even part
+
+	; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movdqa    xmm6,xmm1		; xmm1=in2=z2
+	movdqa    xmm5,xmm1
+	punpcklwd xmm6,xmm2		; xmm2=in6=z3
+	punpckhwd xmm5,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm2,xmm5
+	pmaddwd   xmm6,[rel PW_F130_F054]	; xmm6=tmp3L
+	pmaddwd   xmm5,[rel PW_F130_F054]	; xmm5=tmp3H
+	pmaddwd   xmm1,[rel PW_F054_MF130]	; xmm1=tmp2L
+	pmaddwd   xmm2,[rel PW_F054_MF130]	; xmm2=tmp2H
+
+	movdqa    xmm3,xmm7
+	paddw     xmm7,xmm0		; xmm7=in0+in4
+	psubw     xmm3,xmm0		; xmm3=in0-in4
+
+	pxor      xmm4,xmm4
+	pxor      xmm0,xmm0
+	punpcklwd xmm4,xmm7		; xmm4=tmp0L
+	punpckhwd xmm0,xmm7		; xmm0=tmp0H
+	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
+	psrad     xmm0,(16-CONST_BITS)	; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+	movdqa	xmm7,xmm4
+	paddd	xmm4,xmm6		; xmm4=tmp10L
+	psubd	xmm7,xmm6		; xmm7=tmp13L
+	movdqa	xmm6,xmm0
+	paddd	xmm0,xmm5		; xmm0=tmp10H
+	psubd	xmm6,xmm5		; xmm6=tmp13H
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=tmp10L
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp10H
+	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=tmp13L
+	movdqa	XMMWORD [wk(3)], xmm6	; wk(3)=tmp13H
+
+	pxor      xmm5,xmm5
+	pxor      xmm4,xmm4
+	punpcklwd xmm5,xmm3		; xmm5=tmp1L
+	punpckhwd xmm4,xmm3		; xmm4=tmp1H
+	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
+	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+	movdqa	xmm0,xmm5
+	paddd	xmm5,xmm1		; xmm5=tmp11L
+	psubd	xmm0,xmm1		; xmm0=tmp12L
+	movdqa	xmm7,xmm4
+	paddd	xmm4,xmm2		; xmm4=tmp11H
+	psubd	xmm7,xmm2		; xmm7=tmp12H
+
+	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=tmp11H
+	movdqa	XMMWORD [wk(6)], xmm0	; wk(6)=tmp12L
+	movdqa	XMMWORD [wk(7)], xmm7	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movdqa	xmm6, XMMWORD [wk(9)]	; xmm6=col3
+	movdqa	xmm3, XMMWORD [wk(8)]	; xmm3=col1
+	movdqa	xmm1, XMMWORD [wk(11)]	; xmm1=col7
+	movdqa	xmm2, XMMWORD [wk(10)]	; xmm2=col5
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm4,xmm3
+	paddw	xmm5,xmm1		; xmm5=z3
+	paddw	xmm4,xmm2		; xmm4=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm0,xmm5
+	movdqa    xmm7,xmm5
+	punpcklwd xmm0,xmm4
+	punpckhwd xmm7,xmm4
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm7
+	pmaddwd   xmm0,[rel PW_MF078_F117]	; xmm0=z3L
+	pmaddwd   xmm7,[rel PW_MF078_F117]	; xmm7=z3H
+	pmaddwd   xmm5,[rel PW_F117_F078]	; xmm5=z4L
+	pmaddwd   xmm4,[rel PW_F117_F078]	; xmm4=z4H
+
+	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=z3L
+	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movdqa    xmm0,xmm1
+	movdqa    xmm7,xmm1
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm7,xmm3
+	movdqa    xmm1,xmm0
+	movdqa    xmm3,xmm7
+	pmaddwd   xmm0,[rel PW_MF060_MF089]	; xmm0=tmp0L
+	pmaddwd   xmm7,[rel PW_MF060_MF089]	; xmm7=tmp0H
+	pmaddwd   xmm1,[rel PW_MF089_F060]	; xmm1=tmp3L
+	pmaddwd   xmm3,[rel PW_MF089_F060]	; xmm3=tmp3H
+
+	paddd	xmm0, XMMWORD [wk(10)]	; xmm0=tmp0L
+	paddd	xmm7, XMMWORD [wk(11)]	; xmm7=tmp0H
+	paddd	xmm1,xmm5		; xmm1=tmp3L
+	paddd	xmm3,xmm4		; xmm3=tmp3H
+
+	movdqa	XMMWORD [wk(8)], xmm0	; wk(8)=tmp0L
+	movdqa	XMMWORD [wk(9)], xmm7	; wk(9)=tmp0H
+
+	movdqa    xmm0,xmm2
+	movdqa    xmm7,xmm2
+	punpcklwd xmm0,xmm6
+	punpckhwd xmm7,xmm6
+	movdqa    xmm2,xmm0
+	movdqa    xmm6,xmm7
+	pmaddwd   xmm0,[rel PW_MF050_MF256]	; xmm0=tmp1L
+	pmaddwd   xmm7,[rel PW_MF050_MF256]	; xmm7=tmp1H
+	pmaddwd   xmm2,[rel PW_MF256_F050]	; xmm2=tmp2L
+	pmaddwd   xmm6,[rel PW_MF256_F050]	; xmm6=tmp2H
+
+	paddd	xmm0,xmm5		; xmm0=tmp1L
+	paddd	xmm7,xmm4		; xmm7=tmp1H
+	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp2L
+	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
+
+	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=tmp1L
+	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
+	movdqa	xmm4, XMMWORD [wk(1)]	; xmm4=tmp10H
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm7,xmm4
+	paddd	xmm5,xmm1		; xmm5=data0L
+	paddd	xmm4,xmm3		; xmm4=data0H
+	psubd	xmm0,xmm1		; xmm0=data7L
+	psubd	xmm7,xmm3		; xmm7=data7H
+
+	movdqa	xmm1,[rel PD_DESCALE_P2]	; xmm1=rounding bias 1<<(DESCALE_P2-1)
+
+	paddd	xmm5,xmm1
+	paddd	xmm4,xmm1
+	psrad	xmm5,DESCALE_P2
+	psrad	xmm4,DESCALE_P2
+	paddd	xmm0,xmm1
+	paddd	xmm7,xmm1
+	psrad	xmm0,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm5,xmm4		; xmm5=data0=(00 10 20 30 40 50 60 70)
+	packssdw  xmm0,xmm7		; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=tmp11L
+	movdqa	xmm1, XMMWORD [wk(5)]	; xmm1=tmp11H
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm7,xmm1
+	paddd	xmm3,xmm2		; xmm3=data1L
+	paddd	xmm1,xmm6		; xmm1=data1H
+	psubd	xmm4,xmm2		; xmm4=data6L
+	psubd	xmm7,xmm6		; xmm7=data6H
+
+	movdqa	xmm2,[rel PD_DESCALE_P2]	; xmm2=rounding bias 1<<(DESCALE_P2-1)
+
+	paddd	xmm3,xmm2
+	paddd	xmm1,xmm2
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm4,xmm2
+	paddd	xmm7,xmm2
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm3,xmm1		; xmm3=data1=(01 11 21 31 41 51 61 71)
+	packssdw  xmm4,xmm7		; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+	packsswb  xmm5,xmm4		; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	packsswb  xmm3,xmm0		; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm6, XMMWORD [wk(6)]	; xmm6=tmp12L
+	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=tmp12H
+	movdqa	xmm1, XMMWORD [wk(10)]	; xmm1=tmp1L
+	movdqa	xmm7, XMMWORD [wk(11)]	; xmm7=tmp1H
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm4,xmm6
+	movdqa	xmm0,xmm2
+	paddd	xmm6,xmm1		; xmm6=data2L
+	paddd	xmm2,xmm7		; xmm2=data2H
+	psubd	xmm4,xmm1		; xmm4=data5L
+	psubd	xmm0,xmm7		; xmm0=data5H
+
+	movdqa	xmm5,[rel PD_DESCALE_P2]	; xmm5=rounding bias 1<<(DESCALE_P2-1)
+
+	paddd	xmm6,xmm5
+	paddd	xmm2,xmm5
+	psrad	xmm6,DESCALE_P2
+	psrad	xmm2,DESCALE_P2
+	paddd	xmm4,xmm5
+	paddd	xmm0,xmm5
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm0,DESCALE_P2
+
+	packssdw  xmm6,xmm2		; xmm6=data2=(02 12 22 32 42 52 62 72)
+	packssdw  xmm4,xmm0		; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+	movdqa	xmm3, XMMWORD [wk(2)]	; xmm3=tmp13L
+	movdqa	xmm1, XMMWORD [wk(3)]	; xmm1=tmp13H
+	movdqa	xmm7, XMMWORD [wk(8)]	; xmm7=tmp0L
+	movdqa	xmm5, XMMWORD [wk(9)]	; xmm5=tmp0H
+
+	movdqa	xmm2,xmm3
+	movdqa	xmm0,xmm1
+	paddd	xmm3,xmm7		; xmm3=data3L
+	paddd	xmm1,xmm5		; xmm1=data3H
+	psubd	xmm2,xmm7		; xmm2=data4L
+	psubd	xmm0,xmm5		; xmm0=data4H
+
+	movdqa	xmm7,[rel PD_DESCALE_P2]	; xmm7=rounding bias 1<<(DESCALE_P2-1)
+
+	paddd	xmm3,xmm7
+	paddd	xmm1,xmm7
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm2,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm2,DESCALE_P2
+	psrad	xmm0,DESCALE_P2
+
+	movdqa    xmm5,[rel PB_CENTERJSAMP]	; xmm5=CENTERJSAMPLE in every byte
+
+	packssdw  xmm3,xmm1		; xmm3=data3=(03 13 23 33 43 53 63 73)
+	packssdw  xmm2,xmm0		; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+	movdqa    xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	packsswb  xmm6,xmm2		; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+	packsswb  xmm3,xmm4		; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+	paddb     xmm7,xmm5
+	paddb     xmm1,xmm5
+	paddb     xmm6,xmm5
+	paddb     xmm3,xmm5
+
+	movdqa    xmm0,xmm7	; transpose coefficients(phase 1)
+	punpcklbw xmm7,xmm1	; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+	punpckhbw xmm0,xmm1	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+	movdqa    xmm2,xmm6	; transpose coefficients(phase 1)
+	punpcklbw xmm6,xmm3	; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+	punpckhbw xmm2,xmm3	; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+	movdqa    xmm4,xmm7	; transpose coefficients(phase 2)
+	punpcklwd xmm7,xmm6	; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm6	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+	movdqa    xmm5,xmm2	; transpose coefficients(phase 2)
+	punpcklwd xmm2,xmm0	; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+	punpckhwd xmm5,xmm0	; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+	movdqa    xmm1,xmm7	; transpose coefficients(phase 3)
+	punpckldq xmm7,xmm2	; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm1,xmm2	; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+	movdqa    xmm3,xmm4	; transpose coefficients(phase 3)
+	punpckldq xmm4,xmm5	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+	punpckhdq xmm3,xmm5	; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+	pshufd	xmm6,xmm7,0x4E	; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm0,xmm1,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+	pshufd	xmm2,xmm4,0x4E	; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+	pshufd	xmm5,xmm3,0x4E	; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+	mov	rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+	mov	rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned frame base
+	pop	rsp		; rsp <- rsp saved by the prologue (pre-alignment)
+	pop	rbp		; restore caller's rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jiss2int.asm b/jpeg/simd/jiss2int.asm
new file mode 100644
index 000000000000..adf39fb3afe7
--- /dev/null
+++ b/jpeg/simd/jiss2int.asm
@@ -0,0 +1,859 @@
+;
+; jiss2int.asm - accurate integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13	; FIX(x) = x * 2^CONST_BITS, rounded to an integer
+%define PASS1_BITS	2	; extra fractional bits kept in the pass 1 results
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)	; right shift applied to pass 1 (column) results
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)	; right shift applied to pass 2 (row) results
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so the integers below are the constants pre-scaled by 2^30; DESCALE rounds them down to CONST_BITS.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541	; pmaddwd on (z2,z3) pairs -> tmp3 (even part)
+PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)	; pmaddwd on (z2,z3) pairs -> tmp2 (even part)
+PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175	; pmaddwd on (z3,z4) pairs -> z3 (odd part)
+PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)	; pmaddwd on (z3,z4) pairs -> z4 (odd part)
+PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899	; pmaddwd on (tmp0,tmp3) pairs -> tmp0 (odd part)
+PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)	; pmaddwd on (tmp0,tmp3) pairs -> tmp3 (odd part)
+PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562	; pmaddwd on (tmp1,tmp2) pairs -> tmp1 (odd part)
+PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)	; pmaddwd on (tmp1,tmp2) pairs -> tmp2 (odd part)
+PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)	; rounding term added before psrad by DESCALE_P1
+PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)	; rounding term added before psrad by DESCALE_P2
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; added (paddb) to the packed pass 2 samples
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0		; slot where the prologue stores the unaligned frame pointer
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		12		; number of xmmword scratch slots below the aligned ebp
+
+	align	16
+	global	EXTN(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+	push	ebp				; save the caller's ebp
+	mov	eax,esp				; eax = original ebp (unaligned frame pointer; arguments start at eax+8)
+	sub	esp, byte 4			; make room for one dword below the saved ebp
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; [original_ebp] = unaligned frame pointer
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve the WK_NUM xmmword work area below ebp
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi				; restored in the epilogue
+	push	edi				; restored in the epilogue
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+;	mov	eax, [original_ebp]	; (left disabled: the loads below use eax as set by the prologue)
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]	; one dword from row 1 (DWBLOCK is defined elsewhere; presumably coefs 10,11)
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]	; ... ORed with the matching dword from row 2
+	jnz	near .columnDCT		; cheap early-out: a nonzero AC term was found
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]	; xmm0/xmm1 accumulate the OR of rows 1..7
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm1,xmm0		; xmm1 = OR of rows 1..7 (one word per column)
+	packsswb xmm1,xmm1		; signed-saturating pack: a nonzero word stays nonzero
+	packsswb xmm1,xmm1		; low 4 bytes now summarize all 8 words
+	movd	eax,xmm1		; eax = those 4 summary bytes
+	test	eax,eax
+	jnz	short .columnDCT	; some AC term is nonzero: do the full column pass
+
+	; -- AC terms all zero
+
+	movdqa	xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]	; xmm5 = row 0 coefficients
+	pmullw	xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize with row 0 of the table
+
+	psllw	xmm5,PASS1_BITS		; in0 << PASS1_BITS is replicated below as each column's result
+
+	movdqa    xmm4,xmm5		; xmm5=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm5,xmm5		; xmm5=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm4,xmm4		; xmm4=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm7,xmm5,0x00		; xmm7=col0=(00 00 00 00 00 00 00 00)
+	pshufd	xmm6,xmm5,0x55		; xmm6=col1=(01 01 01 01 01 01 01 01)
+	pshufd	xmm1,xmm5,0xAA		; xmm1=col2=(02 02 02 02 02 02 02 02)
+	pshufd	xmm5,xmm5,0xFF		; xmm5=col3=(03 03 03 03 03 03 03 03)
+	pshufd	xmm0,xmm4,0x00		; xmm0=col4=(04 04 04 04 04 04 04 04)
+	pshufd	xmm3,xmm4,0x55		; xmm3=col5=(05 05 05 05 05 05 05 05)
+	pshufd	xmm2,xmm4,0xAA		; xmm2=col6=(06 06 06 06 06 06 06 06)
+	pshufd	xmm4,xmm4,0xFF		; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+	movdqa	XMMWORD [wk(8)], xmm6	; wk(8)=col1
+	movdqa	XMMWORD [wk(9)], xmm5	; wk(9)=col3
+	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
+	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
+	jmp	near .column_end	; skip the full column pass
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movdqa    xmm4,xmm1		; xmm1=in2=z2
+	movdqa    xmm5,xmm1
+	punpcklwd xmm4,xmm3		; xmm3=in6=z3
+	punpckhwd xmm5,xmm3
+	movdqa    xmm1,xmm4
+	movdqa    xmm3,xmm5
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]	; xmm4=tmp3L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]	; xmm5=tmp3H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=tmp2L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_F054_MF130)]	; xmm3=tmp2H
+
+	movdqa    xmm6,xmm0
+	paddw     xmm0,xmm2		; xmm0=in0+in4
+	psubw     xmm6,xmm2		; xmm6=in0-in4
+
+	pxor      xmm7,xmm7
+	pxor      xmm2,xmm2
+	punpcklwd xmm7,xmm0		; xmm7=tmp0L
+	punpckhwd xmm2,xmm0		; xmm2=tmp0H
+	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
+	psrad     xmm2,(16-CONST_BITS)	; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+	movdqa	xmm0,xmm7
+	paddd	xmm7,xmm4		; xmm7=tmp10L
+	psubd	xmm0,xmm4		; xmm0=tmp13L
+	movdqa	xmm4,xmm2
+	paddd	xmm2,xmm5		; xmm2=tmp10H
+	psubd	xmm4,xmm5		; xmm4=tmp13H
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=tmp10L
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=tmp10H
+	movdqa	XMMWORD [wk(2)], xmm0	; wk(2)=tmp13L
+	movdqa	XMMWORD [wk(3)], xmm4	; wk(3)=tmp13H
+
+	pxor      xmm5,xmm5
+	pxor      xmm7,xmm7
+	punpcklwd xmm5,xmm6		; xmm5=tmp1L
+	punpckhwd xmm7,xmm6		; xmm7=tmp1H
+	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
+	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+	movdqa	xmm2,xmm5
+	paddd	xmm5,xmm1		; xmm5=tmp11L
+	psubd	xmm2,xmm1		; xmm2=tmp12L
+	movdqa	xmm0,xmm7
+	paddd	xmm7,xmm3		; xmm7=tmp11H
+	psubd	xmm0,xmm3		; xmm0=tmp12H
+
+	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
+	movdqa	XMMWORD [wk(5)], xmm7	; wk(5)=tmp11H
+	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=tmp12L
+	movdqa	XMMWORD [wk(7)], xmm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movdqa	xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm7,xmm4
+	paddw	xmm5,xmm3		; xmm5=z3
+	paddw	xmm7,xmm1		; xmm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm2,xmm5
+	movdqa    xmm0,xmm5
+	punpcklwd xmm2,xmm7
+	punpckhwd xmm0,xmm7
+	movdqa    xmm5,xmm2
+	movdqa    xmm7,xmm0
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF078_F117)]	; xmm2=z3L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]	; xmm0=z3H
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]	; xmm5=z4L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_F117_F078)]	; xmm7=z4H
+
+	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=z3L
+	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movdqa    xmm2,xmm3
+	movdqa    xmm0,xmm3
+	punpcklwd xmm2,xmm4
+	punpckhwd xmm0,xmm4
+	movdqa    xmm3,xmm2
+	movdqa    xmm4,xmm0
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm2=tmp0L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm0=tmp0H
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]	; xmm3=tmp3L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF089_F060)]	; xmm4=tmp3H
+
+	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp0L
+	paddd	xmm0, XMMWORD [wk(11)]	; xmm0=tmp0H
+	paddd	xmm3,xmm5		; xmm3=tmp3L
+	paddd	xmm4,xmm7		; xmm4=tmp3H
+
+	movdqa	XMMWORD [wk(8)], xmm2	; wk(8)=tmp0L
+	movdqa	XMMWORD [wk(9)], xmm0	; wk(9)=tmp0H
+
+	movdqa    xmm2,xmm1
+	movdqa    xmm0,xmm1
+	punpcklwd xmm2,xmm6
+	punpckhwd xmm0,xmm6
+	movdqa    xmm1,xmm2
+	movdqa    xmm6,xmm0
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm2=tmp1L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm0=tmp1H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF256_F050)]	; xmm1=tmp2L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]	; xmm6=tmp2H
+
+	paddd	xmm2,xmm5		; xmm2=tmp1L
+	paddd	xmm0,xmm7		; xmm0=tmp1H
+	paddd	xmm1, XMMWORD [wk(10)]	; xmm1=tmp2L
+	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
+
+	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=tmp1L
+	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
+	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=tmp10H
+
+	movdqa	xmm2,xmm5
+	movdqa	xmm0,xmm7
+	paddd	xmm5,xmm3		; xmm5=data0L
+	paddd	xmm7,xmm4		; xmm7=data0H
+	psubd	xmm2,xmm3		; xmm2=data7L
+	psubd	xmm0,xmm4		; xmm0=data7H
+
+	movdqa	xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm3=[PD_DESCALE_P1]
+
+	paddd	xmm5,xmm3
+	paddd	xmm7,xmm3
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm7,DESCALE_P1
+	paddd	xmm2,xmm3
+	paddd	xmm0,xmm3
+	psrad	xmm2,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm5,xmm7		; xmm5=data0=(00 01 02 03 04 05 06 07)
+	packssdw  xmm2,xmm0		; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+	movdqa	xmm4, XMMWORD [wk(4)]	; xmm4=tmp11L
+	movdqa	xmm3, XMMWORD [wk(5)]	; xmm3=tmp11H
+
+	movdqa	xmm7,xmm4
+	movdqa	xmm0,xmm3
+	paddd	xmm4,xmm1		; xmm4=data1L
+	paddd	xmm3,xmm6		; xmm3=data1H
+	psubd	xmm7,xmm1		; xmm7=data6L
+	psubd	xmm0,xmm6		; xmm0=data6H
+
+	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm1=[PD_DESCALE_P1]
+
+	paddd	xmm4,xmm1
+	paddd	xmm3,xmm1
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm3,DESCALE_P1
+	paddd	xmm7,xmm1
+	paddd	xmm0,xmm1
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm4,xmm3		; xmm4=data1=(10 11 12 13 14 15 16 17)
+	packssdw  xmm7,xmm0		; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+	movdqa    xmm6,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm4		; xmm5=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm6,xmm4		; xmm6=(04 14 05 15 06 16 07 17)
+	movdqa    xmm1,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm2		; xmm7=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm1,xmm2		; xmm1=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm3, XMMWORD [wk(6)]	; xmm3=tmp12L
+	movdqa	xmm0, XMMWORD [wk(7)]	; xmm0=tmp12H
+	movdqa	xmm4, XMMWORD [wk(10)]	; xmm4=tmp1L
+	movdqa	xmm2, XMMWORD [wk(11)]	; xmm2=tmp1H
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 01 11 02 12 03 13)
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=(04 14 05 15 06 16 07 17)
+	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=(60 70 61 71 62 72 63 73)
+	movdqa	XMMWORD [wk(5)], xmm1	; wk(5)=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm5,xmm3
+	movdqa	xmm6,xmm0
+	paddd	xmm3,xmm4		; xmm3=data2L
+	paddd	xmm0,xmm2		; xmm0=data2H
+	psubd	xmm5,xmm4		; xmm5=data5L
+	psubd	xmm6,xmm2		; xmm6=data5H
+
+	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm7=[PD_DESCALE_P1]
+
+	paddd	xmm3,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm3,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+	paddd	xmm5,xmm7
+	paddd	xmm6,xmm7
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+
+	packssdw  xmm3,xmm0		; xmm3=data2=(20 21 22 23 24 25 26 27)
+	packssdw  xmm5,xmm6		; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+	movdqa	xmm1, XMMWORD [wk(2)]	; xmm1=tmp13L
+	movdqa	xmm4, XMMWORD [wk(3)]	; xmm4=tmp13H
+	movdqa	xmm2, XMMWORD [wk(8)]	; xmm2=tmp0L
+	movdqa	xmm7, XMMWORD [wk(9)]	; xmm7=tmp0H
+
+	movdqa	xmm0,xmm1
+	movdqa	xmm6,xmm4
+	paddd	xmm1,xmm2		; xmm1=data3L
+	paddd	xmm4,xmm7		; xmm4=data3H
+	psubd	xmm0,xmm2		; xmm0=data4L
+	psubd	xmm6,xmm7		; xmm6=data4H
+
+	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm2=[PD_DESCALE_P1]
+
+	paddd	xmm1,xmm2
+	paddd	xmm4,xmm2
+	psrad	xmm1,DESCALE_P1
+	psrad	xmm4,DESCALE_P1
+	paddd	xmm0,xmm2
+	paddd	xmm6,xmm2
+	psrad	xmm0,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+
+	packssdw  xmm1,xmm4		; xmm1=data3=(30 31 32 33 34 35 36 37)
+	packssdw  xmm0,xmm6		; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 01 11 02 12 03 13)
+	movdqa	xmm2, XMMWORD [wk(1)]	; xmm2=(04 14 05 15 06 16 07 17)
+
+	movdqa    xmm4,xmm3		; transpose coefficients(phase 1)
+	punpcklwd xmm3,xmm1		; xmm3=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm4,xmm1		; xmm4=(24 34 25 35 26 36 27 37)
+	movdqa    xmm6,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm5		; xmm0=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm6,xmm5		; xmm6=(44 54 45 55 46 56 47 57)
+
+	movdqa    xmm1,xmm7		; transpose coefficients(phase 2)
+	punpckldq xmm7,xmm3		; xmm7=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm1,xmm3		; xmm1=(02 12 22 32 03 13 23 33)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm4		; xmm2=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm5,xmm4		; xmm5=(06 16 26 36 07 17 27 37)
+
+	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=(60 70 61 71 62 72 63 73)
+	movdqa	xmm4, XMMWORD [wk(5)]	; xmm4=(64 74 65 75 66 76 67 77)
+
+	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=(04 14 24 34 05 15 25 35)
+	movdqa	XMMWORD [wk(7)], xmm5	; wk(7)=(06 16 26 36 07 17 27 37)
+
+	movdqa    xmm2,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm3		; xmm0=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm2,xmm3		; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm4		; xmm6=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm5,xmm4		; xmm5=(46 56 66 76 47 57 67 77)
+
+	movdqa     xmm3,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm0		; xmm7=col0=(00 10 20 30 40 50 60 70)
+	punpckhqdq xmm3,xmm0		; xmm3=col1=(01 11 21 31 41 51 61 71)
+	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm2		; xmm1=col2=(02 12 22 32 42 52 62 72)
+	punpckhqdq xmm4,xmm2		; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+	movdqa	xmm0, XMMWORD [wk(6)]	; xmm0=(04 14 24 34 05 15 25 35)
+	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa	XMMWORD [wk(8)], xmm3	; wk(8)=col1
+	movdqa	XMMWORD [wk(9)], xmm4	; wk(9)=col3
+
+	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=col4=(04 14 24 34 44 54 64 74)
+	punpckhqdq xmm3,xmm6		; xmm3=col5=(05 15 25 35 45 55 65 75)
+	movdqa     xmm4,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm5		; xmm2=col6=(06 16 26 36 46 56 66 76)
+	punpckhqdq xmm4,xmm5		; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
+	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
+.column_end:	; both pass 1 paths join here: xmm7/xmm1/xmm0/xmm2=col0/2/4/6, wk(8..11)=col1/3/5/7
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]	; esi=inptr; four touches, 32 bytes apart, starting DCTSIZE2 coefficients on
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]	; eax = unaligned frame pointer stored by the prologue
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]	; eax = output_col (replaces the frame pointer copy)
+
+	; -- Even part
+
+	; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movdqa    xmm6,xmm1		; xmm1=in2=z2
+	movdqa    xmm5,xmm1
+	punpcklwd xmm6,xmm2		; xmm2=in6=z3
+	punpckhwd xmm5,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm2,xmm5
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]	; xmm6=tmp3L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]	; xmm5=tmp3H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=tmp2L
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F054_MF130)]	; xmm2=tmp2H
+
+	movdqa    xmm3,xmm7
+	paddw     xmm7,xmm0		; xmm7=in0+in4
+	psubw     xmm3,xmm0		; xmm3=in0-in4
+
+	pxor      xmm4,xmm4
+	pxor      xmm0,xmm0
+	punpcklwd xmm4,xmm7		; xmm4=tmp0L
+	punpckhwd xmm0,xmm7		; xmm0=tmp0H
+	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
+	psrad     xmm0,(16-CONST_BITS)	; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+	movdqa	xmm7,xmm4
+	paddd	xmm4,xmm6		; xmm4=tmp10L
+	psubd	xmm7,xmm6		; xmm7=tmp13L
+	movdqa	xmm6,xmm0
+	paddd	xmm0,xmm5		; xmm0=tmp10H
+	psubd	xmm6,xmm5		; xmm6=tmp13H
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=tmp10L
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp10H
+	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=tmp13L
+	movdqa	XMMWORD [wk(3)], xmm6	; wk(3)=tmp13H
+
+	pxor      xmm5,xmm5
+	pxor      xmm4,xmm4
+	punpcklwd xmm5,xmm3		; xmm5=tmp1L
+	punpckhwd xmm4,xmm3		; xmm4=tmp1H
+	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
+	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+	movdqa	xmm0,xmm5
+	paddd	xmm5,xmm1		; xmm5=tmp11L
+	psubd	xmm0,xmm1		; xmm0=tmp12L
+	movdqa	xmm7,xmm4
+	paddd	xmm4,xmm2		; xmm4=tmp11H
+	psubd	xmm7,xmm2		; xmm7=tmp12H
+
+	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=tmp11H
+	movdqa	XMMWORD [wk(6)], xmm0	; wk(6)=tmp12L
+	movdqa	XMMWORD [wk(7)], xmm7	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movdqa	xmm6, XMMWORD [wk(9)]	; xmm6=col3
+	movdqa	xmm3, XMMWORD [wk(8)]	; xmm3=col1
+	movdqa	xmm1, XMMWORD [wk(11)]	; xmm1=col7
+	movdqa	xmm2, XMMWORD [wk(10)]	; xmm2=col5
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm4,xmm3
+	paddw	xmm5,xmm1		; xmm5=z3
+	paddw	xmm4,xmm2		; xmm4=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm0,xmm5
+	movdqa    xmm7,xmm5
+	punpcklwd xmm0,xmm4
+	punpckhwd xmm7,xmm4
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm7
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]	; xmm0=z3L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]	; xmm7=z3H
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]	; xmm5=z4L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F117_F078)]	; xmm4=z4H
+
+	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=z3L
+	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movdqa    xmm0,xmm1
+	movdqa    xmm7,xmm1
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm7,xmm3
+	movdqa    xmm1,xmm0
+	movdqa    xmm3,xmm7
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm0=tmp0L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm7=tmp0H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]	; xmm1=tmp3L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]	; xmm3=tmp3H
+
+	paddd	xmm0, XMMWORD [wk(10)]	; xmm0=tmp0L
+	paddd	xmm7, XMMWORD [wk(11)]	; xmm7=tmp0H
+	paddd	xmm1,xmm5		; xmm1=tmp3L
+	paddd	xmm3,xmm4		; xmm3=tmp3H
+
+	movdqa	XMMWORD [wk(8)], xmm0	; wk(8)=tmp0L
+	movdqa	XMMWORD [wk(9)], xmm7	; wk(9)=tmp0H
+
+	movdqa    xmm0,xmm2
+	movdqa    xmm7,xmm2
+	punpcklwd xmm0,xmm6
+	punpckhwd xmm7,xmm6
+	movdqa    xmm2,xmm0
+	movdqa    xmm6,xmm7
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm0=tmp1L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm7=tmp1H
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF256_F050)]	; xmm2=tmp2L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]	; xmm6=tmp2H
+
+	paddd	xmm0,xmm5		; xmm0=tmp1L
+	paddd	xmm7,xmm4		; xmm7=tmp1H
+	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp2L
+	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
+
+	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=tmp1L
+	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
+	movdqa	xmm4, XMMWORD [wk(1)]	; xmm4=tmp10H
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm7,xmm4
+	paddd	xmm5,xmm1		; xmm5=data0L
+	paddd	xmm4,xmm3		; xmm4=data0H
+	psubd	xmm0,xmm1		; xmm0=data7L
+	psubd	xmm7,xmm3		; xmm7=data7H
+
+	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm1=[PD_DESCALE_P2]
+
+	paddd	xmm5,xmm1
+	paddd	xmm4,xmm1
+	psrad	xmm5,DESCALE_P2
+	psrad	xmm4,DESCALE_P2
+	paddd	xmm0,xmm1
+	paddd	xmm7,xmm1
+	psrad	xmm0,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm5,xmm4		; xmm5=data0=(00 10 20 30 40 50 60 70)
+	packssdw  xmm0,xmm7		; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=tmp11L
+	movdqa	xmm1, XMMWORD [wk(5)]	; xmm1=tmp11H
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm7,xmm1
+	paddd	xmm3,xmm2		; xmm3=data1L
+	paddd	xmm1,xmm6		; xmm1=data1H
+	psubd	xmm4,xmm2		; xmm4=data6L
+	psubd	xmm7,xmm6		; xmm7=data6H
+
+	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm2=[PD_DESCALE_P2]
+
+	paddd	xmm3,xmm2
+	paddd	xmm1,xmm2
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm4,xmm2
+	paddd	xmm7,xmm2
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm3,xmm1		; xmm3=data1=(01 11 21 31 41 51 61 71)
+	packssdw  xmm4,xmm7		; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+	packsswb  xmm5,xmm4		; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	packsswb  xmm3,xmm0		; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm6, XMMWORD [wk(6)]	; xmm6=tmp12L
+	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=tmp12H
+	movdqa	xmm1, XMMWORD [wk(10)]	; xmm1=tmp1L
+	movdqa	xmm7, XMMWORD [wk(11)]	; xmm7=tmp1H
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm4,xmm6
+	movdqa	xmm0,xmm2
+	paddd	xmm6,xmm1		; xmm6=data2L
+	paddd	xmm2,xmm7		; xmm2=data2H
+	psubd	xmm4,xmm1		; xmm4=data5L
+	psubd	xmm0,xmm7		; xmm0=data5H
+
+	movdqa	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm5=[PD_DESCALE_P2]
+
+	paddd	xmm6,xmm5
+	paddd	xmm2,xmm5
+	psrad	xmm6,DESCALE_P2
+	psrad	xmm2,DESCALE_P2
+	paddd	xmm4,xmm5
+	paddd	xmm0,xmm5
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm0,DESCALE_P2
+
+	packssdw  xmm6,xmm2		; xmm6=data2=(02 12 22 32 42 52 62 72)
+	packssdw  xmm4,xmm0		; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+	movdqa	xmm3, XMMWORD [wk(2)]	; xmm3=tmp13L
+	movdqa	xmm1, XMMWORD [wk(3)]	; xmm1=tmp13H
+	movdqa	xmm7, XMMWORD [wk(8)]	; xmm7=tmp0L
+	movdqa	xmm5, XMMWORD [wk(9)]	; xmm5=tmp0H
+
+	movdqa	xmm2,xmm3
+	movdqa	xmm0,xmm1
+	paddd	xmm3,xmm7		; xmm3=data3L
+	paddd	xmm1,xmm5		; xmm1=data3H
+	psubd	xmm2,xmm7		; xmm2=data4L
+	psubd	xmm0,xmm5		; xmm0=data4H
+
+	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm7=[PD_DESCALE_P2]
+
+	paddd	xmm3,xmm7
+	paddd	xmm1,xmm7
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm2,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm2,DESCALE_P2
+	psrad	xmm0,DESCALE_P2
+
+	movdqa    xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm5=[PB_CENTERJSAMP]
+
+	packssdw  xmm3,xmm1		; xmm3=data3=(03 13 23 33 43 53 63 73)
+	packssdw  xmm2,xmm0		; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+	movdqa    xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	packsswb  xmm6,xmm2		; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+	packsswb  xmm3,xmm4		; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+	paddb     xmm7,xmm5
+	paddb     xmm1,xmm5
+	paddb     xmm6,xmm5
+	paddb     xmm3,xmm5
+
+	movdqa    xmm0,xmm7	; transpose coefficients(phase 1)
+	punpcklbw xmm7,xmm1	; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+	punpckhbw xmm0,xmm1	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+	movdqa    xmm2,xmm6	; transpose coefficients(phase 1)
+	punpcklbw xmm6,xmm3	; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+	punpckhbw xmm2,xmm3	; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+	movdqa    xmm4,xmm7	; transpose coefficients(phase 2)
+	punpcklwd xmm7,xmm6	; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm6	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+	movdqa    xmm5,xmm2	; transpose coefficients(phase 2)
+	punpcklwd xmm2,xmm0	; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+	punpckhwd xmm5,xmm0	; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+	movdqa    xmm1,xmm7	; transpose coefficients(phase 3)
+	punpckldq xmm7,xmm2	; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm1,xmm2	; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+	movdqa    xmm3,xmm4	; transpose coefficients(phase 3)
+	punpckldq xmm4,xmm5	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+	punpckhdq xmm3,xmm5	; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+	pshufd	xmm6,xmm7,0x4E	; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm0,xmm1,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+	pshufd	xmm2,xmm4,0x4E	; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+	pshufd	xmm5,xmm3,0x4E	; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; edx = output_buf[0]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]	; esi = output_buf[2] (esi is reused as a row pointer)
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7	; row 0 = low 8 bytes of xmm7, at output_col
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1	; row 2 = low 8 bytes of xmm1
+	mov	edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]	; edx = output_buf[4]
+	mov	esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]	; esi = output_buf[6]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4	; row 4 = low 8 bytes of xmm4
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3	; row 6 = low 8 bytes of xmm3
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; edx = output_buf[1]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]	; esi = output_buf[3]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6	; row 1 (moved to the low half by pshufd)
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0	; row 3
+	mov	edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]	; edx = output_buf[5]
+	mov	esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]	; esi = output_buf[7]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2	; row 5
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5	; row 7
+
+	pop	edi		; restore the registers pushed by the prologue
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp		; restore the caller's ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jiss2red-64.asm b/jpeg/simd/jiss2red-64.asm
new file mode 100644
index 000000000000..6807f17ce3f1
--- /dev/null
+++ b/jpeg/simd/jiss2red-64.asm
@@ -0,0 +1,576 @@
+;
+; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+; DESCALE_Pp_n is the right-shift count that ends pass p of the n x n (4x4 or 2x2) IDCT.
+%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
+; F_a_bcd is FIX(a.bcd): the constant scaled by 2^CONST_BITS and rounded to an integer.
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants; the literals below are the constants pre-scaled by 2^30.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+; PW_<x>_<y>: the word pair (x, y) repeated four times, used as a pmaddwd operand; "MF" marks a negated constant.
+PW_F184_MF076	times 4 dw  F_1_847,-F_0_765
+PW_F256_F089	times 4 dw  F_2_562, F_0_899
+PW_F106_MF217	times 4 dw  F_1_061,-F_2_172
+PW_MF060_MF050	times 4 dw -F_0_601,-F_0_509
+PW_F145_MF021	times 4 dw  F_1_451,-F_0_211
+PW_F362_MF127	times 4 dw  F_3_624,-F_1_272
+PW_F085_MF072	times 4 dw  F_0_850,-F_0_720
+PD_DESCALE_P1_4	times 4 dd  1 << (DESCALE_P1_4-1)	; PD_DESCALE_*: rounding term (half of 1 << shift) added before the matching psrad
+PD_DESCALE_P2_4	times 4 dd  1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2	times 4 dd  1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2	times 4 dd  1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; added (paddb) to the packed output bytes just before they are stored
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col (32-bit value: only r13d is read, the upper half is undefined per the x86-64 ABIs)
+; (the code relies on collect_args, a macro defined in an included file, to set these registers up)
+%define original_rbp	rbp+0
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+; Writes rax, rdx, rsi, rdi and xmm0-xmm7; any saving/restoring is left to collect_args/uncollect_args.
+	align	16
+	global	EXTN(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax			; kept at [rbp+0] so the epilogue's "pop rsp" can restore it
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]			; reserve wk[0..WK_NUM-1] below the aligned frame
+	collect_args
+
+	; ---- Pass 1: process columns from input.
+
+	mov	rdx, r10	; quantptr
+	mov	rsi, r11		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+	; Full test: rows 1-3 and 5-7 are OR-ed together (row 4 is never read by this routine).
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0,xmm1
+	packsswb xmm0,xmm0
+	packsswb xmm0,xmm0
+	movd	eax,xmm0
+	test	rax,rax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	xmm0,PASS1_BITS
+
+	movdqa    xmm3,xmm0	; xmm0=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm0,xmm0	; xmm0=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm3,xmm3	; xmm3=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm1,xmm0,0x50	; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+	pshufd	xmm0,xmm0,0xFA	; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+	pshufd	xmm6,xmm3,0x50	; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+	pshufd	xmm3,xmm3,0xFA	; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+	jmp	near .column_end
+%endif
+.columnDCT:
+
+	; -- Odd part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa    xmm4,xmm0
+	movdqa    xmm5,xmm0
+	punpcklwd xmm4,xmm1
+	punpckhwd xmm5,xmm1
+	movdqa    xmm0,xmm4
+	movdqa    xmm1,xmm5
+	pmaddwd   xmm4,[rel PW_F256_F089]	; xmm4=(tmp2L)
+	pmaddwd   xmm5,[rel PW_F256_F089]	; xmm5=(tmp2H)
+	pmaddwd   xmm0,[rel PW_F106_MF217]	; xmm0=(tmp0L)
+	pmaddwd   xmm1,[rel PW_F106_MF217]	; xmm1=(tmp0H)
+
+	movdqa    xmm6,xmm2
+	movdqa    xmm7,xmm2
+	punpcklwd xmm6,xmm3
+	punpckhwd xmm7,xmm3
+	movdqa    xmm2,xmm6
+	movdqa    xmm3,xmm7
+	pmaddwd   xmm6,[rel PW_MF060_MF050]	; xmm6=(tmp2L)
+	pmaddwd   xmm7,[rel PW_MF060_MF050]	; xmm7=(tmp2H)
+	pmaddwd   xmm2,[rel PW_F145_MF021]	; xmm2=(tmp0L)
+	pmaddwd   xmm3,[rel PW_F145_MF021]	; xmm3=(tmp0H)
+
+	paddd	xmm6,xmm4		; xmm6=tmp2L
+	paddd	xmm7,xmm5		; xmm7=tmp2H
+	paddd	xmm2,xmm0		; xmm2=tmp0L
+	paddd	xmm3,xmm1		; xmm3=tmp0H
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp0L
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	pxor      xmm1,xmm1
+	pxor      xmm2,xmm2
+	punpcklwd xmm1,xmm4		; xmm1=tmp0L
+	punpckhwd xmm2,xmm4		; xmm2=tmp0H
+	psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+	psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+	movdqa    xmm3,xmm5		; xmm5=in2=z2
+	punpcklwd xmm5,xmm0		; xmm0=in6=z3
+	punpckhwd xmm3,xmm0
+	pmaddwd   xmm5,[rel PW_F184_MF076]	; xmm5=tmp2L
+	pmaddwd   xmm3,[rel PW_F184_MF076]	; xmm3=tmp2H
+
+	movdqa	xmm4,xmm1
+	movdqa	xmm0,xmm2
+	paddd	xmm1,xmm5		; xmm1=tmp10L
+	paddd	xmm2,xmm3		; xmm2=tmp10H
+	psubd	xmm4,xmm5		; xmm4=tmp12L
+	psubd	xmm0,xmm3		; xmm0=tmp12H
+
+	; -- Final output stage
+
+	movdqa	xmm5,xmm1
+	movdqa	xmm3,xmm2
+	paddd	xmm1,xmm6		; xmm1=data0L
+	paddd	xmm2,xmm7		; xmm2=data0H
+	psubd	xmm5,xmm6		; xmm5=data3L
+	psubd	xmm3,xmm7		; xmm3=data3H
+
+	movdqa	xmm6,[rel PD_DESCALE_P1_4]	; xmm6=[rel PD_DESCALE_P1_4]
+
+	paddd	xmm1,xmm6
+	paddd	xmm2,xmm6
+	psrad	xmm1,DESCALE_P1_4
+	psrad	xmm2,DESCALE_P1_4
+	paddd	xmm5,xmm6
+	paddd	xmm3,xmm6
+	psrad	xmm5,DESCALE_P1_4
+	psrad	xmm3,DESCALE_P1_4
+
+	packssdw  xmm1,xmm2		; xmm1=data0=(00 01 02 03 04 05 06 07)
+	packssdw  xmm5,xmm3		; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp0L
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp0H
+
+	movdqa	xmm2,xmm4
+	movdqa	xmm3,xmm0
+	paddd	xmm4,xmm7		; xmm4=data1L
+	paddd	xmm0,xmm6		; xmm0=data1H
+	psubd	xmm2,xmm7		; xmm2=data2L
+	psubd	xmm3,xmm6		; xmm3=data2H
+
+	movdqa	xmm7,[rel PD_DESCALE_P1_4]	; xmm7=[rel PD_DESCALE_P1_4]
+
+	paddd	xmm4,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm4,DESCALE_P1_4
+	psrad	xmm0,DESCALE_P1_4
+	paddd	xmm2,xmm7
+	paddd	xmm3,xmm7
+	psrad	xmm2,DESCALE_P1_4
+	psrad	xmm3,DESCALE_P1_4
+
+	packssdw  xmm4,xmm0		; xmm4=data1=(10 11 12 13 14 15 16 17)
+	packssdw  xmm2,xmm3		; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+	movdqa    xmm6,xmm1	; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm4	; xmm1=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm6,xmm4	; xmm6=(04 14 05 15 06 16 07 17)
+	movdqa    xmm7,xmm2	; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm5	; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm7,xmm5	; xmm7=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm0,xmm1	; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm2	; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm0,xmm2	; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+	movdqa    xmm3,xmm6	; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm7	; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm3,xmm7	; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows, store into output array.
+
+	; (both remaining arguments are already in registers, so no frame reload is needed here)
+	mov	rdi, r12	; (JSAMPROW *)
+	mov	eax, r13d	; output_col is 32-bit: this zero-extends into rax (upper half of r13 is undefined)
+
+	; -- Even part
+
+	pxor      xmm4,xmm4
+	punpcklwd xmm4,xmm1		; xmm4=tmp0
+	psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+	; -- Odd part
+
+	punpckhwd xmm1,xmm0
+	punpckhwd xmm6,xmm3
+	movdqa    xmm5,xmm1
+	movdqa    xmm2,xmm6
+	pmaddwd   xmm1,[rel PW_F256_F089]	; xmm1=(tmp2)
+	pmaddwd   xmm6,[rel PW_MF060_MF050]	; xmm6=(tmp2)
+	pmaddwd   xmm5,[rel PW_F106_MF217]	; xmm5=(tmp0)
+	pmaddwd   xmm2,[rel PW_F145_MF021]	; xmm2=(tmp0)
+
+	paddd     xmm6,xmm1		; xmm6=tmp2
+	paddd     xmm2,xmm5		; xmm2=tmp0
+
+	; -- Even part
+
+	punpcklwd xmm0,xmm3
+	pmaddwd   xmm0,[rel PW_F184_MF076]	; xmm0=tmp2
+
+	movdqa    xmm7,xmm4
+	paddd     xmm4,xmm0		; xmm4=tmp10
+	psubd     xmm7,xmm0		; xmm7=tmp12
+
+	; -- Final output stage
+
+	movdqa	xmm1,[rel PD_DESCALE_P2_4]	; xmm1=[rel PD_DESCALE_P2_4]
+
+	movdqa	xmm5,xmm4
+	movdqa	xmm3,xmm7
+	paddd	xmm4,xmm6		; xmm4=data0=(00 10 20 30)
+	paddd	xmm7,xmm2		; xmm7=data1=(01 11 21 31)
+	psubd	xmm5,xmm6		; xmm5=data3=(03 13 23 33)
+	psubd	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
+
+	paddd	xmm4,xmm1
+	paddd	xmm7,xmm1
+	psrad	xmm4,DESCALE_P2_4
+	psrad	xmm7,DESCALE_P2_4
+	paddd	xmm5,xmm1
+	paddd	xmm3,xmm1
+	psrad	xmm5,DESCALE_P2_4
+	psrad	xmm3,DESCALE_P2_4
+
+	packssdw  xmm4,xmm3		; xmm4=(00 10 20 30 02 12 22 32)
+	packssdw  xmm7,xmm5		; xmm7=(01 11 21 31 03 13 23 33)
+
+	movdqa    xmm0,xmm4		; transpose coefficients(phase 1)
+	punpcklwd xmm4,xmm7		; xmm4=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm0,xmm7		; xmm0=(02 03 12 13 22 23 32 33)
+
+	movdqa    xmm6,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm0		; xmm4=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm6,xmm0		; xmm6=(20 21 22 23 30 31 32 33)
+
+	packsswb  xmm4,xmm6		; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+	paddb     xmm4,[rel PB_CENTERJSAMP]
+
+	pshufd    xmm2,xmm4,0x39	; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+	pshufd    xmm1,xmm4,0x4E	; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+	pshufd    xmm3,xmm4,0x93	; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+	movd	XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+	movd	XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+	mov	rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+	movd	XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+	movd	XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col (32-bit value: only r13d is read, the upper half is undefined per the x86-64 ABIs)
+; Writes rax, rcx, rdx, rsi, rdi and xmm0-xmm7; rbx is saved/restored here, the rest is left to collect_args/uncollect_args.
+	align	16
+	global	EXTN(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+	push	rbp
+	mov	rax,rsp		; NOTE(review): not read by the code below; presumably consumed by collect_args - confirm
+	mov	rbp,rsp
+	collect_args
+	push	rbx		; rbx is scratch for the pextrw/store sequence at the end
+
+	; ---- Pass 1: process columns from input.
+
+	mov	rdx, r10	; quantptr
+	mov	rsi, r11		; inptr
+
+	; | input:                  | result:        |
+	; | 00 01 ** 03 ** 05 ** 07 |                |
+	; | 10 11 ** 13 ** 15 ** 17 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+	; | 50 51 ** 53 ** 55 ** 57 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 70 71 ** 73 ** 75 ** 77 |                |
+
+	; -- Odd part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+	; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+	pcmpeqd   xmm7,xmm7
+	pslld     xmm7,WORD_BIT		; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+	movdqa    xmm4,xmm0		; xmm4=(10 11 ** 13 ** 15 ** 17)
+	movdqa    xmm5,xmm2		; xmm5=(50 51 ** 53 ** 55 ** 57)
+	punpcklwd xmm4,xmm1		; xmm4=(10 30 11 31 ** ** 13 33)
+	punpcklwd xmm5,xmm3		; xmm5=(50 70 51 71 ** ** 53 73)
+	pmaddwd   xmm4,[rel PW_F362_MF127]
+	pmaddwd   xmm5,[rel PW_F085_MF072]
+
+	psrld	xmm0,WORD_BIT		; xmm0=(11 -- 13 -- 15 -- 17 --)
+	pand	xmm1,xmm7		; xmm1=(-- 31 -- 33 -- 35 -- 37)
+	psrld	xmm2,WORD_BIT		; xmm2=(51 -- 53 -- 55 -- 57 --)
+	pand	xmm3,xmm7		; xmm3=(-- 71 -- 73 -- 75 -- 77)
+	por	xmm0,xmm1		; xmm0=(11 31 13 33 15 35 17 37)
+	por	xmm2,xmm3		; xmm2=(51 71 53 73 55 75 57 77)
+	pmaddwd	xmm0,[rel PW_F362_MF127]
+	pmaddwd	xmm2,[rel PW_F085_MF072]
+
+	paddd	xmm4,xmm5		; xmm4=tmp0[col0 col1 **** col3]
+	paddd	xmm0,xmm2		; xmm0=tmp0[col1 col3 col5 col7]
+
+	; -- Even part
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+	movdqa	xmm1,xmm6		; xmm1=(00 01 ** 03 ** 05 ** 07)
+	pslld	xmm6,WORD_BIT		; xmm6=(-- 00 -- ** -- ** -- **)
+	pand	xmm1,xmm7		; xmm1=(-- 01 -- 03 -- 05 -- 07)
+	psrad	xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+	psrad	xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+	; -- Final output stage
+
+	movdqa	xmm3,xmm6
+	movdqa	xmm5,xmm1
+	paddd	xmm6,xmm4	; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+	paddd	xmm1,xmm0	; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+	psubd	xmm3,xmm4	; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+	psubd	xmm5,xmm0	; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+	movdqa	xmm2,[rel PD_DESCALE_P1_2]	; xmm2=[rel PD_DESCALE_P1_2]
+
+	punpckldq  xmm6,xmm3		; xmm6=(A0 B0 ** **)
+
+	movdqa     xmm7,xmm1
+	punpcklqdq xmm1,xmm5		; xmm1=(A1 A3 B1 B3)
+	punpckhqdq xmm7,xmm5		; xmm7=(A5 A7 B5 B7)
+
+	paddd	xmm6,xmm2
+	psrad	xmm6,DESCALE_P1_2
+
+	paddd	xmm1,xmm2
+	paddd	xmm7,xmm2
+	psrad	xmm1,DESCALE_P1_2
+	psrad	xmm7,DESCALE_P1_2
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	rdi, r12	; (JSAMPROW *)
+	mov	eax, r13d	; output_col is 32-bit: this zero-extends into rax (upper half of r13 is undefined)
+
+	; | input:| result:|
+	; | A0 B0 |        |
+	; | A1 B1 | C0 C1  |
+	; | A3 B3 | D0 D1  |
+	; | A5 B5 |        |
+	; | A7 B7 |        |
+
+	; -- Odd part
+
+	packssdw  xmm1,xmm1		; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+	packssdw  xmm7,xmm7		; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+	pmaddwd   xmm1,[rel PW_F362_MF127]
+	pmaddwd   xmm7,[rel PW_F085_MF072]
+
+	paddd     xmm1,xmm7		; xmm1=tmp0[row0 row1 row0 row1]
+
+	; -- Even part
+
+	pslld     xmm6,(CONST_BITS+2)	; xmm6=tmp10[row0 row1 **** ****]
+
+	; -- Final output stage
+
+	movdqa    xmm4,xmm6
+	paddd     xmm6,xmm1	; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+	psubd     xmm4,xmm1	; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+	punpckldq xmm6,xmm4	; xmm6=(C0 D0 C1 D1)
+
+	paddd     xmm6,[rel PD_DESCALE_P2_2]
+	psrad     xmm6,DESCALE_P2_2
+
+	packssdw  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+	packsswb  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+	paddb     xmm6,[rel PB_CENTERJSAMP]
+
+	pextrw	ebx,xmm6,0x00		; ebx=(C0 D0 -- --)
+	pextrw	ecx,xmm6,0x01		; ecx=(C1 D1 -- --)
+
+	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+	mov	WORD [rdx+rax*SIZEOF_JSAMPLE], bx
+	mov	WORD [rsi+rax*SIZEOF_JSAMPLE], cx
+
+	pop	rbx
+	uncollect_args
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jiss2red.asm b/jpeg/simd/jiss2red.asm
new file mode 100644
index 000000000000..238c61d07c1f
--- /dev/null
+++ b/jpeg/simd/jiss2red.asm
@@ -0,0 +1,594 @@
+;
+; jiss2red.asm - reduced-size IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+; DESCALE_Pp_n is the right-shift count that ends pass p of the n x n (4x4 or 2x2) IDCT.
+%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
+; F_a_bcd is FIX(a.bcd): the constant scaled by 2^CONST_BITS and rounded to an integer.
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants; the literals below are the constants pre-scaled by 2^30.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+; PW_<x>_<y>: the word pair (x, y) repeated four times, used as a pmaddwd operand; "MF" marks a negated constant.
+PW_F184_MF076	times 4 dw  F_1_847,-F_0_765
+PW_F256_F089	times 4 dw  F_2_562, F_0_899
+PW_F106_MF217	times 4 dw  F_1_061,-F_2_172
+PW_MF060_MF050	times 4 dw -F_0_601,-F_0_509
+PW_F145_MF021	times 4 dw  F_1_451,-F_0_211
+PW_F362_MF127	times 4 dw  F_3_624,-F_1_272
+PW_F085_MF072	times 4 dw  F_0_850,-F_0_720
+PD_DESCALE_P1_4	times 4 dd  1 << (DESCALE_P1_4-1)	; PD_DESCALE_*: rounding term (half of 1 << shift) added before the matching psrad
+PD_DESCALE_P2_4	times 4 dd  1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2	times 4 dd  1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2	times 4 dd  1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; added (paddb) to the packed output bytes just before they are stored
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+; (b) is the stack pointer value right after "push ebp": saved ebp at +0, arguments from +8 upward.
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; kept at [ebp+0] so the epilogue's "pop esp" can restore it
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[0..WK_NUM-1] below the aligned frame
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+;	mov	eax, [original_ebp]	; (left disabled: the loads below use eax as set in the prologue)
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+	; Full test: rows 1-3 and 5-7 are OR-ed together (row 4 is never read by this routine).
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm0,xmm1
+	packsswb xmm0,xmm0
+	packsswb xmm0,xmm0
+	movd	eax,xmm0
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	xmm0,PASS1_BITS
+
+	movdqa    xmm3,xmm0	; xmm0=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm0,xmm0	; xmm0=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm3,xmm3	; xmm3=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm1,xmm0,0x50	; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+	pshufd	xmm0,xmm0,0xFA	; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+	pshufd	xmm6,xmm3,0x50	; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+	pshufd	xmm3,xmm3,0xFA	; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+	jmp	near .column_end
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Odd part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa    xmm4,xmm0
+	movdqa    xmm5,xmm0
+	punpcklwd xmm4,xmm1
+	punpckhwd xmm5,xmm1
+	movdqa    xmm0,xmm4
+	movdqa    xmm1,xmm5
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F256_F089)]	; xmm4=(tmp2L)
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F256_F089)]	; xmm5=(tmp2H)
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F106_MF217)]	; xmm0=(tmp0L)
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F106_MF217)]	; xmm1=(tmp0H)
+
+	movdqa    xmm6,xmm2
+	movdqa    xmm7,xmm2
+	punpcklwd xmm6,xmm3
+	punpckhwd xmm7,xmm3
+	movdqa    xmm2,xmm6
+	movdqa    xmm3,xmm7
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm6=(tmp2L)
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm7=(tmp2H)
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]	; xmm2=(tmp0L)
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_F145_MF021)]	; xmm3=(tmp0H)
+
+	paddd	xmm6,xmm4		; xmm6=tmp2L
+	paddd	xmm7,xmm5		; xmm7=tmp2H
+	paddd	xmm2,xmm0		; xmm2=tmp0L
+	paddd	xmm3,xmm1		; xmm3=tmp0H
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp0L
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	pxor      xmm1,xmm1
+	pxor      xmm2,xmm2
+	punpcklwd xmm1,xmm4		; xmm1=tmp0L
+	punpckhwd xmm2,xmm4		; xmm2=tmp0H
+	psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+	psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+	movdqa    xmm3,xmm5		; xmm5=in2=z2
+	punpcklwd xmm5,xmm0		; xmm0=in6=z3
+	punpckhwd xmm3,xmm0
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F184_MF076)]	; xmm5=tmp2L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_F184_MF076)]	; xmm3=tmp2H
+
+	movdqa	xmm4,xmm1
+	movdqa	xmm0,xmm2
+	paddd	xmm1,xmm5		; xmm1=tmp10L
+	paddd	xmm2,xmm3		; xmm2=tmp10H
+	psubd	xmm4,xmm5		; xmm4=tmp12L
+	psubd	xmm0,xmm3		; xmm0=tmp12H
+
+	; -- Final output stage
+
+	movdqa	xmm5,xmm1
+	movdqa	xmm3,xmm2
+	paddd	xmm1,xmm6		; xmm1=data0L
+	paddd	xmm2,xmm7		; xmm2=data0H
+	psubd	xmm5,xmm6		; xmm5=data3L
+	psubd	xmm3,xmm7		; xmm3=data3H
+
+	movdqa	xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; xmm6=[PD_DESCALE_P1_4]
+
+	paddd	xmm1,xmm6
+	paddd	xmm2,xmm6
+	psrad	xmm1,DESCALE_P1_4
+	psrad	xmm2,DESCALE_P1_4
+	paddd	xmm5,xmm6
+	paddd	xmm3,xmm6
+	psrad	xmm5,DESCALE_P1_4
+	psrad	xmm3,DESCALE_P1_4
+
+	packssdw  xmm1,xmm2		; xmm1=data0=(00 01 02 03 04 05 06 07)
+	packssdw  xmm5,xmm3		; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp0L
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp0H
+
+	movdqa	xmm2,xmm4
+	movdqa	xmm3,xmm0
+	paddd	xmm4,xmm7		; xmm4=data1L
+	paddd	xmm0,xmm6		; xmm0=data1H
+	psubd	xmm2,xmm7		; xmm2=data2L
+	psubd	xmm3,xmm6		; xmm3=data2H
+
+	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; xmm7=[PD_DESCALE_P1_4]
+
+	paddd	xmm4,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm4,DESCALE_P1_4
+	psrad	xmm0,DESCALE_P1_4
+	paddd	xmm2,xmm7
+	paddd	xmm3,xmm7
+	psrad	xmm2,DESCALE_P1_4
+	psrad	xmm3,DESCALE_P1_4
+
+	packssdw  xmm4,xmm0		; xmm4=data1=(10 11 12 13 14 15 16 17)
+	packssdw  xmm2,xmm3		; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+	movdqa    xmm6,xmm1	; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm4	; xmm1=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm6,xmm4	; xmm6=(04 14 05 15 06 16 07 17)
+	movdqa    xmm7,xmm2	; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm5	; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm7,xmm5	; xmm7=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm0,xmm1	; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm2	; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm0,xmm2	; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+	movdqa    xmm3,xmm6	; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm7	; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm3,xmm7	; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	eax, [original_ebp]	; reload the value the prologue stored at [ebp+0] (base for the argument offsets)
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+
+	; -- Even part
+
+	pxor      xmm4,xmm4
+	punpcklwd xmm4,xmm1		; xmm4=tmp0
+	psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+	; -- Odd part
+
+	punpckhwd xmm1,xmm0
+	punpckhwd xmm6,xmm3
+	movdqa    xmm5,xmm1
+	movdqa    xmm2,xmm6
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F256_F089)]	; xmm1=(tmp2)
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm6=(tmp2)
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F106_MF217)]	; xmm5=(tmp0)
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]	; xmm2=(tmp0)
+
+	paddd     xmm6,xmm1		; xmm6=tmp2
+	paddd     xmm2,xmm5		; xmm2=tmp0
+
+	; -- Even part
+
+	punpcklwd xmm0,xmm3
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F184_MF076)]	; xmm0=tmp2
+
+	movdqa    xmm7,xmm4
+	paddd     xmm4,xmm0		; xmm4=tmp10
+	psubd     xmm7,xmm0		; xmm7=tmp12
+
+	; -- Final output stage
+
+	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; xmm1=[PD_DESCALE_P2_4]
+
+	movdqa	xmm5,xmm4
+	movdqa	xmm3,xmm7
+	paddd	xmm4,xmm6		; xmm4=data0=(00 10 20 30)
+	paddd	xmm7,xmm2		; xmm7=data1=(01 11 21 31)
+	psubd	xmm5,xmm6		; xmm5=data3=(03 13 23 33)
+	psubd	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
+
+	paddd	xmm4,xmm1
+	paddd	xmm7,xmm1
+	psrad	xmm4,DESCALE_P2_4
+	psrad	xmm7,DESCALE_P2_4
+	paddd	xmm5,xmm1
+	paddd	xmm3,xmm1
+	psrad	xmm5,DESCALE_P2_4
+	psrad	xmm3,DESCALE_P2_4
+
+	packssdw  xmm4,xmm3		; xmm4=(00 10 20 30 02 12 22 32)
+	packssdw  xmm7,xmm5		; xmm7=(01 11 21 31 03 13 23 33)
+
+	movdqa    xmm0,xmm4		; transpose coefficients(phase 1)
+	punpcklwd xmm4,xmm7		; xmm4=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm0,xmm7		; xmm0=(02 03 12 13 22 23 32 33)
+
+	movdqa    xmm6,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm0		; xmm4=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm6,xmm0		; xmm6=(20 21 22 23 30 31 32 33)
+
+	packsswb  xmm4,xmm6		; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+	paddb     xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
+
+	pshufd    xmm2,xmm4,0x39	; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+	pshufd    xmm1,xmm4,0x4E	; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+	pshufd    xmm3,xmm4,0x93	; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movd	XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+	movd	XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movd	XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+	movd	XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+; Regs: edx=quantptr, esi=inptr (pass 1); edi=output_buf, eax=output_col (pass 2)
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+	align	16
+	global	EXTN(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+	push	ebp
+	mov	ebp,esp		; plain frame: arguments start at ebp+8
+	push	ebx		; ebx is used as the GOT base below
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+	mov	edx, POINTER [dct_table(ebp)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+
+	; | input:                  | result:        |
+	; | 00 01 ** 03 ** 05 ** 07 |                |
+	; | 10 11 ** 13 ** 15 ** 17 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+	; | 50 51 ** 53 ** 55 ** 57 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 70 71 ** 73 ** 75 ** 77 |                |
+
+	; -- Odd part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize row 1
+	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize row 3
+	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize row 5
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize row 7
+
+	; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+	; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+	pcmpeqd   xmm7,xmm7		; xmm7=all ones
+	pslld     xmm7,WORD_BIT		; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+	movdqa    xmm4,xmm0		; xmm4=(10 11 ** 13 ** 15 ** 17)
+	movdqa    xmm5,xmm2		; xmm5=(50 51 ** 53 ** 55 ** 57)
+	punpcklwd xmm4,xmm1		; xmm4=(10 30 11 31 ** ** 13 33)
+	punpcklwd xmm5,xmm3		; xmm5=(50 70 51 71 ** ** 53 73)
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F362_MF127)]	; dot each (row1,row3) word pair with the constant pair
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F085_MF072)]	; dot each (row5,row7) word pair with the constant pair
+
+	psrld	xmm0,WORD_BIT		; xmm0=(11 -- 13 -- 15 -- 17 --)
+	pand	xmm1,xmm7		; xmm1=(-- 31 -- 33 -- 35 -- 37)
+	psrld	xmm2,WORD_BIT		; xmm2=(51 -- 53 -- 55 -- 57 --)
+	pand	xmm3,xmm7		; xmm3=(-- 71 -- 73 -- 75 -- 77)
+	por	xmm0,xmm1		; xmm0=(11 31 13 33 15 35 17 37)
+	por	xmm2,xmm3		; xmm2=(51 71 53 73 55 75 57 77)
+	pmaddwd	xmm0,[GOTOFF(ebx,PW_F362_MF127)]	; same constants as above, odd columns
+	pmaddwd	xmm2,[GOTOFF(ebx,PW_F085_MF072)]	; same constants as above, odd columns
+
+	paddd	xmm4,xmm5		; xmm4=tmp0[col0 col1 **** col3]
+	paddd	xmm0,xmm2		; xmm0=tmp0[col1 col3 col5 col7]
+
+	; -- Even part
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize row 0
+
+	; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+	movdqa	xmm1,xmm6		; xmm1=(00 01 ** 03 ** 05 ** 07)
+	pslld	xmm6,WORD_BIT		; xmm6=(-- 00 -- ** -- ** -- **)
+	pand	xmm1,xmm7		; xmm1=(-- 01 -- 03 -- 05 -- 07)
+	psrad	xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+	psrad	xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+	; -- Final output stage
+
+	movdqa	xmm3,xmm6
+	movdqa	xmm5,xmm1
+	paddd	xmm6,xmm4	; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+	paddd	xmm1,xmm0	; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+	psubd	xmm3,xmm4	; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+	psubd	xmm5,xmm0	; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; xmm2=[PD_DESCALE_P1_2]
+
+	punpckldq  xmm6,xmm3		; xmm6=(A0 B0 ** **)
+
+	movdqa     xmm7,xmm1
+	punpcklqdq xmm1,xmm5		; xmm1=(A1 A3 B1 B3)
+	punpckhqdq xmm7,xmm5		; xmm7=(A5 A7 B5 B7)
+
+	paddd	xmm6,xmm2		; add PD_DESCALE_P1_2 (presumably the rounding bias)
+	psrad	xmm6,DESCALE_P1_2	; arithmetic shift right = descale
+
+	paddd	xmm1,xmm2
+	paddd	xmm7,xmm2
+	psrad	xmm1,DESCALE_P1_2
+	psrad	xmm7,DESCALE_P1_2
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(ebp)]
+
+	; | input:| result:|
+	; | A0 B0 |        |
+	; | A1 B1 | C0 C1  |
+	; | A3 B3 | D0 D1  |
+	; | A5 B5 |        |
+	; | A7 B7 |        |
+
+	; -- Odd part
+
+	packssdw  xmm1,xmm1		; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+	packssdw  xmm7,xmm7		; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F362_MF127)]	; same constant pairs as in pass 1
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_F085_MF072)]	; same constant pairs as in pass 1
+
+	paddd     xmm1,xmm7		; xmm1=tmp0[row0 row1 row0 row1]
+
+	; -- Even part
+
+	pslld     xmm6,(CONST_BITS+2)	; xmm6=tmp10[row0 row1 **** ****]
+
+	; -- Final output stage
+
+	movdqa    xmm4,xmm6
+	paddd     xmm6,xmm1	; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+	psubd     xmm4,xmm1	; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+	punpckldq xmm6,xmm4	; xmm6=(C0 D0 C1 D1)
+
+	paddd     xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]	; presumably the rounding bias for the shift below
+	psrad     xmm6,DESCALE_P2_2
+
+	packssdw  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+	packsswb  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+	paddb     xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; last GOT-relative access in this routine
+
+	pextrw	ebx,xmm6,0x00		; ebx=(C0 D0 -- --); GOT base is no longer needed
+	pextrw	ecx,xmm6,0x01		; ecx=(C1 D1 -- --)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx	; output row 0 <- (C0 D0)
+	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx	; output row 1 <- (C1 D1)
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx		; restore caller's ebx
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jisseflt.asm b/jpeg/simd/jisseflt.asm
new file mode 100644
index 000000000000..d6147c12d4ca
--- /dev/null
+++ b/jpeg/simd/jisseflt.asm
@@ -0,0 +1,572 @@
+;
+; jisseflt.asm - floating-point IDCT (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44	; selector 0x44: low two floats of %1, then low two floats of %2
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE	; selector 0xEE: high two floats of %1, then high two floats of %2
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+
+PD_1_414	times 4 dd  1.414213562373095048801689	; sqrt(2)
+PD_1_847	times 4 dd  1.847759065022573512256366	; 2*cos(pi/8)
+PD_1_082	times 4 dd  1.082392200292393968799446	; 2*(cos(pi/8)-cos(3*pi/8))
+PD_M2_613	times 4 dd -2.613125929752753055713286	; -2*(cos(pi/8)+cos(3*pi/8))
+PD_0_125	times 4 dd  0.125	; 1/8
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE	; 8 bytes only: loaded with movq into an MMX register
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+; Regs: ebx=GOT, ecx=ctr; pass 1: edx=quantptr esi=inptr edi=wsptr; pass 2: esi=wsptr edi=output_buf eax=output_col
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_float_sse)
+
+EXTN(jsimd_idct_float_sse):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (unaligned frame base; args at eax+8..)
+	sub	esp, byte 4			; make room for the saved frame base
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; [original_ebp] = unaligned frame base ('pop esp' in epilogue)
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]		; reserve wk[] and workspace[] below aligned ebp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr (eax still holds the frame base from the prologue)
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]	; quick test: first dword of row 1 ...
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]	; ... OR-ed with first dword of row 2
+	jnz	near .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0			; mm1 = OR of rows 1..7 for these 4 columns
+	packsswb mm1,mm1		; fold to bytes; a nonzero word stays a nonzero byte
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+	punpckhwd mm1,mm0			; mm1=(** 02 ** 03)
+	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in0H=(02 03)
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
+	cvtpi2ps  xmm3,mm1			; xmm3=(02 03 ** **)
+	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
+	movlhps   xmm0,xmm3			; xmm0=in0=(00 01 02 03)
+
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize the DC row
+
+	movaps	xmm1,xmm0
+	movaps	xmm2,xmm0
+	movaps	xmm3,xmm0
+
+	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
+	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
+	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
+	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0	; each column's DC value is stored 8 times
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq      mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq      mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq      mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	punpckhwd mm4,mm0			; mm4=(** 02 ** 03)
+	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
+	punpckhwd mm5,mm1			; mm5=(** 22 ** 23)
+	punpcklwd mm1,mm1			; mm1=(20 20 21 21)
+
+	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in0H=(02 03)
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
+	cvtpi2ps  xmm4,mm4			; xmm4=(02 03 ** **)
+	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
+	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in2H=(22 23)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in2L=(20 21)
+	cvtpi2ps  xmm5,mm5			; xmm5=(22 23 ** **)
+	cvtpi2ps  xmm1,mm1			; xmm1=(20 21 ** **)
+
+	punpckhwd mm6,mm2			; mm6=(** 42 ** 43)
+	punpcklwd mm2,mm2			; mm2=(40 40 41 41)
+	punpckhwd mm7,mm3			; mm7=(** 62 ** 63)
+	punpcklwd mm3,mm3			; mm3=(60 60 61 61)
+
+	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in4H=(42 43)
+	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in4L=(40 41)
+	cvtpi2ps  xmm6,mm6			; xmm6=(42 43 ** **)
+	cvtpi2ps  xmm2,mm2			; xmm2=(40 41 ** **)
+	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in6H=(62 63)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in6L=(60 61)
+	cvtpi2ps  xmm7,mm7			; xmm7=(62 63 ** **)
+	cvtpi2ps  xmm3,mm3			; xmm3=(60 61 ** **)
+
+	movlhps   xmm0,xmm4			; xmm0=in0=(00 01 02 03)
+	movlhps   xmm1,xmm5			; xmm1=in2=(20 21 22 23)
+	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize row 0
+	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize row 2
+
+	movlhps   xmm2,xmm6			; xmm2=in4=(40 41 42 43)
+	movlhps   xmm3,xmm7			; xmm3=in6=(60 61 62 63)
+	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize row 4
+	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize row 6
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3		; xmm1=in2-in6
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movq      mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq      mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq      mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq      mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	punpckhwd mm6,mm4			; mm6=(** 12 ** 13)
+	punpcklwd mm4,mm4			; mm4=(10 10 11 11)
+	punpckhwd mm2,mm0			; mm2=(** 32 ** 33)
+	punpcklwd mm0,mm0			; mm0=(30 30 31 31)
+
+	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in1H=(12 13)
+	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in1L=(10 11)
+	cvtpi2ps  xmm4,mm6			; xmm4=(12 13 ** **)
+	cvtpi2ps  xmm2,mm4			; xmm2=(10 11 ** **)
+	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in3H=(32 33)
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in3L=(30 31)
+	cvtpi2ps  xmm0,mm2			; xmm0=(32 33 ** **)
+	cvtpi2ps  xmm3,mm0			; xmm3=(30 31 ** **)
+
+	punpckhwd mm7,mm5			; mm7=(** 52 ** 53)
+	punpcklwd mm5,mm5			; mm5=(50 50 51 51)
+	punpckhwd mm3,mm1			; mm3=(** 72 ** 73)
+	punpcklwd mm1,mm1			; mm1=(70 70 71 71)
+
+	movlhps   xmm2,xmm4			; xmm2=in1=(10 11 12 13)
+	movlhps   xmm3,xmm0			; xmm3=in3=(30 31 32 33)
+
+	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in5H=(52 53)
+	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in5L=(50 51)
+	cvtpi2ps  xmm4,mm7			; xmm4=(52 53 ** **)
+	cvtpi2ps  xmm5,mm5			; xmm5=(50 51 ** **)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in7H=(72 73)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in7L=(70 71)
+	cvtpi2ps  xmm0,mm3			; xmm0=(72 73 ** **)
+	cvtpi2ps  xmm1,mm1			; xmm1=(70 71 ** **)
+
+	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize row 1
+	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize row 3
+
+	movlhps   xmm5,xmm4			; xmm5=in5=(50 51 52 53)
+	movlhps   xmm1,xmm0			; xmm1=in7=(70 71 72 73)
+	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize row 5
+	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize row 7
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5		; xmm2=z11-z13
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4		; xmm0=z10+z12
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
+	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
+	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
+	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
+	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
+	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
+	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
+	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
+
+	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
+	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm0,xmm7
+	movaps	xmm3,xmm5
+	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
+	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
+	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
+	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
+
+	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
+	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
+	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
+	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
+	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
+	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
+
+	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
+	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
+	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
+	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
+	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
+	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
+	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
+	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
+	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]	; esi has advanced by 8 coefficients in the loop
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]			; reload frame base (eax may be clobbered by the zero test)
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3		; xmm1=(block 2)-(block 6)
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5		; xmm2=z11-z13
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4		; xmm0=z10+z12
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
+	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
+	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
+	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps	xmm1,[GOTOFF(ebx,PD_0_125)]	; xmm1=[PD_0_125]
+
+	mulps	xmm6,xmm1		; descale(1/8)
+	mulps	xmm7,xmm1		; descale(1/8)
+	mulps	xmm5,xmm1		; descale(1/8)
+	mulps	xmm0,xmm1		; descale(1/8)
+
+	movhlps   xmm3,xmm6		; xmm3 low half = data0 high half (20 30)
+	movhlps   xmm1,xmm7		; xmm1 low half = data1 high half (21 31)
+	cvtps2pi  mm0,xmm6		; round to int32, mm0=data0L=(00 10)
+	cvtps2pi  mm1,xmm7		; round to int32, mm1=data1L=(01 11)
+	cvtps2pi  mm2,xmm3		; round to int32, mm2=data0H=(20 30)
+	cvtps2pi  mm3,xmm1		; round to int32, mm3=data1H=(21 31)
+	packssdw  mm0,mm2		; mm0=data0=(00 10 20 30)
+	packssdw  mm1,mm3		; mm1=data1=(01 11 21 31)
+
+	movhlps   xmm6,xmm5		; xmm6 low half = data7 high half (27 37)
+	movhlps   xmm7,xmm0		; xmm7 low half = data6 high half (26 36)
+	cvtps2pi  mm4,xmm5		; round to int32, mm4=data7L=(07 17)
+	cvtps2pi  mm5,xmm0		; round to int32, mm5=data6L=(06 16)
+	cvtps2pi  mm6,xmm6		; round to int32, mm6=data7H=(27 37)
+	cvtps2pi  mm7,xmm7		; round to int32, mm7=data6H=(26 36)
+	packssdw  mm4,mm6		; mm4=data7=(07 17 27 37)
+	packssdw  mm5,mm7		; mm5=data6=(06 16 26 36)
+
+	packsswb  mm0,mm5		; mm0=(00 10 20 30 06 16 26 36)
+	packsswb  mm1,mm4		; mm1=(01 11 21 31 07 17 27 37)
+
+	movaps	xmm3, XMMWORD [wk(0)]	; xmm3=tmp2
+	movaps	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
+
+	movaps	xmm6,[GOTOFF(ebx,PD_0_125)]	; xmm6=[PD_0_125]
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm5,xmm3
+	movaps	xmm0,xmm1
+	addps	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
+	addps	xmm1,xmm4		; xmm1=data4=(04 14 24 34)
+	subps	xmm5,xmm2		; xmm5=data5=(05 15 25 35)
+	subps	xmm0,xmm4		; xmm0=data3=(03 13 23 33)
+
+	mulps	xmm3,xmm6		; descale(1/8)
+	mulps	xmm1,xmm6		; descale(1/8)
+	mulps	xmm5,xmm6		; descale(1/8)
+	mulps	xmm0,xmm6		; descale(1/8)
+
+	movhlps   xmm7,xmm3		; xmm7 low half = data2 high half (22 32)
+	movhlps   xmm2,xmm1		; xmm2 low half = data4 high half (24 34)
+	cvtps2pi  mm2,xmm3		; round to int32, mm2=data2L=(02 12)
+	cvtps2pi  mm3,xmm1		; round to int32, mm3=data4L=(04 14)
+	cvtps2pi  mm6,xmm7		; round to int32, mm6=data2H=(22 32)
+	cvtps2pi  mm7,xmm2		; round to int32, mm7=data4H=(24 34)
+	packssdw  mm2,mm6		; mm2=data2=(02 12 22 32)
+	packssdw  mm3,mm7		; mm3=data4=(04 14 24 34)
+
+	movhlps   xmm4,xmm5		; xmm4 low half = data5 high half (25 35)
+	movhlps   xmm6,xmm0		; xmm6 low half = data3 high half (23 33)
+	cvtps2pi  mm5,xmm5		; round to int32, mm5=data5L=(05 15)
+	cvtps2pi  mm4,xmm0		; round to int32, mm4=data3L=(03 13)
+	cvtps2pi  mm6,xmm4		; round to int32, mm6=data5H=(25 35)
+	cvtps2pi  mm7,xmm6		; round to int32, mm7=data3H=(23 33)
+	packssdw  mm5,mm6		; mm5=data5=(05 15 25 35)
+	packssdw  mm4,mm7		; mm4=data3=(03 13 23 33)
+
+	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
+
+	packsswb  mm2,mm3		; mm2=(02 12 22 32 04 14 24 34)
+	packsswb  mm4,mm5		; mm4=(03 13 23 33 05 15 25 35)
+
+	paddb     mm0,mm6		; add CENTERJSAMPLE to every byte
+	paddb     mm1,mm6
+	paddb     mm2,mm6
+	paddb     mm4,mm6
+
+	movq      mm7,mm0		; transpose coefficients(phase 1)
+	punpcklbw mm0,mm1		; mm0=(00 01 10 11 20 21 30 31)
+	punpckhbw mm7,mm1		; mm7=(06 07 16 17 26 27 36 37)
+	movq      mm3,mm2		; transpose coefficients(phase 1)
+	punpcklbw mm2,mm4		; mm2=(02 03 12 13 22 23 32 33)
+	punpckhbw mm3,mm4		; mm3=(04 05 14 15 24 25 34 35)
+
+	movq      mm5,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm2		; mm0=(00 01 02 03 10 11 12 13)
+	punpckhwd mm5,mm2		; mm5=(20 21 22 23 30 31 32 33)
+	movq      mm6,mm3		; transpose coefficients(phase 2)
+	punpcklwd mm3,mm7		; mm3=(04 05 06 07 14 15 16 17)
+	punpckhwd mm6,mm7		; mm6=(24 25 26 27 34 35 36 37)
+
+	movq      mm1,mm0		; transpose coefficients(phase 3)
+	punpckldq mm0,mm3		; mm0=(00 01 02 03 04 05 06 07)
+	punpckhdq mm1,mm3		; mm1=(10 11 12 13 14 15 16 17)
+	movq      mm4,mm5		; transpose coefficients(phase 3)
+	punpckldq mm5,mm6		; mm5=(20 21 22 23 24 25 26 27)
+	punpckhdq mm4,mm6		; mm4=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; ebx temporarily reused as a row pointer
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW	; next 4 output row pointers
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jsimd.h b/jpeg/simd/jsimd.h
new file mode 100644
index 000000000000..89ac1b75ebfb
--- /dev/null
+++ b/jpeg/simd/jsimd.h
@@ -0,0 +1,504 @@
+/*
+ * simd/jsimd.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ */
+
+/* Bitmask for supported acceleration methods */
+
+#define JSIMD_NONE    0x00
+#define JSIMD_MMX     0x01
+#define JSIMD_3DNOW   0x02
+#define JSIMD_SSE     0x04
+#define JSIMD_SSE2    0x08
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jpeg_simd_cpu_support                 jSiCpuSupport
+#define jsimd_rgb_ycc_convert_mmx             jSRGBYCCM
+#define jsimd_extrgb_ycc_convert_mmx          jSEXTRGBYCCM
+#define jsimd_extrgbx_ycc_convert_mmx         jSEXTRGBXYCCM
+#define jsimd_extbgr_ycc_convert_mmx          jSEXTBGRYCCM
+#define jsimd_extbgrx_ycc_convert_mmx         jSEXTBGRXYCCM
+#define jsimd_extxbgr_ycc_convert_mmx         jSEXTXBGRYCCM
+#define jsimd_extxrgb_ycc_convert_mmx         jSEXTXRGBYCCM
+#define jsimd_ycc_rgb_convert_mmx             jSYCCRGBM
+#define jsimd_ycc_extrgb_convert_mmx          jSYCCEXTRGBM
+#define jsimd_ycc_extrgbx_convert_mmx         jSYCCEXTRGBXM
+#define jsimd_ycc_extbgr_convert_mmx          jSYCCEXTBGRM
+#define jsimd_ycc_extbgrx_convert_mmx         jSYCCEXTBGRXM
+#define jsimd_ycc_extxbgr_convert_mmx         jSYCCEXTXBGRM
+#define jsimd_ycc_extxrgb_convert_mmx         jSYCCEXTXRGBM
+#define jconst_rgb_ycc_convert_sse2           jSCRGBYCCS2
+#define jsimd_rgb_ycc_convert_sse2            jSRGBYCCS2
+#define jsimd_extrgb_ycc_convert_sse2         jSEXTRGBYCCS2
+#define jsimd_extrgbx_ycc_convert_sse2        jSEXTRGBXYCCS2
+#define jsimd_extbgr_ycc_convert_sse2         jSEXTBGRYCCS2
+#define jsimd_extbgrx_ycc_convert_sse2        jSEXTBGRXYCCS2
+#define jsimd_extxbgr_ycc_convert_sse2        jSEXTXBGRYCCS2
+#define jsimd_extxrgb_ycc_convert_sse2        jSEXTXRGBYCCS2
+#define jconst_ycc_rgb_convert_sse2           jSCYCCRGBS2
+#define jsimd_ycc_rgb_convert_sse2            jSYCCRGBS2
+#define jsimd_ycc_extrgb_convert_sse2         jSYCCEXTRGBS2
+#define jsimd_ycc_extrgbx_convert_sse2        jSYCCEXTRGBXS2
+#define jsimd_ycc_extbgr_convert_sse2         jSYCCEXTBGRS2
+#define jsimd_ycc_extbgrx_convert_sse2        jSYCCEXTBGRXS2
+#define jsimd_ycc_extxbgr_convert_sse2        jSYCCEXTXBGRS2
+#define jsimd_ycc_extxrgb_convert_sse2        jSYCCEXTXRGBS2
+#define jsimd_h2v2_downsample_mmx             jSDnH2V2M
+#define jsimd_h2v1_downsample_mmx             jSDnH2V1M
+#define jsimd_h2v2_downsample_sse2            jSDnH2V2S2
+#define jsimd_h2v1_downsample_sse2            jSDnH2V1S2
+#define jsimd_h2v2_upsample_mmx               jSUpH2V2M
+#define jsimd_h2v1_upsample_mmx               jSUpH2V1M
+#define jsimd_h2v2_fancy_upsample_mmx         jSFUpH2V2M
+#define jsimd_h2v1_fancy_upsample_mmx         jSFUpH2V1M
+#define jsimd_h2v2_merged_upsample_mmx        jSMUpH2V2M
+#define jsimd_h2v2_extrgb_merged_upsample_mmx jSMUpH2V2EXTRGBM
+#define jsimd_h2v2_extrgbx_merged_upsample_mmx jSMUpH2V2EXTRGBXM
+#define jsimd_h2v2_extbgr_merged_upsample_mmx jSMUpH2V2EXTBGRM
+#define jsimd_h2v2_extbgrx_merged_upsample_mmx jSMUpH2V2EXTBGRXM
+#define jsimd_h2v2_extxbgr_merged_upsample_mmx jSMUpH2V2EXTXBGRM
+#define jsimd_h2v2_extxrgb_merged_upsample_mmx jSMUpH2V2EXTXRGBM
+#define jsimd_h2v1_merged_upsample_mmx        jSMUpH2V1M
+#define jsimd_h2v1_extrgb_merged_upsample_mmx jSMUpH2V1EXTRGBM
+#define jsimd_h2v1_extrgbx_merged_upsample_mmx jSMUpH2V1EXTRGBXM
+#define jsimd_h2v1_extbgr_merged_upsample_mmx jSMUpH2V1EXTBGRM
+#define jsimd_h2v1_extbgrx_merged_upsample_mmx jSMUpH2V1EXTBGRXM
+#define jsimd_h2v1_extxbgr_merged_upsample_mmx jSMUpH2V1EXTXBGRM
+#define jsimd_h2v1_extxrgb_merged_upsample_mmx jSMUpH2V1EXTXRGBM
+#define jsimd_h2v2_upsample_sse2              jSUpH2V2S2
+#define jsimd_h2v1_upsample_sse2              jSUpH2V1S2
+#define jconst_fancy_upsample_sse2            jSCFUpS2
+#define jsimd_h2v2_fancy_upsample_sse2        jSFUpH2V2S2
+#define jsimd_h2v1_fancy_upsample_sse2        jSFUpH2V1S2
+#define jconst_merged_upsample_sse2           jSCMUpS2
+#define jsimd_h2v2_merged_upsample_sse2       jSMUpH2V2S2
+#define jsimd_h2v2_extrgb_merged_upsample_sse2 jSMUpH2V2EXTRGBS2
+#define jsimd_h2v2_extrgbx_merged_upsample_sse2 jSMUpH2V2EXTRGBXS2
+#define jsimd_h2v2_extbgr_merged_upsample_sse2 jSMUpH2V2EXTBGRS2
+#define jsimd_h2v2_extbgrx_merged_upsample_sse2 jSMUpH2V2EXTBGRXS2
+#define jsimd_h2v2_extxbgr_merged_upsample_sse2 jSMUpH2V2EXTXBGRS2
+#define jsimd_h2v2_extxrgb_merged_upsample_sse2 jSMUpH2V2EXTXRGBS2
+#define jsimd_h2v1_merged_upsample_sse2       jSMUpH2V1S2
+#define jsimd_h2v1_extrgb_merged_upsample_sse2 jSMUpH2V1EXTRGBS2
+#define jsimd_h2v1_extrgbx_merged_upsample_sse2 jSMUpH2V1EXTRGBXS2
+#define jsimd_h2v1_extbgr_merged_upsample_sse2 jSMUpH2V1EXTBGRS2
+#define jsimd_h2v1_extbgrx_merged_upsample_sse2 jSMUpH2V1EXTBGRXS2
+#define jsimd_h2v1_extxbgr_merged_upsample_sse2 jSMUpH2V1EXTXBGRS2
+#define jsimd_h2v1_extxrgb_merged_upsample_sse2 jSMUpH2V1EXTXRGBS2
+#define jsimd_convsamp_mmx                    jSConvM
+#define jsimd_convsamp_sse2                   jSConvS2
+#define jsimd_convsamp_float_3dnow            jSConvF3D
+#define jsimd_convsamp_float_sse              jSConvFS
+#define jsimd_convsamp_float_sse2             jSConvFS2
+#define jsimd_fdct_islow_mmx                  jSFDMIS
+#define jsimd_fdct_ifast_mmx                  jSFDMIF
+#define jconst_fdct_islow_sse2                jSCFDS2IS
+#define jsimd_fdct_islow_sse2                 jSFDS2IS
+#define jconst_fdct_ifast_sse2                jSCFDS2IF
+#define jsimd_fdct_ifast_sse2                 jSFDS2IF
+#define jsimd_fdct_float_3dnow                jSFD3DF
+#define jconst_fdct_float_sse                 jSCFDSF
+#define jsimd_fdct_float_sse                  jSFDSF
+#define jsimd_quantize_mmx                    jSQuantM
+#define jsimd_quantize_sse2                   jSQuantS2
+#define jsimd_quantize_float_3dnow            jSQuantF3D
+#define jsimd_quantize_float_sse              jSQuantFS
+#define jsimd_quantize_float_sse2             jSQuantFS2
+#define jsimd_idct_2x2_mmx                    jSIDM22
+#define jsimd_idct_4x4_mmx                    jSIDM44
+#define jconst_idct_red_sse2                  jSCIDS2R
+#define jsimd_idct_2x2_sse2                   jSIDS222
+#define jsimd_idct_4x4_sse2                   jSIDS244
+#define jsimd_idct_islow_mmx                  jSIDMIS
+#define jsimd_idct_ifast_mmx                  jSIDMIF
+#define jconst_idct_islow_sse2                jSCIDS2IS
+#define jsimd_idct_islow_sse2                 jSIDS2IS
+#define jconst_idct_ifast_sse2                jSCIDS2IF
+#define jsimd_idct_ifast_sse2                 jSIDS2IF
+#define jsimd_idct_float_3dnow                jSID3DF
+#define jconst_idct_float_sse                 jSCIDSF  /* IDCT table, not FDCT */
+#define jsimd_idct_float_sse                  jSIDSF
+#define jconst_idct_float_sse2                jSCIDS2F /* IDCT table, not FDCT */
+#define jsimd_idct_float_sse2                 jSIDS2F
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+/* SIMD Ext: retrieve SIMD/CPU information */
+EXTERN(unsigned int) jpeg_simd_cpu_support JPP((void));
+
+/* SIMD Color Space Conversion */
+EXTERN(void) jsimd_rgb_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+EXTERN(void) jsimd_ycc_rgb_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
+extern const int jconst_rgb_ycc_convert_sse2[];
+EXTERN(void) jsimd_rgb_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+extern const int jconst_ycc_rgb_convert_sse2[];
+EXTERN(void) jsimd_ycc_rgb_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
+/* SIMD Downsample */
+EXTERN(void) jsimd_h2v2_downsample_mmx
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample_mmx
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+EXTERN(void) jsimd_h2v2_downsample_sse2
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample_sse2
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+/* SIMD Upsample */
+EXTERN(void) jsimd_h2v2_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+
+EXTERN(void) jsimd_h2v2_upsample_sse2
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample_sse2
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+extern const int jconst_fancy_upsample_sse2[];
+EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+extern const int jconst_merged_upsample_sse2[];
+EXTERN(void) jsimd_h2v2_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+
+/* SIMD Sample Conversion */
+EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
+                                     JDIMENSION start_col,
+                                     DCTELEM * workspace));
+
+EXTERN(void) jsimd_convsamp_sse2 JPP((JSAMPARRAY sample_data,
+                                      JDIMENSION start_col,
+                                      DCTELEM * workspace));
+
+EXTERN(void) jsimd_convsamp_float_3dnow JPP((JSAMPARRAY sample_data,
+                                             JDIMENSION start_col,
+                                             FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_convsamp_float_sse JPP((JSAMPARRAY sample_data,
+                                           JDIMENSION start_col,
+                                           FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_convsamp_float_sse2 JPP((JSAMPARRAY sample_data,
+                                            JDIMENSION start_col,
+                                            FAST_FLOAT * workspace));
+
+/* SIMD Forward DCT */
+EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data));
+
+extern const int jconst_fdct_islow_sse2[];
+EXTERN(void) jsimd_fdct_islow_sse2 JPP((DCTELEM * data));
+extern const int jconst_fdct_ifast_sse2[];
+EXTERN(void) jsimd_fdct_ifast_sse2 JPP((DCTELEM * data));
+
+EXTERN(void) jsimd_fdct_float_3dnow JPP((FAST_FLOAT * data));
+
+extern const int jconst_fdct_float_sse[];
+EXTERN(void) jsimd_fdct_float_sse JPP((FAST_FLOAT * data));
+
+/* SIMD Quantization */
+EXTERN(void) jsimd_quantize_mmx JPP((JCOEFPTR coef_block,
+                                     DCTELEM * divisors,
+                                     DCTELEM * workspace));
+
+EXTERN(void) jsimd_quantize_sse2 JPP((JCOEFPTR coef_block,
+                                      DCTELEM * divisors,
+                                      DCTELEM * workspace));
+
+EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block,
+                                             FAST_FLOAT * divisors,
+                                             FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_quantize_float_sse JPP((JCOEFPTR coef_block,
+                                           FAST_FLOAT * divisors,
+                                           FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_quantize_float_sse2 JPP((JCOEFPTR coef_block,
+                                            FAST_FLOAT * divisors,
+                                            FAST_FLOAT * workspace));
+
+/* SIMD Reduced Inverse DCT */
+EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table,
+                                     JCOEFPTR coef_block,
+                                     JSAMPARRAY output_buf,
+                                     JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_mmx JPP((void * dct_table,
+                                     JCOEFPTR coef_block,
+                                     JSAMPARRAY output_buf,
+                                     JDIMENSION output_col));
+
+extern const int jconst_idct_red_sse2[];
+EXTERN(void) jsimd_idct_2x2_sse2 JPP((void * dct_table,
+                                      JCOEFPTR coef_block,
+                                      JSAMPARRAY output_buf,
+                                      JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table,
+                                      JCOEFPTR coef_block,
+                                      JSAMPARRAY output_buf,
+                                      JDIMENSION output_col));
+
+/* SIMD Inverse DCT */
+EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast_mmx JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+
+extern const int jconst_idct_islow_sse2[];
+EXTERN(void) jsimd_idct_islow_sse2 JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+extern const int jconst_idct_ifast_sse2[];
+EXTERN(void) jsimd_idct_ifast_sse2 JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+
+EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
+                                         JCOEFPTR coef_block,
+                                         JSAMPARRAY output_buf,
+                                         JDIMENSION output_col));
+
+extern const int jconst_idct_float_sse[];
+EXTERN(void) jsimd_idct_float_sse JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+
+extern const int jconst_idct_float_sse2[];
+EXTERN(void) jsimd_idct_float_sse2 JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+
diff --git a/jpeg/simd/jsimd_i386.c b/jpeg/simd/jsimd_i386.c
new file mode 100644
index 000000000000..d9bb774352a5
--- /dev/null
+++ b/jpeg/simd/jsimd_i386.c
@@ -0,0 +1,957 @@
+/*
+ * jsimd_i386.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep their
+ * alignment. IS_ALIGNED(ptr, order) tests for 2^order bytes at runtime.
+ */
+#define IS_ALIGNED(ptr, order) (((unsigned)(ptr) & ((1U << (order)) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED((ptr), 4)) /* 16 byte alignment */
+
+static unsigned int simd_support = ~0;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd (void)
+{
+  char *forced;
+
+  /* Already probed: simd_support no longer holds its ~0 initial value */
+  if (simd_support != ~0)
+    return;
+  simd_support = jpeg_simd_cpu_support();
+
+  /* Environment overrides; each one masks the detected feature set */
+  forced = getenv("JSIMD_FORCEMMX");
+  if (forced != NULL && !strcmp(forced, "1"))
+    simd_support &= JSIMD_MMX;
+  forced = getenv("JSIMD_FORCE3DNOW");
+  if (forced != NULL && !strcmp(forced, "1"))
+    simd_support &= JSIMD_3DNOW|JSIMD_MMX;
+  forced = getenv("JSIMD_FORCESSE");
+  if (forced != NULL && !strcmp(forced, "1"))
+    simd_support &= JSIMD_SSE|JSIMD_MMX;
+  forced = getenv("JSIMD_FORCESSE2");
+  if (forced != NULL && !strcmp(forced, "1"))
+    simd_support &= JSIMD_SSE2;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for 8-bit samples, a 4-byte JDIMENSION
+   * and 3- or 4-byte RGB pixels only.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  /* MMX has no further requirements */
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  /* SSE2 additionally needs its constant table 16-byte aligned */
+  if (!(simd_support & JSIMD_SSE2))
+    return 0;
+  return IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for 8-bit samples, a 4-byte JDIMENSION
+   * and 3- or 4-byte RGB pixels only.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  /* MMX has no further requirements */
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  /* SSE2 additionally needs its constant table 16-byte aligned */
+  if (!(simd_support & JSIMD_SSE2))
+    return 0;
+  return IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+  /* Plain-RGB converters by default; the switch picks dedicated ones */
+  void (*sse2fn)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int)
+    = jsimd_rgb_ycc_convert_sse2;
+  void (*mmxfn)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int)
+    = jsimd_rgb_ycc_convert_mmx;
+
+  switch(cinfo->in_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fn = jsimd_extrgb_ycc_convert_sse2;
+      mmxfn = jsimd_extrgb_ycc_convert_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fn = jsimd_extrgbx_ycc_convert_sse2;
+      mmxfn = jsimd_extrgbx_ycc_convert_mmx;
+      break;
+    case JCS_EXT_BGR:
+      sse2fn = jsimd_extbgr_ycc_convert_sse2;
+      mmxfn = jsimd_extbgr_ycc_convert_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fn = jsimd_extbgrx_ycc_convert_sse2;
+      mmxfn = jsimd_extbgrx_ycc_convert_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fn = jsimd_extxbgr_ycc_convert_sse2;
+      mmxfn = jsimd_extxbgr_ycc_convert_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fn = jsimd_extxrgb_ycc_convert_sse2;
+      mmxfn = jsimd_extxrgb_ycc_convert_mmx;
+      break;
+    default:
+      break;
+  }
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    sse2fn(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else if (simd_support & JSIMD_MMX)
+    mmxfn(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  /* Plain-RGB converters by default; the switch picks dedicated ones */
+  void (*sse2fn)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int)
+    = jsimd_ycc_rgb_convert_sse2;
+  void (*mmxfn)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int)
+    = jsimd_ycc_rgb_convert_mmx;
+
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fn = jsimd_ycc_extrgb_convert_sse2;
+      mmxfn = jsimd_ycc_extrgb_convert_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fn = jsimd_ycc_extrgbx_convert_sse2;
+      mmxfn = jsimd_ycc_extrgbx_convert_mmx;
+      break;
+    case JCS_EXT_BGR:
+      sse2fn = jsimd_ycc_extbgr_convert_sse2;
+      mmxfn = jsimd_ycc_extbgr_convert_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fn = jsimd_ycc_extbgrx_convert_sse2;
+      mmxfn = jsimd_ycc_extbgrx_convert_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fn = jsimd_ycc_extxbgr_convert_sse2;
+      mmxfn = jsimd_ycc_extxbgr_convert_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fn = jsimd_ycc_extxrgb_convert_sse2;
+      mmxfn = jsimd_ycc_extxrgb_convert_mmx;
+      break;
+    default:
+      break;
+  }
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    sse2fn(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else if (simd_support & JSIMD_MMX)
+    mmxfn(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  unsigned int usable;
+
+  init_simd();
+
+  /* The SIMD code is written for 8-bit samples and a 4-byte
+   * JDIMENSION only.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Either instruction set provides an implementation */
+  usable = simd_support & (JSIMD_SSE2 | JSIMD_MMX);
+
+  return (usable != 0) ? 1 : 0;
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and simd_support flags SSE2 or MMX; otherwise 0. */
+jsimd_can_h2v1_downsample (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;  /* neither instruction set is flagged */
+}
+
+GLOBAL(void)  /* Calls the SSE2 h2v2 downsample routine if flagged, else the MMX one; no-op if neither. */
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, compptr->width_in_blocks,
+        input_data, output_data);
+  else if (simd_support & JSIMD_MMX)  /* same argument list as the SSE2 call */
+    jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, compptr->width_in_blocks,
+        input_data, output_data);
+}
+
+GLOBAL(void)  /* Calls the SSE2 h2v1 downsample routine if flagged, else the MMX one; no-op if neither. */
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, compptr->width_in_blocks,
+        input_data, output_data);
+  else if (simd_support & JSIMD_MMX)  /* same argument list as the SSE2 call */
+    jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, compptr->width_in_blocks,
+        input_data, output_data);
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and simd_support flags SSE2 or MMX; otherwise 0. */
+jsimd_can_h2v2_upsample (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;  /* neither instruction set is flagged */
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and simd_support flags SSE2 or MMX; otherwise 0. */
+jsimd_can_h2v1_upsample (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;  /* neither instruction set is flagged */
+}
+
+GLOBAL(void)  /* Calls the SSE2 h2v2 upsample routine if flagged, else the MMX one; no-op if neither. */
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{  /* compptr is not referenced in this body; only cinfo fields are forwarded */
+  if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+}
+
+GLOBAL(void)  /* Calls the SSE2 h2v1 upsample routine if flagged, else the MMX one; no-op if neither. */
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{  /* compptr is not referenced in this body; only cinfo fields are forwarded */
+  if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and SSE2 (with an aligned constant table) or MMX is flagged. */
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  /* SSE2 counts only if jconst_fancy_upsample_sse2 passes IS_ALIGNED_SSE; MMX has no such test. */
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and SSE2 (with an aligned constant table) or MMX is flagged. */
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  /* SSE2 counts only if jconst_fancy_upsample_sse2 passes IS_ALIGNED_SSE; MMX has no such test. */
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Calls the SSE2 h2v2 fancy-upsample routine if usable, else the MMX one; no-op if neither. */
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{  /* width passed on is compptr->downsampled_width, not cinfo->output_width */
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+}
+
+GLOBAL(void)  /* Calls the SSE2 h2v1 fancy-upsample routine if usable, else the MMX one; no-op if neither. */
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{  /* width passed on is compptr->downsampled_width, not cinfo->output_width */
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and SSE2 (with an aligned constant table) or MMX is flagged. */
+jsimd_can_h2v2_merged_upsample (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  /* SSE2 counts only if jconst_merged_upsample_sse2 passes IS_ALIGNED_SSE; MMX has no such test. */
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and SSE2 (with an aligned constant table) or MMX is flagged. */
+jsimd_can_h2v1_merged_upsample (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  /* SSE2 counts only if jconst_merged_upsample_sse2 passes IS_ALIGNED_SSE; MMX has no such test. */
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Selects the SSE2/MMX h2v2 merged-upsample pair for cinfo->out_color_space, then calls one. */
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  /* Both pointers are always assigned: the default case covers every other colour space. */
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extrgb_merged_upsample_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extrgbx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extbgrx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extxbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extxrgb_merged_upsample_mmx;
+      break;
+    default:  /* anything else falls back to the non-"ext" routines */
+      sse2fct=jsimd_h2v2_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_merged_upsample_mmx;
+      break;
+  }
+  /* SSE2 is used only if its constant table passes IS_ALIGNED_SSE; else MMX; else nothing is called. */
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    sse2fct(cinfo->output_width, input_buf,
+        in_row_group_ctr, output_buf);
+  else if (simd_support & JSIMD_MMX)
+    mmxfct(cinfo->output_width, input_buf,
+        in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)  /* Selects the SSE2/MMX h2v1 merged-upsample pair for cinfo->out_color_space, then calls one. */
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  /* Both pointers are always assigned: the default case covers every other colour space. */
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extrgb_merged_upsample_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extrgbx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extbgrx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extxbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extxrgb_merged_upsample_mmx;
+      break;
+    default:  /* anything else falls back to the non-"ext" routines */
+      sse2fct=jsimd_h2v1_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_merged_upsample_mmx;
+      break;
+  }
+  /* SSE2 is used only if its constant table passes IS_ALIGNED_SSE; else MMX; else nothing is called. */
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    sse2fct(cinfo->output_width, input_buf,
+        in_row_group_ctr, output_buf);
+  else if (simd_support & JSIMD_MMX)
+    mmxfct(cinfo->output_width, input_buf,
+        in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and simd_support flags SSE2 or MMX; otherwise 0. */
+jsimd_can_convsamp (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;  /* neither instruction set is flagged */
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and simd_support flags SSE2, SSE or 3DNow!; otherwise 0. */
+jsimd_can_convsamp_float (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  /* Any one of the three instruction sets below is sufficient. */
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_SSE)
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Calls the SSE2 convsamp routine if flagged, else the MMX one; no-op if neither. */
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_convsamp_mmx(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)  /* Calls the first flagged routine in the order SSE2, SSE, 3DNow!; no-op if none is flagged. */
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_SSE)
+    jsimd_convsamp_float_sse(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_3DNOW)
+    jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and SSE2 (with an aligned constant table) or MMX is flagged. */
+jsimd_can_fdct_islow (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+  /* SSE2 counts only if jconst_fdct_islow_sse2 passes IS_ALIGNED_SSE; MMX has no such test. */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and SSE2 (with an aligned constant table) or MMX is flagged. */
+jsimd_can_fdct_ifast (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+  /* SSE2 counts only if jconst_fdct_ifast_sse2 passes IS_ALIGNED_SSE; MMX has no such test. */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and SSE (with an aligned constant table) or 3DNow! is flagged. */
+jsimd_can_fdct_float (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  /* Note: this tests JSIMD_SSE, not JSIMD_SSE2; the 3DNow! branch has no alignment test. */
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Calls the SSE2 fdct_islow routine on data if usable, else the MMX one; no-op if neither. */
+jsimd_fdct_islow (DCTELEM * data)
+{  /* SSE2 additionally requires jconst_fdct_islow_sse2 to pass IS_ALIGNED_SSE */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    jsimd_fdct_islow_sse2(data);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_fdct_islow_mmx(data);
+}
+
+GLOBAL(void)  /* Calls the SSE2 fdct_ifast routine on data if usable, else the MMX one; no-op if neither. */
+jsimd_fdct_ifast (DCTELEM * data)
+{  /* Gate SSE2 on the ifast constant table, the one jsimd_can_fdct_ifast() validates (not the islow table). */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    jsimd_fdct_ifast_sse2(data);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_fdct_ifast_mmx(data);
+}
+
+GLOBAL(void)  /* Calls the SSE fdct_float routine on data if usable, else the 3DNow! one; no-op if neither. */
+jsimd_fdct_float (FAST_FLOAT * data)
+{  /* SSE additionally requires jconst_fdct_float_sse to pass IS_ALIGNED_SSE */
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    jsimd_fdct_float_sse(data);
+  else if (simd_support & JSIMD_3DNOW)
+    jsimd_fdct_float_3dnow(data);
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and simd_support flags SSE2 or MMX; otherwise 0. */
+jsimd_can_quantize (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;  /* neither instruction set is flagged */
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and simd_support flags SSE2, SSE or 3DNow!; otherwise 0. */
+jsimd_can_quantize_float (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  /* Any one of the three instruction sets below is sufficient. */
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_SSE)
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Calls the SSE2 quantize routine if flagged, else the MMX one; no-op if neither. */
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_quantize_mmx(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)  /* Calls the first flagged routine in the order SSE2, SSE, 3DNow!; no-op if none is flagged. */
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_SSE)
+    jsimd_quantize_float_sse(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_3DNOW)
+    jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and SSE2 (with an aligned constant table) or MMX is flagged. */
+jsimd_can_idct_2x2 (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+  /* SSE2 counts only if jconst_idct_red_sse2 passes IS_ALIGNED_SSE; MMX has no such test. */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and SSE2 (with an aligned constant table) or MMX is flagged. */
+jsimd_can_idct_4x4 (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+  /* Same constant table (jconst_idct_red_sse2) as the 2x2 query uses. */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Calls the SSE2 idct_2x2 routine if usable, else the MMX one; no-op if neither. */
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* cinfo is not referenced in this body; only compptr->dct_table is read */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)  /* Calls the SSE2 idct_4x4 routine if usable, else the MMX one; no-op if neither. */
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* cinfo is not referenced in this body; only compptr->dct_table is read */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and SSE2 (with an aligned constant table) or MMX is flagged. */
+jsimd_can_idct_islow (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+  /* SSE2 counts only if jconst_idct_islow_sse2 passes IS_ALIGNED_SSE; MMX has no such test. */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and SSE2 (with an aligned constant table) or MMX is flagged. */
+jsimd_can_idct_ifast (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)  /* extra guard that the other idct queries do not have */
+    return 0;
+  /* SSE2 counts only if jconst_idct_ifast_sse2 passes IS_ALIGNED_SSE; MMX has no such test. */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and SSE2 or SSE (each with an aligned table) or 3DNow! is flagged. */
+jsimd_can_idct_float (void)
+{
+  init_simd();  /* defined outside this excerpt; presumably populates simd_support -- confirm */
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  if (sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+  /* SSE2 and SSE each have their own constant table to check; 3DNow! has no alignment test. */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 1;
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Calls the SSE2 idct_islow routine if usable, else the MMX one; no-op if neither. */
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* cinfo is not referenced in this body; only compptr->dct_table is read */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)  /* Calls the SSE2 idct_ifast routine if usable, else the MMX one; no-op if neither. */
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* cinfo is not referenced in this body; only compptr->dct_table is read */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)  /* Calls the first usable routine in the order SSE2, SSE, 3DNow!; no-op if none qualifies. */
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* cinfo is not referenced in this body; SSE2 and SSE each check their own constant table */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    jsimd_idct_float_sse2(compptr->dct_table, coef_block,
+        output_buf, output_col);
+  else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+    jsimd_idct_float_sse(compptr->dct_table, coef_block,
+        output_buf, output_col);
+  else if (simd_support & JSIMD_3DNOW)
+    jsimd_idct_float_3dnow(compptr->dct_table, coef_block,
+        output_buf, output_col);
+}
+
diff --git a/jpeg/simd/jsimd_x86_64.c b/jpeg/simd/jsimd_x86_64.c
new file mode 100644
index 000000000000..7659249e14c0
--- /dev/null
+++ b/jpeg/simd/jsimd_x86_64.c
@@ -0,0 +1,681 @@
+/*
+ * jsimd_x86_64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * x86_64 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep their
+ * alignment. IS_ALIGNED() verifies at runtime that the low 'order' bits are 0.
+ */
+#define IS_ALIGNED(ptr, order) (((size_t)(ptr) & ((1 << (order)) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED((ptr), 4)) /* 16 byte alignment */
+
+GLOBAL(int)  /* Returns 1 when the build-time guards pass and the SSE2 constant table is 16-byte aligned. */
+jsimd_can_rgb_ycc (void)
+{
+  /* Build-time guards: the SIMD code is written for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+  /* No CPU-feature test is made here; only the constant table's alignment is checked at runtime. */
+  if (!IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)  /* Returns 1 when the build-time guards pass and the SSE2 constant table is 16-byte aligned. */
+jsimd_can_ycc_rgb (void)
+{
+  /* Build-time guards: the SIMD code is written for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+  /* No CPU-feature test is made here; only the constant table's alignment is checked at runtime. */
+  if (!IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)  /* Selects the SSE2 routine for cinfo->in_color_space and calls it unconditionally. */
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  /* The pointer is always assigned: the default case covers every other colour space. */
+  switch(cinfo->in_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_extrgb_ycc_convert_sse2;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_extrgbx_ycc_convert_sse2;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_extbgr_ycc_convert_sse2;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_extbgrx_ycc_convert_sse2;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_extxbgr_ycc_convert_sse2;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_extxrgb_ycc_convert_sse2;
+      break;
+    default:  /* anything else falls back to the non-"ext" rgb routine */
+      sse2fct=jsimd_rgb_ycc_convert_sse2;
+      break;
+  }
+  /* No alignment or CPU test here; jsimd_can_rgb_ycc() is where the table alignment is checked. */
+  sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)  /* Selects the SSE2 routine for cinfo->out_color_space and calls it unconditionally. */
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  /* The pointer is always assigned: the default case covers every other colour space. */
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_ycc_extrgb_convert_sse2;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_ycc_extrgbx_convert_sse2;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_ycc_extbgr_convert_sse2;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_ycc_extbgrx_convert_sse2;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_ycc_extxbgr_convert_sse2;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_ycc_extxrgb_convert_sse2;
+      break;
+    default:  /* anything else falls back to the non-"ext" rgb routine */
+      sse2fct=jsimd_ycc_rgb_convert_sse2;
+      break;
+  }
+  /* No alignment or CPU test here; jsimd_can_ycc_rgb() is where the table alignment is checked. */
+  sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(int)  /* Returns 1 when the build-time size guards pass; no runtime test is made here. */
+jsimd_can_h2v2_downsample (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)  /* Returns 1 when the build-time size guards pass; no runtime test is made here. */
+jsimd_can_h2v1_downsample (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)  /* Forwards straight to the SSE2 h2v2 downsample routine; no feature test in this function. */
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_sse2(cinfo->image_width,
+                             cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor,
+                             compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(void)  /* Forwards straight to the SSE2 h2v1 downsample routine; no feature test in this function. */
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_sse2(cinfo->image_width,
+                             cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor,
+                             compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(int)  /* Returns 1 when the build-time size guards pass; no runtime test is made here. */
+jsimd_can_h2v2_upsample (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)  /* Returns 1 when the build-time size guards pass; no runtime test is made here. */
+jsimd_can_h2v1_upsample (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)  /* Forwards straight to the SSE2 h2v2 upsample routine; no feature test in this function. */
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{  /* compptr is not referenced in this body */
+  jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor,
+                           cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(void)  /* Forwards straight to the SSE2 h2v1 upsample routine; no feature test in this function. */
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{  /* compptr is not referenced in this body */
+  jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor,
+                           cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and jconst_fancy_upsample_sse2 is 16-byte aligned. */
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  /* The only runtime check: the constant table's address alignment. */
+  if (!IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and jconst_fancy_upsample_sse2 is 16-byte aligned. */
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  /* The only runtime check: the constant table's address alignment. */
+  if (!IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)  /* Forwards straight to the SSE2 h2v2 fancy-upsample routine; no checks in this function. */
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{  /* width passed on is compptr->downsampled_width, not cinfo->output_width */
+  jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width,
+                                 input_data, output_data_ptr);
+}
+
+GLOBAL(void)  /* Forwards straight to the SSE2 h2v1 fancy-upsample routine; no checks in this function. */
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{  /* width passed on is compptr->downsampled_width, not cinfo->output_width */
+  jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width,
+                                 input_data, output_data_ptr);
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and jconst_merged_upsample_sse2 is 16-byte aligned. */
+jsimd_can_h2v2_merged_upsample (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  /* The only runtime check: the constant table's address alignment. */
+  if (!IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and jconst_merged_upsample_sse2 is 16-byte aligned. */
+jsimd_can_h2v1_merged_upsample (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  /* The only runtime check: the constant table's address alignment. */
+  if (!IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)  /* Selects the SSE2 h2v2 merged-upsample routine for cinfo->out_color_space and calls it. */
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  /* The pointer is always assigned: the default case covers every other colour space. */
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
+      break;
+    default:  /* anything else falls back to the non-"ext" routine */
+      sse2fct=jsimd_h2v2_merged_upsample_sse2;
+      break;
+  }
+  /* Called unconditionally: no alignment or CPU test is made in this function. */
+  sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)  /* Selects the SSE2 h2v1 merged-upsample routine for cinfo->out_color_space and calls it. */
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  /* The pointer is always assigned: the default case covers every other colour space. */
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
+      break;
+    default:  /* anything else falls back to the non-"ext" routine */
+      sse2fct=jsimd_h2v1_merged_upsample_sse2;
+      break;
+  }
+  /* Called unconditionally: no alignment or CPU test is made in this function. */
+  sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)  /* Returns 1 when the build-time size guards pass; no runtime test is made here. */
+jsimd_can_convsamp (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)  /* Returns 1 when the build-time size guards pass; no runtime test is made here. */
+jsimd_can_convsamp_float (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)  /* Forwards all three arguments to jsimd_convsamp_sse2(); no checks in this function. */
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+  jsimd_convsamp_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)  /* Forwards all three arguments to jsimd_convsamp_float_sse2(); no checks in this function. */
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+  jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and jconst_fdct_islow_sse2 is 16-byte aligned. */
+jsimd_can_fdct_islow (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+  /* The only runtime check: the constant table's address alignment. */
+  if (!IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and jconst_fdct_ifast_sse2 is 16-byte aligned. */
+jsimd_can_fdct_ifast (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+  /* The only runtime check: the constant table's address alignment. */
+  if (!IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)  /* Returns 1 when the size guards pass and jconst_fdct_float_sse is 16-byte aligned. */
+jsimd_can_fdct_float (void)
+{
+  /* Build-time guards: the SIMD code is written for these sizes only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  /* Note the table checked is the _sse one (not _sse2), matching the routine jsimd_fdct_float() calls. */
+  if (!IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)  /* Forwards data to jsimd_fdct_islow_sse2(); no checks in this function. */
+jsimd_fdct_islow (DCTELEM * data)
+{
+  jsimd_fdct_islow_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+  jsimd_fdct_ifast_sse2(data);  /* unconditional SSE2 dispatch */
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+  jsimd_fdct_float_sse(data);  /* unconditional dispatch to the SSE (not SSE2) routine */
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  /* Needs 8x8 DCT, 2-byte JCOEF and 2-byte DCTELEM */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  return 1;  /* every precondition above holds */
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  /* Needs 8x8 DCT, 2-byte JCOEF and 4-byte FAST_FLOAT */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  return 1;  /* every precondition above holds */
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+  jsimd_quantize_sse2(coef_block, divisors, workspace);  /* unconditional SSE2 dispatch, arguments forwarded as-is */
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+  jsimd_quantize_float_sse2(coef_block, divisors, workspace);  /* unconditional SSE2 dispatch, arguments forwarded as-is */
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  /* Needs 8x8 DCT, 8-bit samples, 4-byte JDIMENSION, 2-byte JCOEF/ISLOW_MULT_TYPE */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_idct_red_sse2))  /* same table is checked by jsimd_can_idct_4x4 */
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  /* Needs 8x8 DCT, 8-bit samples, 4-byte JDIMENSION, 2-byte JCOEF/ISLOW_MULT_TYPE */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_idct_red_sse2))  /* same table is checked by jsimd_can_idct_2x2 */
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);  /* cinfo unused; only compptr->dct_table is passed on */
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);  /* cinfo unused; only compptr->dct_table is passed on */
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  /* Needs 8x8 DCT, 8-bit samples, 4-byte JDIMENSION, 2-byte JCOEF/ISLOW_MULT_TYPE */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_idct_islow_sse2))  /* macro not visible here; presumably an alignment test -- confirm */
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  /* Needs 8x8 DCT, 8-bit samples, 4-byte JDIMENSION, 2-byte JCOEF/IFAST_MULT_TYPE */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)  /* fractional bits in the scale factors must be exactly 2 */
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_idct_ifast_sse2))  /* macro not visible here; presumably an alignment test -- confirm */
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  if (DCTSIZE != 8)  /* The code is optimised for these values only */
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  if (sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_idct_float_sse2))  /* macro not visible here; presumably an alignment test -- confirm */
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col);  /* cinfo unused; only compptr->dct_table is passed on */
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col);  /* cinfo unused; only compptr->dct_table is passed on */
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  jsimd_idct_float_sse2(compptr->dct_table, coef_block,
+                        output_buf, output_col);  /* cinfo unused; only compptr->dct_table is passed on */
+}
+
diff --git a/jpeg/simd/jsimdcfg.inc b/jpeg/simd/jsimdcfg.inc
new file mode 100644
index 000000000000..68e22e8691f1
--- /dev/null
+++ b/jpeg/simd/jsimdcfg.inc
@@ -0,0 +1,69 @@
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+;
+; -- jpeglib.h
+;
+%define DCTSIZE 8
+%define DCTSIZE2 64
+;
+; -- jmorecfg.h
+;
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+%define JSAMPLE byte ; unsigned char
+%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
+%define CENTERJSAMPLE 128
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF word ; short
+%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION dword ; unsigned int
+%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
+%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
+%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
+%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
+%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
+%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
+;
+; -- jdct.h
+;
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM word ; short
+%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
+%define FAST_FLOAT FP32 ; float (the macro is named FAST_FLOAT; cpp had expanded the name itself)
+%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT)
+; To maximize parallelism, Type MULTIPLIER is changed to short.
+;
+%define ISLOW_MULT_TYPE word ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
+%define IFAST_MULT_TYPE word ; must be short
+%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
+%define FLOAT_MULT_TYPE FP32 ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
+;
+; -- jsimd.h
+;
+%define JSIMD_NONE 0x00
+%define JSIMD_MMX 0x01
+%define JSIMD_3DNOW 0x02
+%define JSIMD_SSE 0x04
+%define JSIMD_SSE2 0x08
+; Short forms of external names for systems with brain-damaged linkers.
+;
diff --git a/jpeg/simd/jsimdcfg.inc.h b/jpeg/simd/jsimdcfg.inc.h
new file mode 100644
index 000000000000..4876038bc8c5
--- /dev/null
+++ b/jpeg/simd/jsimdcfg.inc.h
@@ -0,0 +1,168 @@
+// This file generates the include file for the assembly
+// implementations by abusing the C preprocessor.
+//
+// Note: Some things are manually defined as they need to
+// be mapped to NASM types.
+
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+
+#define JPEG_INTERNALS
+
+#include "../jpeglib.h"
+#include "../jconfig.h"
+#include "../jmorecfg.h"
+#include "jsimd.h"
+
+#define define(var) %define _cpp_protection_##var
+#define definev(var) %define _cpp_protection_##var var
+
+;
+; -- jpeglib.h
+;
+
+definev(DCTSIZE)
+definev(DCTSIZE2)
+
+;
+; -- jmorecfg.h
+;
+
+definev(RGB_RED)
+definev(RGB_GREEN)
+definev(RGB_BLUE)
+
+definev(RGB_PIXELSIZE)
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+
+%define JSAMPLE                 byte          ; unsigned char
+%define SIZEOF_JSAMPLE          SIZEOF_BYTE   ; sizeof(JSAMPLE)
+
+definev(CENTERJSAMPLE)
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF                   word          ; short
+%define SIZEOF_JCOEF            SIZEOF_WORD   ; sizeof(JCOEF)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION              dword         ; unsigned int
+%define SIZEOF_JDIMENSION       SIZEOF_DWORD  ; sizeof(JDIMENSION)
+
+%define JSAMPROW                POINTER       ; JSAMPLE FAR * (jpeglib.h)
+%define JSAMPARRAY              POINTER       ; JSAMPROW *    (jpeglib.h)
+%define JSAMPIMAGE              POINTER       ; JSAMPARRAY *  (jpeglib.h)
+%define JCOEFPTR                POINTER       ; JCOEF FAR *   (jpeglib.h)
+%define SIZEOF_JSAMPROW         SIZEOF_POINTER  ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY       SIZEOF_POINTER  ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE       SIZEOF_POINTER  ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR         SIZEOF_POINTER  ; sizeof(JCOEFPTR)
+
+;
+; -- jdct.h
+;
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM                 word          ; short
+%define SIZEOF_DCTELEM          SIZEOF_WORD   ; sizeof(DCTELEM)
+
+%define _cpp_protection_FAST_FLOAT  FP32        ; float (prefix keeps cpp from expanding the name, as definev() does)
+%define SIZEOF_FAST_FLOAT       SIZEOF_FP32     ; sizeof(FAST_FLOAT)
+
+; To maximize parallelism, Type _cpp_protection_MULTIPLIER is changed to short.
+;
+%define ISLOW_MULT_TYPE         word          ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE  SIZEOF_WORD   ; sizeof(ISLOW_MULT_TYPE)
+
+%define IFAST_MULT_TYPE         word          ; must be short
+%define SIZEOF_IFAST_MULT_TYPE  SIZEOF_WORD   ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS        2             ; fractional bits in scale factors
+
+%define FLOAT_MULT_TYPE         FP32          ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE  SIZEOF_FP32   ; sizeof(FLOAT_MULT_TYPE)
+
+;
+; -- jsimd.h
+;
+
+definev(JSIMD_NONE)
+definev(JSIMD_MMX)
+definev(JSIMD_3DNOW)
+definev(JSIMD_SSE)
+definev(JSIMD_SSE2)
+
+; Short forms of external names for systems with brain-damaged linkers.
+;
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+definev(jpeg_simd_cpu_support)
+definev(jsimd_rgb_ycc_convert_mmx)
+definev(jsimd_ycc_rgb_convert_mmx)
+definev(jconst_rgb_ycc_convert_sse2)
+definev(jsimd_rgb_ycc_convert_sse2)
+definev(jconst_ycc_rgb_convert_sse2)
+definev(jsimd_ycc_rgb_convert_sse2)
+definev(jsimd_h2v2_downsample_mmx)
+definev(jsimd_h2v1_downsample_mmx)
+definev(jsimd_h2v2_downsample_sse2)
+definev(jsimd_h2v1_downsample_sse2)
+definev(jsimd_h2v2_upsample_mmx)
+definev(jsimd_h2v1_upsample_mmx)
+definev(jsimd_h2v1_fancy_upsample_mmx)
+definev(jsimd_h2v2_fancy_upsample_mmx)
+definev(jsimd_h2v1_merged_upsample_mmx)
+definev(jsimd_h2v2_merged_upsample_mmx)
+definev(jsimd_h2v2_upsample_sse2)
+definev(jsimd_h2v1_upsample_sse2)
+definev(jconst_fancy_upsample_sse2)
+definev(jsimd_h2v1_fancy_upsample_sse2)
+definev(jsimd_h2v2_fancy_upsample_sse2)
+definev(jconst_merged_upsample_sse2)
+definev(jsimd_h2v1_merged_upsample_sse2)
+definev(jsimd_h2v2_merged_upsample_sse2)
+definev(jsimd_convsamp_mmx)
+definev(jsimd_convsamp_sse2)
+definev(jsimd_convsamp_float_3dnow)
+definev(jsimd_convsamp_float_sse)
+definev(jsimd_convsamp_float_sse2)
+definev(jsimd_fdct_islow_mmx)
+definev(jsimd_fdct_ifast_mmx)
+definev(jconst_fdct_islow_sse2)
+definev(jsimd_fdct_islow_sse2)
+definev(jconst_fdct_ifast_sse2)
+definev(jsimd_fdct_ifast_sse2)
+definev(jsimd_fdct_float_3dnow)
+definev(jconst_fdct_float_sse)
+definev(jsimd_fdct_float_sse)
+definev(jsimd_quantize_mmx)
+definev(jsimd_quantize_sse2)
+definev(jsimd_quantize_float_3dnow)
+definev(jsimd_quantize_float_sse)
+definev(jsimd_quantize_float_sse2)
+definev(jsimd_idct_2x2_mmx)
+definev(jsimd_idct_4x4_mmx)
+definev(jconst_idct_red_sse2)
+definev(jsimd_idct_2x2_sse2)
+definev(jsimd_idct_4x4_sse2)
+definev(jsimd_idct_islow_mmx)
+definev(jsimd_idct_ifast_mmx)
+definev(jconst_idct_islow_sse2)
+definev(jsimd_idct_islow_sse2)
+definev(jconst_idct_ifast_sse2)
+definev(jsimd_idct_ifast_sse2)
+definev(jsimd_idct_float_3dnow)
+definev(jconst_idct_float_sse)
+definev(jsimd_idct_float_sse)
+definev(jconst_idct_float_sse2)
+definev(jsimd_idct_float_sse2)
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
diff --git a/jpeg/simd/jsimdcpu.asm b/jpeg/simd/jsimdcpu.asm
new file mode 100644
index 000000000000..bdbcc2317964
--- /dev/null
+++ b/jpeg/simd/jsimdcpu.asm
@@ -0,0 +1,105 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support (void)
+;
+
+	align	16
+	global	EXTN(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+	push	ebx			; cpuid overwrites ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+	push	edi			; edi accumulates the JSIMD_* result
+
+	xor	edi,edi			; simd support flag
+
+	pushfd
+	pop	eax
+	mov	edx,eax			; edx = original EFLAGS
+	xor	eax, 1<<21		; flip ID bit in EFLAGS
+	push	eax
+	popfd
+	pushfd
+	pop	eax
+	xor	eax,edx			; nonzero only if the ID bit really changed
+	jz	short .return		; CPUID is not supported
+
+	; Check for MMX, SSE and SSE2 instruction support
+	xor	eax,eax			; leaf 0
+	cpuid
+	test	eax,eax			; eax = highest standard leaf
+	jz	short .return		; leaf 1 (feature flags) unavailable
+
+	xor	eax,eax
+	inc	eax			; leaf 1
+	cpuid
+	mov	eax,edx			; eax = Standard feature flags
+
+	test	eax, 1<<23		; bit23:MMX
+	jz	short .no_mmx
+	or	edi, byte JSIMD_MMX
+.no_mmx:
+	test	eax, 1<<25		; bit25:SSE
+	jz	short .no_sse
+	or	edi, byte JSIMD_SSE
+.no_sse:
+	test	eax, 1<<26		; bit26:SSE2
+	jz	short .no_sse2
+	or	edi, byte JSIMD_SSE2
+.no_sse2:
+
+	; Check for 3DNow! instruction support
+	mov	eax, 0x80000000
+	cpuid
+	cmp	eax, 0x80000000
+	jbe	short .return		; no extended leaf above 0x80000000
+
+	mov	eax, 0x80000001
+	cpuid
+	mov	eax,edx			; eax = Extended feature flags
+
+	test	eax, 1<<31		; bit31:3DNow!(vendor independent)
+	jz	short .no_3dnow
+	or	edi, byte JSIMD_3DNOW
+.no_3dnow:
+
+.return:
+	mov	eax,edi			; return value = accumulated flags
+
+	pop	edi
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
diff --git a/jpeg/simd/jsimdext.inc b/jpeg/simd/jsimdext.inc
new file mode 100644
index 000000000000..4ea3d17c577a
--- /dev/null
+++ b/jpeg/simd/jsimdext.inc
@@ -0,0 +1,372 @@
+;
+; jsimdext.inc - common declarations
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2010 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty.  In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+;    claim that you wrote the original software. If you use this software
+;    in a product, an acknowledgment in the product documentation would be
+;    appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+;    misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+;
+; [TAB8]
+
+; ==========================================================================
+;  System-dependent configurations
+
+%ifdef WIN32	; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  align=16 public use32 class=CODE
+%define SEG_CONST   .rdata align=16 public use32 class=CONST
+
+%elifdef WIN64	; ----(nasm -fwin64 -DWIN64 ...)--------
+; * Microsoft Visual C++
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  align=16 public use64 class=CODE
+%define SEG_CONST   .rdata align=16 public use64 class=CONST
+%ifdef MSVC
+%define EXTN(name)  name			; foo() -> foo
+%endif
+
+%elifdef OBJ32	; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  align=16 public use32 class=CODE
+%define SEG_CONST   .data  align=16 public use32 class=DATA
+
+%elifdef ELF	; ----(nasm -felf[64] -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; mark stack as non-executable
+section .note.GNU-stack noalloc noexec nowrite progbits
+
+; -- segment definition --
+;
+%ifdef __x86_64__
+%define SEG_TEXT    .text   progbits align=16
+%define SEG_CONST   .rodata progbits align=16
+%else
+%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
+%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
+%endif
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_	; ELF supports PIC
+%define EXTN(name)  name			; foo() -> foo
+
+%elifdef AOUT	; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text
+%define SEG_CONST   .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_	; BSD-style a.out supports PIC
+
+%elifdef MACHO	; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  ;align=16	; nasm doesn't accept align=16. why?
+%define SEG_CONST   .rodata align=16
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL  _MACHO_PIC_		; Mach-O style code-relative addressing
+
+%else		; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text
+%define SEG_CONST   .data
+
+%endif	; ----------------------------------------------
+
+; ==========================================================================
+
+; --------------------------------------------------------------------------
+;  Common types
+;
+%ifdef __x86_64__
+%define POINTER                 qword           ; general pointer type
+%define SIZEOF_POINTER          SIZEOF_QWORD    ; sizeof(POINTER)
+%define POINTER_BIT             QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%else
+%define POINTER                 dword           ; general pointer type
+%define SIZEOF_POINTER          SIZEOF_DWORD    ; sizeof(POINTER)
+%define POINTER_BIT             DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%endif
+
+%define INT                     dword           ; signed integer type
+%define SIZEOF_INT              SIZEOF_DWORD    ; sizeof(INT)
+%define INT_BIT                 DWORD_BIT       ; sizeof(INT)*BYTE_BIT
+
+%define FP32                    dword           ; IEEE754 single
+%define SIZEOF_FP32             SIZEOF_DWORD    ; sizeof(FP32)
+%define FP32_BIT                DWORD_BIT       ; sizeof(FP32)*BYTE_BIT
+
+%define MMWORD                  qword           ; int64  (MMX register)
+%define SIZEOF_MMWORD           SIZEOF_QWORD    ; sizeof(MMWORD)
+%define MMWORD_BIT              QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT
+
+; NASM is buggy and doesn't properly handle operand sizes for SSE
+; instructions, so for now we have to define XMMWORD as blank.
+%define XMMWORD                                 ; int128 (SSE register)
+%define SIZEOF_XMMWORD          SIZEOF_OWORD    ; sizeof(XMMWORD)
+%define XMMWORD_BIT             OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT
+
+; Similar hacks for when we load a dword or MMWORD into an xmm# register
+%define XMM_DWORD
+%define XMM_MMWORD
+
+%define SIZEOF_BYTE             1               ; sizeof(BYTE)
+%define SIZEOF_WORD             2               ; sizeof(WORD)
+%define SIZEOF_DWORD            4               ; sizeof(DWORD)
+%define SIZEOF_QWORD            8               ; sizeof(QWORD)
+%define SIZEOF_OWORD            16              ; sizeof(OWORD)
+
+%define BYTE_BIT                8               ; CHAR_BIT in C
+%define WORD_BIT                16              ; sizeof(WORD)*BYTE_BIT
+%define DWORD_BIT               32              ; sizeof(DWORD)*BYTE_BIT
+%define QWORD_BIT               64              ; sizeof(QWORD)*BYTE_BIT
+%define OWORD_BIT               128             ; sizeof(OWORD)*BYTE_BIT
+
+; --------------------------------------------------------------------------
+;  External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name)   _ %+ name		; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+;  Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC ; -------------------------------------------
+
+%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+	SECTION	SEG_CONST
+const_base:
+
+%define GOTOFF(got,sym) (got) + (sym) - const_base
+
+%imacro get_GOT	1
+	; NOTE: this macro destroys the ecx register.
+	call	%%geteip		; ecx = address of the following "add"
+	add	ecx, byte (%%ref - $)	; ecx = run-time address of %%ref
+	jmp	short %%adjust
+%%geteip:
+	mov	ecx, POINTER [esp]	; fetch the return address pushed by "call"
+	ret
+%%adjust:
+	push	ebp
+	xor	ebp,ebp		; ebp = 0
+%ifidni %1,ebx	; (%1 == ebx)
+	; db 0x8D,0x9C + jmp near const_base =
+	;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+	db	0x8D,0x9C		; 8D,9C
+	jmp	near const_base		; E9,(const_base-%%ref)
+%%ref:
+%else  ; (%1 != ebx)
+	; db 0x8D,0x8C + jmp near const_base =
+	;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+	db	0x8D,0x8C		; 8D,8C
+	jmp	near const_base		; E9,(const_base-%%ref)
+%%ref:	mov	%1, ecx
+%endif ; (%1 == ebx)
+	pop	ebp
+%endmacro
+
+%else	; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
+
+%imacro get_GOT	1
+	extern	GOT_SYMBOL
+	call	%%geteip		; %1 = address of the following "add"
+	add	%1, GOT_SYMBOL + $$ - $ wrt ..gotpc	; turn that address into the GOT base
+	jmp	short %%done
+%%geteip:
+	mov	%1, POINTER [esp]	; fetch the return address pushed by "call"
+	ret
+%%done:
+%endmacro
+
+%endif	; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+%imacro pushpic	1.nolist
+	push	%1			; PIC build: emits a real push
+%endmacro
+%imacro poppic	1.nolist
+	pop	%1			; PIC build: emits a real pop
+%endmacro
+%imacro movpic	2.nolist
+	mov	%1,%2			; PIC build: emits a real mov
+%endmacro
+
+%else	; !PIC -----------------------------------------
+
+%define GOTOFF(got,sym) (sym)		; non-PIC: the symbol is referenced directly, "got" is ignored
+
+%imacro get_GOT	1.nolist
+%endmacro
+%imacro pushpic	1.nolist
+%endmacro
+%imacro poppic	1.nolist
+%endmacro
+%imacro movpic	2.nolist
+%endmacro
+
+%endif	;  PIC -----------------------------------------
+
+; --------------------------------------------------------------------------
+;  Align the next instruction on {2,4,8,16,..}-byte boundary.
+;  ".balign n,,m" in GNU as
+;
+%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)	; low 16 bits all ones if x <= y, else 0
+%define FILLB(b,n)  (($$-(b)) & ((n)-1))	; bytes from b up to the next n-byte boundary (n = power of 2)
+
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs:	times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
+	       db 0x90                               ; nop
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
+	       db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
+	       db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
+	       db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
+	       db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
+	       db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
+	       db 0x8B,0xED                          ; mov ebp,ebp
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
+	       db 0x90                               ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+%imacro alignz 1.nolist
+	align %1, db 0		; filling zeros
+%endmacro
+
+%ifdef __x86_64__
+
+%ifdef WIN64
+
+%imacro collect_args 0
+	push r10			; r10-r15 are reused below to hold the six arguments
+	push r11
+	push r12
+	push r13
+	push r14
+	push r15
+	mov r10, rcx			; argument 1
+	mov r11, rdx			; argument 2
+	mov r12, r8			; argument 3
+	mov r13, r9			; argument 4
+	mov r14, [rax+48]		; argument 5 (stack); NOTE(review): presumes rax = rsp taken right after the caller's "push rbp" -- confirm at use sites
+	mov r15, [rax+56]		; argument 6 (stack); same assumption about rax
+	push rsi			; callee-saved in the Microsoft x64 ABI
+	push rdi			; callee-saved in the Microsoft x64 ABI
+	sub     rsp, SIZEOF_XMMWORD
+	movdqu  XMMWORD [rsp], xmm6	; save all 128 bits (movlpd kept only the low 64); unaligned store, no rsp alignment needed
+	sub     rsp, SIZEOF_XMMWORD
+	movdqu  XMMWORD [rsp], xmm7	; save all 128 bits of xmm7 as well
+%endmacro
+
+%imacro uncollect_args 0
+	movdqu  xmm7, XMMWORD [rsp]	; restore the full register saved by collect_args
+	add     rsp, SIZEOF_XMMWORD
+	movdqu  xmm6, XMMWORD [rsp]	; restore the full register saved by collect_args
+	add     rsp, SIZEOF_XMMWORD
+	pop rdi				; pops mirror the pushes in collect_args, in reverse order
+	pop rsi
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop r11
+	pop r10
+%endmacro
+
+%else
+
+%imacro collect_args 0
+	push r10		; r10-r15 are reused below to hold the six arguments
+	push r11
+	push r12
+	push r13
+	push r14
+	push r15
+	mov r10, rdi		; argument 1
+	mov r11, rsi		; argument 2
+	mov r12, rdx		; argument 3
+	mov r13, rcx		; argument 4
+	mov r14, r8		; argument 5
+	mov r15, r9		; argument 6
+%endmacro
+
+%imacro uncollect_args 0
+	pop r15			; pops mirror the pushes in collect_args, in reverse order
+	pop r14
+	pop r13
+	pop r12
+	pop r11
+	pop r10
+%endmacro
+
+%endif
+
+%endif
+
+; --------------------------------------------------------------------------
+;  Defines picked up from the C headers
+;
+%include "jsimdcfg.inc"
+
+; --------------------------------------------------------------------------
diff --git a/jpeg/structure.doc b/jpeg/structure.doc
deleted file mode 100644
index 51c9def7e5d0..000000000000
--- a/jpeg/structure.doc
+++ /dev/null
@@ -1,948 +0,0 @@
-IJG JPEG LIBRARY:  SYSTEM ARCHITECTURE
-
-Copyright (C) 1991-1995, Thomas G. Lane.
-This file is part of the Independent JPEG Group's software.
-For conditions of distribution and use, see the accompanying README file.
-
-
-This file provides an overview of the architecture of the IJG JPEG software;
-that is, the functions of the various modules in the system and the interfaces
-between modules.  For more precise details about any data structure or calling
-convention, see the include files and comments in the source code.
-
-We assume that the reader is already somewhat familiar with the JPEG standard.
-The README file includes references for learning about JPEG.  The file
-libjpeg.doc describes the library from the viewpoint of an application
-programmer using the library; it's best to read that file before this one.
-Also, the file coderules.doc describes the coding style conventions we use.
-
-In this document, JPEG-specific terminology follows the JPEG standard:
-  A "component" means a color channel, e.g., Red or Luminance.
-  A "sample" is a single component value (i.e., one number in the image data).
-  A "coefficient" is a frequency coefficient (a DCT transform output number).
-  A "block" is an 8x8 group of samples or coefficients.
-  An "MCU" (minimum coded unit) is an interleaved set of blocks of size
-	determined by the sampling factors, or a single block in a
-	noninterleaved scan.
-We do not use the terms "pixel" and "sample" interchangeably.  When we say
-pixel, we mean an element of the full-size image, while a sample is an element
-of the downsampled image.  Thus the number of samples may vary across
-components while the number of pixels does not.  (This terminology is not used
-rigorously throughout the code, but it is used in places where confusion would
-otherwise result.)
-
-
-*** System features ***
-
-The IJG distribution contains two parts:
-  * A subroutine library for JPEG compression and decompression.
-  * cjpeg/djpeg, two sample applications that use the library to transform
-    JFIF JPEG files to and from several other image formats.
-cjpeg/djpeg are of no great intellectual complexity: they merely add a simple
-command-line user interface and I/O routines for several uncompressed image
-formats.  This document concentrates on the library itself.
-
-We desire the library to be capable of supporting all JPEG baseline, extended
-sequential, and progressive DCT processes.  Hierarchical processes are not
-supported.
-
-The library does not support the lossless (spatial) JPEG process.  Lossless
-JPEG shares little or no code with lossy JPEG, and would normally be used
-without the extensive pre- and post-processing provided by this library.
-We feel that lossless JPEG is better handled by a separate library.
-
-Within these limits, any set of compression parameters allowed by the JPEG
-spec should be readable for decompression.  (We can be more restrictive about
-what formats we can generate.)  Although the system design allows for all
-parameter values, some uncommon settings are not yet implemented and may
-never be; nonintegral sampling ratios are the prime example.  Furthermore,
-we treat 8-bit vs. 12-bit data precision as a compile-time switch, not a
-run-time option, because most machines can store 8-bit pixels much more
-compactly than 12-bit.
-
-For legal reasons, JPEG arithmetic coding is not currently supported, but
-extending the library to include it would be straightforward.
-
-By itself, the library handles only interchange JPEG datastreams --- in
-particular the widely used JFIF file format.  The library can be used by
-surrounding code to process interchange or abbreviated JPEG datastreams that
-are embedded in more complex file formats.  (For example, libtiff uses this
-library to implement JPEG compression within the TIFF file format.)
-
-The library includes a substantial amount of code that is not covered by the
-JPEG standard but is necessary for typical applications of JPEG.  These
-functions preprocess the image before JPEG compression or postprocess it after
-decompression.  They include colorspace conversion, downsampling/upsampling,
-and color quantization.  This code can be omitted if not needed.
-
-A wide range of quality vs. speed tradeoffs are possible in JPEG processing,
-and even more so in decompression postprocessing.  The decompression library
-provides multiple implementations that cover most of the useful tradeoffs,
-ranging from very-high-quality down to fast-preview operation.  On the
-compression side we have generally not provided low-quality choices, since
-compression is normally less time-critical.  It should be understood that the
-low-quality modes may not meet the JPEG standard's accuracy requirements;
-nonetheless, they are useful for viewers.
-
-
-*** Portability issues ***
-
-Portability is an essential requirement for the library.  The key portability
-issues that show up at the level of system architecture are:
-
-1.  Memory usage.  We want the code to be able to run on PC-class machines
-with limited memory.  Images should therefore be processed sequentially (in
-strips), to avoid holding the whole image in memory at once.  Where a
-full-image buffer is necessary, we should be able to use either virtual memory
-or temporary files.
-
-2.  Near/far pointer distinction.  To run efficiently on 80x86 machines, the
-code should distinguish "small" objects (kept in near data space) from
-"large" ones (kept in far data space).  This is an annoying restriction, but
-fortunately it does not impact code quality for less brain-damaged machines,
-and the source code clutter turns out to be minimal with sufficient use of
-pointer typedefs.
-
-3. Data precision.  We assume that "char" is at least 8 bits, "short" and
-"int" at least 16, "long" at least 32.  The code will work fine with larger
-data sizes, although memory may be used inefficiently in some cases.  However,
-the JPEG compressed datastream must ultimately appear on external storage as a
-sequence of 8-bit bytes if it is to conform to the standard.  This may pose a
-problem on machines where char is wider than 8 bits.  The library represents
-compressed data as an array of values of typedef JOCTET.  If no data type
-exactly 8 bits wide is available, custom data source and data destination
-modules must be written to unpack and pack the chosen JOCTET datatype into
-8-bit external representation.
-
-
-*** System overview ***
-
-The compressor and decompressor are each divided into two main sections:
-the JPEG compressor or decompressor proper, and the preprocessing or
-postprocessing functions.  The interface between these two sections is the
-image data that the official JPEG spec regards as its input or output: this
-data is in the colorspace to be used for compression, and it is downsampled
-to the sampling factors to be used.  The preprocessing and postprocessing
-steps are responsible for converting a normal image representation to or from
-this form.  (Those few applications that want to deal with YCbCr downsampled
-data can skip the preprocessing or postprocessing step.)
-
-Looking more closely, the compressor library contains the following main
-elements:
-
-  Preprocessing:
-    * Color space conversion (e.g., RGB to YCbCr).
-    * Edge expansion and downsampling.  Optionally, this step can do simple
-      smoothing --- this is often helpful for low-quality source data.
-  JPEG proper:
-    * MCU assembly, DCT, quantization.
-    * Entropy coding (sequential or progressive, Huffman or arithmetic).
-
-In addition to these modules we need overall control, marker generation,
-and support code (memory management & error handling).  There is also a
-module responsible for physically writing the output data --- typically
-this is just an interface to fwrite(), but some applications may need to
-do something else with the data.
-
-The decompressor library contains the following main elements:
-
-  JPEG proper:
-    * Entropy decoding (sequential or progressive, Huffman or arithmetic).
-    * Dequantization, inverse DCT, MCU disassembly.
-  Postprocessing:
-    * Upsampling.  Optionally, this step may be able to do more general
-      rescaling of the image.
-    * Color space conversion (e.g., YCbCr to RGB).  This step may also
-      provide gamma adjustment [ currently it does not ].
-    * Optional color quantization (e.g., reduction to 256 colors).
-    * Optional color precision reduction (e.g., 24-bit to 15-bit color).
-      [This feature is not currently implemented.]
-
-We also need overall control, marker parsing, and a data source module.
-The support code (memory management & error handling) can be shared with
-the compression half of the library.
-
-There may be several implementations of each of these elements, particularly
-in the decompressor, where a wide range of speed/quality tradeoffs is very
-useful.  It must be understood that some of the best speedups involve
-merging adjacent steps in the pipeline.  For example, upsampling, color space
-conversion, and color quantization might all be done at once when using a
-low-quality ordered-dither technique.  The system architecture is designed to
-allow such merging where appropriate.
-
-
-Note: it is convenient to regard edge expansion (padding to block boundaries)
-as a preprocessing/postprocessing function, even though the JPEG spec includes
-it in compression/decompression.  We do this because downsampling/upsampling
-can be simplified a little if they work on padded data: it's not necessary to
-have special cases at the right and bottom edges.  Therefore the interface
-buffer is always an integral number of blocks wide and high, and we expect
-compression preprocessing to pad the source data properly.  Padding will occur
-only to the next block (8-sample) boundary.  In an interleaved-scan situation,
-additional dummy blocks may be used to fill out MCUs, but the MCU assembly and
-disassembly logic will create or discard these blocks internally.  (This is
-advantageous for speed reasons, since we avoid DCTing the dummy blocks.
-It also permits a small reduction in file size, because the compressor can
-choose dummy block contents so as to minimize their size in compressed form.
-Finally, it makes the interface buffer specification independent of whether
-the file is actually interleaved or not.)  Applications that wish to deal
-directly with the downsampled data must provide similar buffering and padding
-for odd-sized images.
-
-
-*** Poor man's object-oriented programming ***
-
-It should be clear by now that we have a lot of quasi-independent processing
-steps, many of which have several possible behaviors.  To avoid cluttering the
-code with lots of switch statements, we use a simple form of object-style
-programming to separate out the different possibilities.
-
-For example, two different color quantization algorithms could be implemented
-as two separate modules that present the same external interface; at runtime,
-the calling code will access the proper module indirectly through an "object".
-
-We can get the limited features we need while staying within portable C.
-The basic tool is a function pointer.  An "object" is just a struct
-containing one or more function pointer fields, each of which corresponds to
-a method name in real object-oriented languages.  During initialization we
-fill in the function pointers with references to whichever module we have
-determined we need to use in this run.  Then invocation of the module is done
-by indirecting through a function pointer; on most machines this is no more
-expensive than a switch statement, which would be the only other way of
-making the required run-time choice.  The really significant benefit, of
-course, is keeping the source code clean and well structured.
-
-We can also arrange to have private storage that varies between different
-implementations of the same kind of object.  We do this by making all the
-module-specific object structs be separately allocated entities, which will
-be accessed via pointers in the master compression or decompression struct.
-The "public" fields or methods for a given kind of object are specified by
-a commonly known struct.  But a module's initialization code can allocate
-a larger struct that contains the common struct as its first member, plus
-additional private fields.  With appropriate pointer casting, the module's
-internal functions can access these private fields.  (For a simple example,
-see jdatadst.c, which implements the external interface specified by struct
-jpeg_destination_mgr, but adds extra fields.)
-
-(Of course this would all be a lot easier if we were using C++, but we are
-not yet prepared to assume that everyone has a C++ compiler.)
-
-An important benefit of this scheme is that it is easy to provide multiple
-versions of any method, each tuned to a particular case.  While a lot of
-precalculation might be done to select an optimal implementation of a method,
-the cost per invocation is constant.  For example, the upsampling step might
-have a "generic" method, plus one or more "hardwired" methods for the most
-popular sampling factors; the hardwired methods would be faster because they'd
-use straight-line code instead of for-loops.  The cost to determine which
-method to use is paid only once, at startup, and the selection criteria are
-hidden from the callers of the method.
-
-This plan differs a little bit from usual object-oriented structures, in that
-only one instance of each object class will exist during execution.  The
-reason for having the class structure is that on different runs we may create
-different instances (choose to execute different modules).  You can think of
-the term "method" as denoting the common interface presented by a particular
-set of interchangeable functions, and "object" as denoting a group of related
-methods, or the total shared interface behavior of a group of modules.
-
-
-*** Overall control structure ***
-
-We previously mentioned the need for overall control logic in the compression
-and decompression libraries.  In IJG implementations prior to v5, overall
-control was mostly provided by "pipeline control" modules, which proved to be
-large, unwieldy, and hard to understand.  To improve the situation, the
-control logic has been subdivided into multiple modules.  The control modules
-consist of:
-
-1. Master control for module selection and initialization.  This has two
-responsibilities:
-
-   1A.  Startup initialization at the beginning of image processing.
-        The individual processing modules to be used in this run are selected
-        and given initialization calls.
-
-   1B.  Per-pass control.  This determines how many passes will be performed
-        and calls each active processing module to configure itself
-        appropriately at the beginning of each pass.  End-of-pass processing,
-	where necessary, is also invoked from the master control module.
-
-   Method selection is partially distributed, in that a particular processing
-   module may contain several possible implementations of a particular method,
-   which it will select among when given its initialization call.  The master
-   control code need only be concerned with decisions that affect more than
-   one module.
- 
-2. Data buffering control.  A separate control module exists for each
-   inter-processing-step data buffer.  This module is responsible for
-   invoking the processing steps that write or read that data buffer.
-
-Each buffer controller sees the world as follows:
-
-input data => processing step A => buffer => processing step B => output data
-                      |              |               |
-              ------------------ controller ------------------
-
-The controller knows the dataflow requirements of steps A and B: how much data
-they want to accept in one chunk and how much they output in one chunk.  Its
-function is to manage its buffer and call A and B at the proper times.
-
-A data buffer control module may itself be viewed as a processing step by a
-higher-level control module; thus the control modules form a binary tree with
-elementary processing steps at the leaves of the tree.
-
-The control modules are objects.  A considerable amount of flexibility can
-be had by replacing implementations of a control module.  For example:
-* Merging of adjacent steps in the pipeline is done by replacing a control
-  module and its pair of processing-step modules with a single processing-
-  step module.  (Hence the possible merges are determined by the tree of
-  control modules.)
-* In some processing modes, a given interstep buffer need only be a "strip"
-  buffer large enough to accommodate the desired data chunk sizes.  In other
-  modes, a full-image buffer is needed and several passes are required.
-  The control module determines which kind of buffer is used and manipulates
-  virtual array buffers as needed.  One or both processing steps may be
-  unaware of the multi-pass behavior.
-
-In theory, we might be able to make all of the data buffer controllers
-interchangeable and provide just one set of implementations for all.  In
-practice, each one contains considerable special-case processing for its
-particular job.  The buffer controller concept should be regarded as an
-overall system structuring principle, not as a complete description of the
-task performed by any one controller.
-
-
-*** Compression object structure ***
-
-Here is a sketch of the logical structure of the JPEG compression library:
-
-                                                 |-- Colorspace conversion
-                  |-- Preprocessing controller --|
-                  |                              |-- Downsampling
-Main controller --|
-                  |                            |-- Forward DCT, quantize
-                  |-- Coefficient controller --|
-                                               |-- Entropy encoding
-
-This sketch also describes the flow of control (subroutine calls) during
-typical image data processing.  Each of the components shown in the diagram is
-an "object" which may have several different implementations available.  One
-or more source code files contain the actual implementation(s) of each object.
-
-The objects shown above are:
-
-* Main controller: buffer controller for the subsampled-data buffer, which
-  holds the preprocessed input data.  This controller invokes preprocessing to
-  fill the subsampled-data buffer, and JPEG compression to empty it.  There is
-  usually no need for a full-image buffer here; a strip buffer is adequate.
-
-* Preprocessing controller: buffer controller for the downsampling input data
-  buffer, which lies between colorspace conversion and downsampling.  Note
-  that a unified conversion/downsampling module would probably replace this
-  controller entirely.
-
-* Colorspace conversion: converts application image data into the desired
-  JPEG color space; also changes the data from pixel-interleaved layout to
-  separate component planes.  Processes one pixel row at a time.
-
-* Downsampling: performs reduction of chroma components as required.
-  Optionally may perform pixel-level smoothing as well.  Processes a "row
-  group" at a time, where a row group is defined as Vmax pixel rows of each
-  component before downsampling, and Vk sample rows afterwards (remember Vk
-  differs across components).  Some downsampling or smoothing algorithms may
-  require context rows above and below the current row group; the
-  preprocessing controller is responsible for supplying these rows via proper
-  buffering.  The downsampler is responsible for edge expansion at the right
-  edge (i.e., extending each sample row to a multiple of 8 samples); but the
-  preprocessing controller is responsible for vertical edge expansion (i.e.,
-  duplicating the bottom sample row as needed to make a multiple of 8 rows).
-
-* Coefficient controller: buffer controller for the DCT-coefficient data.
-  This controller handles MCU assembly, including insertion of dummy DCT
-  blocks when needed at the right or bottom edge.  When performing
-  Huffman-code optimization or emitting a multiscan JPEG file, this
-  controller is responsible for buffering the full image.  The equivalent of
-  one fully interleaved MCU row of subsampled data is processed per call,
-  even when the JPEG file is noninterleaved.
-
-* Forward DCT and quantization: Perform DCT, quantize, and emit coefficients.
-  Works on one or more DCT blocks at a time.  (Note: the coefficients are now
-  emitted in normal array order, which the entropy encoder is expected to
-  convert to zigzag order as necessary.  Prior versions of the IJG code did
-  the conversion to zigzag order within the quantization step.)
-
-* Entropy encoding: Perform Huffman or arithmetic entropy coding and emit the
-  coded data to the data destination module.  Works on one MCU per call.
-  For progressive JPEG, the same DCT blocks are fed to the entropy coder
-  during each pass, and the coder must emit the appropriate subset of
-  coefficients.
-
-In addition to the above objects, the compression library includes these
-objects:
-
-* Master control: determines the number of passes required, controls overall
-  and per-pass initialization of the other modules.
-
-* Marker writing: generates JPEG markers (except for RSTn, which is emitted
-  by the entropy encoder when needed).
-
-* Data destination manager: writes the output JPEG datastream to its final
-  destination (e.g., a file).  The destination manager supplied with the
-  library knows how to write to a stdio stream; for other behaviors, the
-  surrounding application may provide its own destination manager.
-
-* Memory manager: allocates and releases memory, controls virtual arrays
-  (with backing store management, where required).
-
-* Error handler: performs formatting and output of error and trace messages;
-  determines handling of nonfatal errors.  The surrounding application may
-  override some or all of this object's methods to change error handling.
-
-* Progress monitor: supports output of "percent-done" progress reports.
-  This object represents an optional callback to the surrounding application:
-  if wanted, it must be supplied by the application.
-
-The error handler, destination manager, and progress monitor objects are
-defined as separate objects in order to simplify application-specific
-customization of the JPEG library.  A surrounding application may override
-individual methods or supply its own all-new implementation of one of these
-objects.  The object interfaces for these objects are therefore treated as
-part of the application interface of the library, whereas the other objects
-are internal to the library.
-
-The error handler and memory manager are shared by JPEG compression and
-decompression; the progress monitor, if used, may be shared as well.
-
-
-*** Decompression object structure ***
-
-Here is a sketch of the logical structure of the JPEG decompression library:
-
-                                               |-- Entropy decoding
-                  |-- Coefficient controller --|
-                  |                            |-- Dequantize, Inverse DCT
-Main controller --|
-                  |                               |-- Upsampling
-                  |-- Postprocessing controller --|   |-- Colorspace conversion
-                                                  |-- Color quantization
-                                                  |-- Color precision reduction
-
-As before, this diagram also represents typical control flow.  The objects
-shown are:
-
-* Main controller: buffer controller for the subsampled-data buffer, which
-  holds the output of JPEG decompression proper.  This controller's primary
-  task is to feed the postprocessing procedure.  Some upsampling algorithms
-  may require context rows above and below the current row group; when this
-  is true, the main controller is responsible for managing its buffer so as
-  to make context rows available.  In the current design, the main buffer is
-  always a strip buffer; a full-image buffer is never required.
-
-* Coefficient controller: buffer controller for the DCT-coefficient data.
-  This controller handles MCU disassembly, including deletion of any dummy
-  DCT blocks at the right or bottom edge.  When reading a multiscan JPEG
-  file, this controller is responsible for buffering the full image.
-  (Buffering DCT coefficients, rather than samples, is necessary to support
-  progressive JPEG.)  The equivalent of one fully interleaved MCU row of
-  subsampled data is processed per call, even when the source JPEG file is
-  noninterleaved.
-
-* Entropy decoding: Read coded data from the data source module and perform
-  Huffman or arithmetic entropy decoding.  Works on one MCU per call.
-  For progressive JPEG decoding, the coefficient controller supplies the prior
-  coefficients of each MCU (initially all zeroes), which the entropy decoder
-  modifies in each scan.
-
-* Dequantization and inverse DCT: like it says.  Note that the coefficients
-  buffered by the coefficient controller have NOT been dequantized; we
-  merge dequantization and inverse DCT into a single step for speed reasons.
-  When scaled-down output is asked for, simplified DCT algorithms may be used
-  that emit only 1x1, 2x2, or 4x4 samples per DCT block, not the full 8x8.
-  Works on one DCT block at a time.
-
-* Postprocessing controller: buffer controller for the color quantization
-  input buffer, when quantization is in use.  (Without quantization, this
-  controller just calls the upsampler.)  For two-pass quantization, this
-  controller is responsible for buffering the full-image data.
-
-* Upsampling: restores chroma components to full size.  (May support more
-  general output rescaling, too.  Note that if undersized DCT outputs have
-  been emitted by the DCT module, this module must adjust so that properly
-  sized outputs are created.)  Works on one row group at a time.  This module
-  also calls the color conversion module, so its top level is effectively a
-  buffer controller for the upsampling->color conversion buffer.  However, in
-  all but the highest-quality operating modes, upsampling and color
-  conversion are likely to be merged into a single step.
-
-* Colorspace conversion: convert from JPEG color space to output color space,
-  and change data layout from separate component planes to pixel-interleaved.
-  Works on one pixel row at a time.
-
-* Color quantization: reduce the data to colormapped form, using either an
-  externally specified colormap or an internally generated one.  This module
-  is not used for full-color output.  Works on one pixel row at a time; may
-  require two passes to generate a color map.  Note that the output will
-  always be a single component representing colormap indexes.  In the current
-  design, the output values are JSAMPLEs, so an 8-bit compilation cannot
-  quantize to more than 256 colors.  This is unlikely to be a problem in
-  practice.
-
-* Color reduction: this module handles color precision reduction, e.g.,
-  generating 15-bit color (5 bits/primary) from JPEG's 24-bit output.
-  Not quite clear yet how this should be handled... should we merge it with
-  colorspace conversion???
-
-Note that some high-speed operating modes might condense the entire
-postprocessing sequence to a single module (upsample, color convert, and
-quantize in one step).
-
-In addition to the above objects, the decompression library includes these
-objects:
-
-* Master control: determines the number of passes required, controls overall
-  and per-pass initialization of the other modules.  This is subdivided into
-  input and output control: jdinput.c controls only input-side processing,
-  while jdmaster.c handles overall initialization and output-side control.
-
-* Marker reading: decodes JPEG markers (except for RSTn).
-
-* Data source manager: supplies the input JPEG datastream.  The source
-  manager supplied with the library knows how to read from a stdio stream;
-  for other behaviors, the surrounding application may provide its own source
-  manager.
-
-* Memory manager: same as for compression library.
-
-* Error handler: same as for compression library.
-
-* Progress monitor: same as for compression library.
-
-As with compression, the data source manager, error handler, and progress
-monitor are candidates for replacement by a surrounding application.
-
-
-*** Decompression input and output separation ***
-
-To support efficient incremental display of progressive JPEG files, the
-decompressor is divided into two sections that can run independently:
-
-1. Data input includes marker parsing, entropy decoding, and input into the
-   coefficient controller's DCT coefficient buffer.  Note that this
-   processing is relatively cheap and fast.
-
-2. Data output reads from the DCT coefficient buffer and performs the IDCT
-   and all postprocessing steps.
-
-For a progressive JPEG file, the data input processing is allowed to get
-arbitrarily far ahead of the data output processing.  (This occurs only
-if the application calls jpeg_consume_input(); otherwise input and output
-run in lockstep, since the input section is called only when the output
-section needs more data.)  In this way the application can avoid making
-extra display passes when data is arriving faster than the display pass
-can run.  Furthermore, it is possible to abort an output pass without
-losing anything, since the coefficient buffer is read-only as far as the
-output section is concerned.  See libjpeg.doc for more detail.
-
-A full-image coefficient array is only created if the JPEG file has multiple
-scans (or if the application specifies buffered-image mode anyway).  When
-reading a single-scan file, the coefficient controller normally creates only
-a one-MCU buffer, so input and output processing must run in lockstep in this
-case.  jpeg_consume_input() is effectively a no-op in this situation.
-
-The main impact of dividing the decompressor in this fashion is that we must
-be very careful with shared variables in the cinfo data structure.  Each
-variable that can change during the course of decompression must be
-classified as belonging to data input or data output, and each section must
-look only at its own variables.  For example, the data output section may not
-depend on any of the variables that describe the current scan in the JPEG
-file, because these may change as the data input section advances into a new
-scan.
-
-The progress monitor is (somewhat arbitrarily) defined to treat input of the
-file as one pass when buffered-image mode is not used, and to ignore data
-input work completely when buffered-image mode is used.  Note that the
-library has no reliable way to predict the number of passes when dealing
-with a progressive JPEG file, nor can it predict the number of output passes
-in buffered-image mode.  So the work estimate is inherently bogus anyway.
-
-No comparable division is currently made in the compression library, because
-there isn't any real need for it.
-
-
-*** Data formats ***
-
-Arrays of pixel sample values use the following data structure:
-
-    typedef something JSAMPLE;		a pixel component value, 0..MAXJSAMPLE
-    typedef JSAMPLE *JSAMPROW;		ptr to a row of samples
-    typedef JSAMPROW *JSAMPARRAY;	ptr to a list of rows
-    typedef JSAMPARRAY *JSAMPIMAGE;	ptr to a list of color-component arrays
-
-The basic element type JSAMPLE will typically be one of unsigned char,
-(signed) char, or short.  Short will be used if samples wider than 8 bits are
-to be supported (this is a compile-time option).  Otherwise, unsigned char is
-used if possible.  If the compiler only supports signed chars, then it is
-necessary to mask off the value when reading.  Thus, all reads of JSAMPLE
-values must be coded as "GETJSAMPLE(value)", where the macro will be defined
-as "((value) & 0xFF)" on signed-char machines and "((int) (value))" elsewhere.
-
-With these conventions, JSAMPLE values can be assumed to be >= 0.  This helps
-simplify correct rounding during downsampling, etc.  The JPEG standard's
-specification that sample values run from -128..127 is accommodated by
-subtracting 128 just as the sample value is copied into the source array for
-the DCT step (this will be an array of signed ints).  Similarly, during
-decompression the output of the IDCT step will be immediately shifted back to
-0..255.  (NB: different values are required when 12-bit samples are in use.
-The code is written in terms of MAXJSAMPLE and CENTERJSAMPLE, which will be
-defined as 255 and 128 respectively in an 8-bit implementation, and as 4095
-and 2048 in a 12-bit implementation.)
-
-We use a pointer per row, rather than a two-dimensional JSAMPLE array.  This
-choice costs only a small amount of memory and has several benefits:
-* Code using the data structure doesn't need to know the allocated width of
-  the rows.  This simplifies edge expansion/compression, since we can work
-  in an array that's wider than the logical picture width.
-* Indexing doesn't require multiplication; this is a performance win on many
-  machines.
-* Arrays with more than 64K total elements can be supported even on machines
-  where malloc() cannot allocate chunks larger than 64K.
-* The rows forming a component array may be allocated at different times
-  without extra copying.  This trick allows some speedups in smoothing steps
-  that need access to the previous and next rows.
-
-Note that each color component is stored in a separate array; we don't use the
-traditional layout in which the components of a pixel are stored together.
-This simplifies coding of modules that work on each component independently,
-because they don't need to know how many components there are.  Furthermore,
-we can read or write each component to a temporary file independently, which
-is helpful when dealing with noninterleaved JPEG files.
-
-In general, a specific sample value is accessed by code such as
-	GETJSAMPLE(image[colorcomponent][row][col])
-where col is measured from the image left edge, but row is measured from the
-first sample row currently in memory.  Either of the first two indexings can
-be precomputed by copying the relevant pointer.
-
-
-Since most image-processing applications prefer to work on images in which
-the components of a pixel are stored together, the data passed to or from the
-surrounding application uses the traditional convention: a single pixel is
-represented by N consecutive JSAMPLE values, and an image row is an array of
-(# of color components)*(image width) JSAMPLEs.  One or more rows of data can
-be represented by a pointer of type JSAMPARRAY in this scheme.  This scheme is
-converted to component-wise storage inside the JPEG library.  (Applications
-that want to skip JPEG preprocessing or postprocessing will have to contend
-with component-wise storage.)
-
-
-Arrays of DCT-coefficient values use the following data structure:
-
-    typedef short JCOEF;		a 16-bit signed integer
-    typedef JCOEF JBLOCK[DCTSIZE2];	an 8x8 block of coefficients
-    typedef JBLOCK *JBLOCKROW;		ptr to one horizontal row of 8x8 blocks
-    typedef JBLOCKROW *JBLOCKARRAY;	ptr to a list of such rows
-    typedef JBLOCKARRAY *JBLOCKIMAGE;	ptr to a list of color component arrays
-
-The underlying type is at least a 16-bit signed integer; while "short" is big
-enough on all machines of interest, on some machines it is preferable to use
-"int" for speed reasons, despite the storage cost.  Coefficients are grouped
-into 8x8 blocks (but we always use #defines DCTSIZE and DCTSIZE2 rather than
-"8" and "64").
-
-The contents of a coefficient block may be in either "natural" or zigzagged
-order, and may be true values or divided by the quantization coefficients,
-depending on where the block is in the processing pipeline.  In the current
-library, coefficient blocks are kept in natural order everywhere; the entropy
-codecs zigzag or dezigzag the data as it is written or read.  The blocks
-contain quantized coefficients everywhere outside the DCT/IDCT subsystems.
-(This latter decision may need to be revisited to support variable
-quantization a la JPEG Part 3.)
-
-Notice that the allocation unit is now a row of 8x8 blocks, corresponding to
-eight rows of samples.  Otherwise the structure is much the same as for
-samples, and for the same reasons.
-
-On machines where malloc() can't handle a request bigger than 64Kb, this data
-structure limits us to rows of less than 512 JBLOCKs, or a picture width of
-4000+ pixels.  This seems an acceptable restriction.
-
-
-On 80x86 machines, the bottom-level pointer types (JSAMPROW and JBLOCKROW)
-must be declared as "far" pointers, but the upper levels can be "near"
-(implying that the pointer lists are allocated in the DS segment).
-We use a #define symbol FAR, which expands to the "far" keyword when
-compiling on 80x86 machines and to nothing elsewhere.
-
-
-*** Suspendable processing ***
-
-In some applications it is desirable to use the JPEG library as an
-incremental, memory-to-memory filter.  In this situation the data source or
-destination may be a limited-size buffer, and we can't rely on being able to
-empty or refill the buffer at arbitrary times.  Instead the application would
-like to have control return from the library at buffer overflow/underrun, and
-then resume compression or decompression at a later time.
-
-This scenario is supported for simple cases.  (For anything more complex, we
-recommend that the application "bite the bullet" and develop real multitasking
-capability.)  The libjpeg.doc file goes into more detail about the usage and
-limitations of this capability; here we address the implications for library
-structure.
-
-The essence of the problem is that the entropy codec (coder or decoder) must
-be prepared to stop at arbitrary times.  In turn, the controllers that call
-the entropy codec must be able to stop before having produced or consumed all
-the data that they normally would handle in one call.  That part is reasonably
-straightforward: we make the controller call interfaces include "progress
-counters" which indicate the number of data chunks successfully processed, and
-we require callers to test the counter rather than just assume all of the data
-was processed.
-
-Rather than trying to restart at an arbitrary point, the current Huffman
-codecs are designed to restart at the beginning of the current MCU after a
-suspension due to buffer overflow/underrun.  At the start of each call, the
-codec's internal state is loaded from permanent storage (in the JPEG object
-structures) into local variables.  On successful completion of the MCU, the
-permanent state is updated.  (This copying is not very expensive, and may even
-lead to *improved* performance if the local variables can be registerized.)
-If a suspension occurs, the codec simply returns without updating the state,
-thus effectively reverting to the start of the MCU.  Note that this implies
-leaving some data unprocessed in the source/destination buffer (ie, the
-compressed partial MCU).  The data source/destination module interfaces are
-specified so as to make this possible.  This also implies that the data buffer
-must be large enough to hold a worst-case compressed MCU; a couple thousand
-bytes should be enough.
-
-In a successive-approximation AC refinement scan, the progressive Huffman
-decoder has to be able to undo assignments of newly nonzero coefficients if it
-suspends before the MCU is complete, since decoding requires distinguishing
-previously-zero and previously-nonzero coefficients.  This is a bit tedious
-but probably won't have much effect on performance.  Other variants of Huffman
-decoding need not worry about this, since they will just store the same values
-again if forced to repeat the MCU.
-
-This approach would probably not work for an arithmetic codec, since its
-modifiable state is quite large and couldn't be copied cheaply.  Instead it
-would have to suspend and resume exactly at the point of the buffer end.
-
-The JPEG marker reader is designed to cope with suspension at an arbitrary
-point.  It does so by backing up to the start of the marker parameter segment,
-so the data buffer must be big enough to hold the largest marker of interest.
-Again, a couple KB should be adequate.  (A special "skip" convention is used
-to bypass COM and APPn markers, so these can be larger than the buffer size
-without causing problems; otherwise a 64K buffer would be needed in the worst
-case.)
-
-The JPEG marker writer currently does *not* cope with suspension.  I feel that
-this is not necessary; it is much easier simply to require the application to
-ensure there is enough buffer space before starting.  (An empty 2K buffer is
-more than sufficient for the header markers; and ensuring there are a dozen or
-two bytes available before calling jpeg_finish_compress() will suffice for the
-trailer.)  This would not work for writing multi-scan JPEG files, but
-we simply do not intend to support that capability with suspension.
-
-
-*** Memory manager services ***
-
-The JPEG library's memory manager controls allocation and deallocation of
-memory, and it manages large "virtual" data arrays on machines where the
-operating system does not provide virtual memory.  Note that the same
-memory manager serves both compression and decompression operations.
-
-In all cases, allocated objects are tied to a particular compression or
-decompression master record, and they will be released when that master
-record is destroyed.
-
-The memory manager does not provide explicit deallocation of objects.
-Instead, objects are created in "pools" of free storage, and a whole pool
-can be freed at once.  This approach helps prevent storage-leak bugs, and
-it speeds up operations whenever malloc/free are slow (as they often are).
-The pools can be regarded as lifetime identifiers for objects.  Two
-pools/lifetimes are defined:
-  * JPOOL_PERMANENT	lasts until master record is destroyed
-  * JPOOL_IMAGE		lasts until done with image (JPEG datastream)
-Permanent lifetime is used for parameters and tables that should be carried
-across from one datastream to another; this includes all application-visible
-parameters.  Image lifetime is used for everything else.  (A third lifetime,
-JPOOL_PASS = one processing pass, was originally planned.  However it was
-dropped as not being worthwhile.  The actual usage patterns are such that the
-peak memory usage would be about the same anyway; and having per-pass storage
-substantially complicates the virtual memory allocation rules --- see below.)
-
-The memory manager deals with three kinds of object:
-1. "Small" objects.  Typically these require no more than 10K-20K total.
-2. "Large" objects.  These may require tens to hundreds of K depending on
-   image size.  Semantically they behave the same as small objects, but we
-   distinguish them for two reasons:
-     * On MS-DOS machines, large objects are referenced by FAR pointers,
-       small objects by NEAR pointers.
-     * Pool allocation heuristics may differ for large and small objects.
-   Note that individual "large" objects cannot exceed the size allowed by
-   type size_t, which may be 64K or less on some machines.
-3. "Virtual" objects.  These are large 2-D arrays of JSAMPLEs or JBLOCKs
-   (typically large enough for the entire image being processed).  The
-   memory manager provides stripwise access to these arrays.  On machines
-   without virtual memory, the rest of the array may be swapped out to a
-   temporary file.
-
-(Note: JSAMPARRAY and JBLOCKARRAY data structures are a combination of large
-objects for the data proper and small objects for the row pointers.  For
-convenience and speed, the memory manager provides single routines to create
-these structures.  Similarly, virtual arrays include a small control block
-and a JSAMPARRAY or JBLOCKARRAY working buffer, all created with one call.)
-
-In the present implementation, virtual arrays are only permitted to have image
-lifespan.  (Permanent lifespan would not be reasonable, and pass lifespan is
-not very useful since a virtual array's raison d'etre is to store data for
-multiple passes through the image.)  We also expect that only "small" objects
-will be given permanent lifespan, though this restriction is not required by
-the memory manager.
-
-In a non-virtual-memory machine, some performance benefit can be gained by
-making the in-memory buffers for virtual arrays be as large as possible.
-(For small images, the buffers might fit entirely in memory, so blind
-swapping would be very wasteful.)  The memory manager will adjust the height
-of the buffers to fit within a prespecified maximum memory usage.  In order
-to do this in a reasonably optimal fashion, the manager needs to allocate all
-of the virtual arrays at once.  Therefore, there isn't a one-step allocation
-routine for virtual arrays; instead, there is a "request" routine that simply
-allocates the control block, and a "realize" routine (called just once) that
-determines space allocation and creates all of the actual buffers.  The
-realize routine must allow for space occupied by non-virtual large objects.
-(We don't bother to factor in the space needed for small objects, on the
-grounds that it isn't worth the trouble.)
-
-To support all this, we establish the following protocol for doing business
-with the memory manager:
-  1. Modules must request virtual arrays (which may have only image lifespan)
-     during the initial setup phase, i.e., in their jinit_xxx routines.
-  2. All "large" objects (including JSAMPARRAYs and JBLOCKARRAYs) must also be
-     allocated during initial setup.
-  3. realize_virt_arrays will be called at the completion of initial setup.
-     The above conventions ensure that sufficient information is available
-     for it to choose a good size for virtual array buffers.
-Small objects of any lifespan may be allocated at any time.  We expect that
-the total space used for small objects will be small enough to be negligible
-in the realize_virt_arrays computation.
-
-In a virtual-memory machine, we simply pretend that the available space is
-infinite, thus causing realize_virt_arrays to decide that it can allocate all
-the virtual arrays as full-size in-memory buffers.  The overhead of the
-virtual-array access protocol is very small when no swapping occurs.
-
-A virtual array can be specified to be "pre-zeroed"; when this flag is set,
-never-yet-written sections of the array are set to zero before being made
-available to the caller.  If this flag is not set, never-written sections
-of the array contain garbage.  (This feature exists primarily because the
-equivalent logic would otherwise be needed in jdcoefct.c for progressive
-JPEG mode; we may as well make it available for possible other uses.)
-
-The first write pass on a virtual array is required to occur in top-to-bottom
-order; read passes, as well as any write passes after the first one, may
-access the array in any order.  This restriction exists partly to simplify
-the virtual array control logic, and partly because some file systems may not
-support seeking beyond the current end-of-file in a temporary file.  The main
-implication of this restriction is that rearrangement of rows (such as
-converting top-to-bottom data order to bottom-to-top) must be handled while
-reading data out of the virtual array, not while putting it in.
-
-
-*** Memory manager internal structure ***
-
-To isolate system dependencies as much as possible, we have broken the
-memory manager into two parts.  There is a reasonably system-independent
-"front end" (jmemmgr.c) and a "back end" that contains only the code
-likely to change across systems.  All of the memory management methods
-outlined above are implemented by the front end.  The back end provides
-the following routines for use by the front end (none of these routines
-are known to the rest of the JPEG code):
-
-jpeg_mem_init, jpeg_mem_term	system-dependent initialization/shutdown
-
-jpeg_get_small, jpeg_free_small	interface to malloc and free library routines
-				(or their equivalents)
-
-jpeg_get_large, jpeg_free_large	interface to FAR malloc/free in MSDOS machines;
-				else usually the same as
-				jpeg_get_small/jpeg_free_small
-
-jpeg_mem_available		estimate available memory
-
-jpeg_open_backing_store		create a backing-store object
-
-read_backing_store,		manipulate a backing-store object
-write_backing_store,
-close_backing_store
-
-On some systems there will be more than one type of backing-store object
-(specifically, in MS-DOS a backing store file might be an area of extended
-memory as well as a disk file).  jpeg_open_backing_store is responsible for
-choosing how to implement a given object.  The read/write/close routines
-are method pointers in the structure that describes a given object; this
-lets them be different for different object types.
-
-It may be necessary to ensure that backing store objects are explicitly
-released upon abnormal program termination.  For example, MS-DOS won't free
-extended memory by itself.  To support this, we will expect the main program
-or surrounding application to arrange to call self_destruct (typically via
-jpeg_destroy) upon abnormal termination.  This may require a SIGINT signal
-handler or equivalent.  We don't want to have the back end module install its
-own signal handler, because that would pre-empt the surrounding application's
-ability to control signal handling.
-
-The IJG distribution includes several memory manager back end implementations.
-Usually the same back end should be suitable for all applications on a given
-system, but it is possible for an application to supply its own back end at
-need.
-
-
-*** Implications of DNL marker ***
-
-Some JPEG files may use a DNL marker to postpone definition of the image
-height (this would be useful for a fax-like scanner's output, for instance).
-In these files the SOF marker claims the image height is 0, and you only
-find out the true image height at the end of the first scan.
-
-We could read these files as follows:
-1. Upon seeing zero image height, replace it by 65535 (the maximum allowed).
-2. When the DNL is found, update the image height in the global image
-   descriptor.
-This implies that control modules must avoid making copies of the image
-height, and must re-test for termination after each MCU row.  This would
-be easy enough to do.
-
-In cases where image-size data structures are allocated, this approach will
-result in very inefficient use of virtual memory or much-larger-than-necessary
-temporary files.  This seems acceptable for something that probably won't be a
-mainstream usage.  People might have to forgo use of memory-hogging options
-(such as two-pass color quantization or noninterleaved JPEG files) if they
-want efficient conversion of such files.  (One could improve efficiency by
-demanding a user-supplied upper bound for the height, less than 65536; in most
-cases it could be much less.)
-
-The standard also permits the SOF marker to overestimate the image height,
-with a DNL to give the true, smaller height at the end of the first scan.
-This would solve the space problems if the overestimate wasn't too great.
-However, it implies that you don't even know whether DNL will be used.
-
-This leads to a couple of very serious objections:
-1. Testing for a DNL marker must occur in the inner loop of the decompressor's
-   Huffman decoder; this implies a speed penalty whether the feature is used
-   or not.
-2. There is no way to hide the last-minute change in image height from an
-   application using the decoder.  Thus *every* application using the IJG
-   library would suffer a complexity penalty whether it cared about DNL or
-   not.
-We currently do not support DNL because of these problems.
-
-A different approach is to insist that DNL-using files be preprocessed by a
-separate program that reads ahead to the DNL, then goes back and fixes the SOF
-marker.  This is a much simpler solution and is probably far more efficient.
-Even if one wants piped input, buffering the first scan of the JPEG file needs
-a lot smaller temp file than is implied by the maximum-height method.  For
-this approach we'd simply treat DNL as a no-op in the decompressor (at most,
-check that it matches the SOF image height).
-
-We will not worry about making the compressor capable of outputting DNL.
-Something similar to the first scheme above could be applied if anyone ever
-wants to make that work.
diff --git a/jpeg/transupp.h b/jpeg/transupp.h
new file mode 100644
index 000000000000..7c16c19c440d
--- /dev/null
+++ b/jpeg/transupp.h
@@ -0,0 +1,210 @@
+/*
+ * transupp.h
+ *
+ * Copyright (C) 1997-2009, Thomas G. Lane, Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains declarations for image transformation routines and
+ * other utility code used by the jpegtran sample application.  These are
+ * NOT part of the core JPEG library.  But we keep these routines separate
+ * from jpegtran.c to ease the task of maintaining jpegtran-like programs
+ * that have other user interfaces.
+ *
+ * NOTE: all the routines declared here have very specific requirements
+ * about when they are to be executed during the reading and writing of the
+ * source and destination files.  See the comments in transupp.c, or see
+ * jpegtran.c for an example of correct usage.
+ */
+
+/* If you happen not to want the image transform support, disable it here */
+#ifndef TRANSFORMS_SUPPORTED
+#define TRANSFORMS_SUPPORTED 1		/* 0 disables transform code */
+#endif
+
+/*
+ * Although rotating and flipping data expressed as DCT coefficients is not
+ * hard, there is an asymmetry in the JPEG format specification for images
+ * whose dimensions aren't multiples of the iMCU size.  The right and bottom
+ * image edges are padded out to the next iMCU boundary with junk data; but
+ * no padding is possible at the top and left edges.  If we were to flip
+ * the whole image including the pad data, then pad garbage would become
+ * visible at the top and/or left, and real pixels would disappear into the
+ * pad margins --- perhaps permanently, since encoders & decoders may not
+ * bother to preserve DCT blocks that appear to be completely outside the
+ * nominal image area.  So, we have to exclude any partial iMCUs from the
+ * basic transformation.
+ *
+ * Transpose is the only transformation that can handle partial iMCUs at the
+ * right and bottom edges completely cleanly.  flip_h can flip partial iMCUs
+ * at the bottom, but leaves any partial iMCUs at the right edge untouched.
+ * Similarly flip_v leaves any partial iMCUs at the bottom edge untouched.
+ * The other transforms are defined as combinations of these basic transforms
+ * and process edge blocks in a way that preserves the equivalence.
+ *
+ * The "trim" option causes untransformable partial iMCUs to be dropped;
+ * this is not strictly lossless, but it usually gives the best-looking
+ * result for odd-size images.  Note that when this option is active,
+ * the expected mathematical equivalences between the transforms may not hold.
+ * (For example, -rot 270 -trim trims only the bottom edge, but -rot 90 -trim
+ * followed by -rot 180 -trim trims both edges.)
+ *
+ * We also offer a lossless-crop option, which discards data outside a given
+ * image region but losslessly preserves what is inside.  Like the rotate and
+ * flip transforms, lossless crop is restricted by the JPEG format: the upper
+ * left corner of the selected region must fall on an iMCU boundary.  If this
+ * does not hold for the given crop parameters, we silently move the upper left
+ * corner up and/or left to make it so, simultaneously increasing the region
+ * dimensions to keep the lower right crop corner unchanged.  (Thus, the
+ * output image covers at least the requested region, but may cover more.)
+ *
+ * We also provide a lossless-resize option, which is kind of a lossless-crop
+ * operation in the DCT coefficient block domain - it discards higher-order
+ * coefficients and losslessly preserves lower-order coefficients of a
+ * sub-block.
+ *
+ * Rotate/flip transform, resize, and crop can be requested together in a
+ * single invocation.  The crop is applied last --- that is, the crop region
+ * is specified in terms of the destination image after transform/resize.
+ *
+ * We also offer a "force to grayscale" option, which simply discards the
+ * chrominance channels of a YCbCr image.  This is lossless in the sense that
+ * the luminance channel is preserved exactly.  It's not the same kind of
+ * thing as the rotate/flip transformations, but it's convenient to handle it
+ * as part of this package, mainly because the transformation routines have to
+ * be aware of the option to know how many components to work on.
+ */
+
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jtransform_parse_crop_spec	jTrParCrop
+#define jtransform_request_workspace	jTrRequest
+#define jtransform_adjust_parameters	jTrAdjust
+#define jtransform_execute_transform	jTrExec
+#define jtransform_perfect_transform	jTrPerfect
+#define jcopy_markers_setup		jCMrkSetup
+#define jcopy_markers_execute		jCMrkExec
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+
+/*
+ * Codes for supported types of image transformations.
+ */
+
+typedef enum {
+	JXFORM_NONE,		/* no transformation */
+	JXFORM_FLIP_H,		/* horizontal flip */
+	JXFORM_FLIP_V,		/* vertical flip */
+	JXFORM_TRANSPOSE,	/* transpose across UL-to-LR axis */
+	JXFORM_TRANSVERSE,	/* transpose across UR-to-LL axis */
+	JXFORM_ROT_90,		/* 90-degree clockwise rotation */
+	JXFORM_ROT_180,		/* 180-degree rotation */
+	JXFORM_ROT_270		/* 270-degree clockwise (or 90 ccw) */
+} JXFORM_CODE;
+
+/*
+ * Codes for crop parameters, which can individually be unspecified,
+ * positive, or negative.  (Negative width or height makes no sense, though.)
+ */
+
+typedef enum {
+	JCROP_UNSET,		/* parameter was not specified */
+	JCROP_POS,		/* parameter specified, positive */
+	JCROP_NEG		/* parameter specified, negative */
+} JCROP_CODE;
+
+/*
+ * Transform parameters struct.
+ * NB: application must not change any elements of this struct after
+ * calling jtransform_request_workspace.
+ */
+
+typedef struct {
+  /* Options: set by caller */
+  JXFORM_CODE transform;	/* image transform operator */
+  boolean perfect;		/* if TRUE, fail if partial MCUs are requested */
+  boolean trim;			/* if TRUE, trim partial MCUs as needed */
+  boolean force_grayscale;	/* if TRUE, convert color image to grayscale */
+  boolean crop;			/* if TRUE, crop source image */
+
+  /* Crop parameters: application need not set these unless crop is TRUE.
+   * These can be filled in by jtransform_parse_crop_spec().
+   */
+  JDIMENSION crop_width;	/* Width of selected region */
+  JCROP_CODE crop_width_set;	/* (unset/positive/negative state of width) */
+  JDIMENSION crop_height;	/* Height of selected region */
+  JCROP_CODE crop_height_set;	/* (unset/positive/negative state of height) */
+  JDIMENSION crop_xoffset;	/* X offset of selected region */
+  JCROP_CODE crop_xoffset_set;	/* (negative measures from right edge) */
+  JDIMENSION crop_yoffset;	/* Y offset of selected region */
+  JCROP_CODE crop_yoffset_set;	/* (negative measures from bottom edge) */
+
+  /* Internal workspace: caller should not touch these */
+  int num_components;		/* # of components in workspace */
+  jvirt_barray_ptr * workspace_coef_arrays; /* workspace for transformations */
+  JDIMENSION output_width;	/* cropped destination dimensions */
+  JDIMENSION output_height;	/* (height counterpart of output_width) */
+  JDIMENSION x_crop_offset;	/* destination crop offsets measured in iMCUs */
+  JDIMENSION y_crop_offset;	/* (vertical counterpart of x_crop_offset) */
+  int iMCU_sample_width;	/* destination iMCU size */
+  int iMCU_sample_height;	/* (vertical counterpart of iMCU_sample_width) */
+} jpeg_transform_info;
+
+
+#if TRANSFORMS_SUPPORTED
+
+/* Parse a crop specification (written in X11 geometry style) */
+EXTERN(boolean) jtransform_parse_crop_spec
+	JPP((jpeg_transform_info *info, const char *spec));
+/* Request any required workspace */
+EXTERN(boolean) jtransform_request_workspace
+	JPP((j_decompress_ptr srcinfo, jpeg_transform_info *info));
+/* Adjust output image parameters */
+EXTERN(jvirt_barray_ptr *) jtransform_adjust_parameters
+	JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+	     jvirt_barray_ptr *src_coef_arrays,
+	     jpeg_transform_info *info));
+/* Execute the actual transformation, if any */
+EXTERN(void) jtransform_execute_transform
+	JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+	     jvirt_barray_ptr *src_coef_arrays,
+	     jpeg_transform_info *info));
+/* Determine whether lossless transformation is perfectly
+ * possible for a specified image and transformation.
+ */
+EXTERN(boolean) jtransform_perfect_transform
+	JPP((JDIMENSION image_width, JDIMENSION image_height,
+	     int MCU_width, int MCU_height,
+	     JXFORM_CODE transform));
+
+/* jtransform_execute_transform used to be called
+ * jtransform_execute_transformation, but some compilers complain about
+ * routine names that long.  This macro is here to avoid breaking any
+ * old source code that uses the original name...
+ */
+#define jtransform_execute_transformation	jtransform_execute_transform
+
+#endif /* TRANSFORMS_SUPPORTED */
+
+
+/*
+ * Support for copying optional markers from source to destination file.
+ */
+
+typedef enum {
+	JCOPYOPT_NONE,		/* copy no optional markers */
+	JCOPYOPT_COMMENTS,	/* copy only comment (COM) markers */
+	JCOPYOPT_ALL		/* copy all optional markers */
+} JCOPY_OPTION;
+
+#define JCOPYOPT_DEFAULT  JCOPYOPT_COMMENTS	/* recommended default */
+
+/* Set up decompression object to save desired markers in memory */
+EXTERN(void) jcopy_markers_setup
+	JPP((j_decompress_ptr srcinfo, JCOPY_OPTION option));
+/* Copy markers saved in the given source object to the destination object */
+EXTERN(void) jcopy_markers_execute
+	JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+	     JCOPY_OPTION option));
diff --git a/jpeg/usage.doc b/jpeg/usage.doc
deleted file mode 100644
index 8c4970af0518..000000000000
--- a/jpeg/usage.doc
+++ /dev/null
@@ -1,562 +0,0 @@
-USAGE instructions for the Independent JPEG Group's JPEG software
-=================================================================
-
-This file describes usage of the JPEG conversion programs cjpeg and djpeg,
-as well as the utility programs jpegtran, rdjpgcom and wrjpgcom.  (See
-the other documentation files if you wish to use the JPEG library within
-your own programs.)
-
-If you are on a Unix machine you may prefer to read the Unix-style manual
-pages in files cjpeg.1, djpeg.1, jpegtran.1, rdjpgcom.1, wrjpgcom.1.
-
-
-INTRODUCTION
-
-These programs implement JPEG image compression and decompression.  JPEG
-(pronounced "jay-peg") is a standardized compression method for full-color
-and gray-scale images.  JPEG is designed to handle "real-world" scenes,
-for example scanned photographs.  Cartoons, line drawings, and other
-non-realistic images are not JPEG's strong suit; on that sort of material
-you may get poor image quality and/or little compression.
-
-JPEG is lossy, meaning that the output image is not necessarily identical to
-the input image.  Hence you should not use JPEG if you have to have identical
-output bits.  However, on typical real-world images, very good compression
-levels can be obtained with no visible change, and amazingly high compression
-is possible if you can tolerate a low-quality image.  You can trade off image
-quality against file size by adjusting the compressor's "quality" setting.
-
-
-GENERAL USAGE
-
-We provide two programs, cjpeg to compress an image file into JPEG format,
-and djpeg to decompress a JPEG file back into a conventional image format.
-
-On Unix-like systems, you say:
-	cjpeg [switches] [imagefile] >jpegfile
-or
-	djpeg [switches] [jpegfile]  >imagefile
-The programs read the specified input file, or standard input if none is
-named.  They always write to standard output (with trace/error messages to
-standard error).  These conventions are handy for piping images between
-programs.
-
-On most non-Unix systems, you say:
-	cjpeg [switches] imagefile jpegfile
-or
-	djpeg [switches] jpegfile  imagefile
-i.e., both the input and output files are named on the command line.  This
-style is a little more foolproof, and it loses no functionality if you don't
-have pipes.  (You can get this style on Unix too, if you prefer, by defining
-TWO_FILE_COMMANDLINE when you compile the programs; see install.doc.)
-
-You can also say:
-	cjpeg [switches] -outfile jpegfile  imagefile
-or
-	djpeg [switches] -outfile imagefile  jpegfile
-This syntax works on all systems, so it is useful for scripts.
-
-The currently supported image file formats are: PPM (PBMPLUS color format),
-PGM (PBMPLUS gray-scale format), BMP, Targa, and RLE (Utah Raster Toolkit
-format).  (RLE is supported only if the URT library is available.)
-cjpeg recognizes the input image format automatically, with the exception
-of some Targa-format files.  You have to tell djpeg which format to generate.
-
-JPEG files are in the defacto standard JFIF file format.  There are other,
-less widely used JPEG-based file formats, but we don't support them.
-
-All switch names may be abbreviated; for example, -grayscale may be written
--gray or -gr.  Most of the "basic" switches can be abbreviated to as little as
-one letter.  Upper and lower case are equivalent (-BMP is the same as -bmp).
-British spellings are also accepted (e.g., -greyscale), though for brevity
-these are not mentioned below.
-
-
-CJPEG DETAILS
-
-The basic command line switches for cjpeg are:
-
-	-quality N	Scale quantization tables to adjust image quality.
-			Quality is 0 (worst) to 100 (best); default is 75.
-			(See below for more info.)
-
-	-grayscale	Create monochrome JPEG file from color input.
-			Be sure to use this switch when compressing a grayscale
-			BMP file, because cjpeg isn't bright enough to notice
-			whether a BMP file uses only shades of gray.  By
-			saying -grayscale, you'll get a smaller JPEG file that
-			takes less time to process.
-
-	-optimize	Perform optimization of entropy encoding parameters.
-			Without this, default encoding parameters are used.
-			-optimize usually makes the JPEG file a little smaller,
-			but cjpeg runs somewhat slower and needs much more
-			memory.  Image quality and speed of decompression are
-			unaffected by -optimize.
-
-	-progressive	Create progressive JPEG file (see below).
-
-	-targa		Input file is Targa format.  Targa files that contain
-			an "identification" field will not be automatically
-			recognized by cjpeg; for such files you must specify
-			-targa to make cjpeg treat the input as Targa format.
-			For most Targa files, you won't need this switch.
-
-The -quality switch lets you trade off compressed file size against quality of
-the reconstructed image: the higher the quality setting, the larger the JPEG
-file, and the closer the output image will be to the original input.  Normally
-you want to use the lowest quality setting (smallest file) that decompresses
-into something visually indistinguishable from the original image.  For this
-purpose the quality setting should be between 50 and 95; the default of 75 is
-often about right.  If you see defects at -quality 75, then go up 5 or 10
-counts at a time until you are happy with the output image.  (The optimal
-setting will vary from one image to another.)
-
--quality 100 will generate a quantization table of all 1's, minimizing loss
-in the quantization step (but there is still information loss in subsampling,
-as well as roundoff error).  This setting is mainly of interest for
-experimental purposes.  Quality values above about 95 are NOT recommended for
-normal use; the compressed file size goes up dramatically for hardly any gain
-in output image quality.
-
-In the other direction, quality values below 50 will produce very small files
-of low image quality.  Settings around 5 to 10 might be useful in preparing an
-index of a large image library, for example.  Try -quality 2 (or so) for some
-amusing Cubist effects.  (Note: quality values below about 25 generate 2-byte
-quantization tables, which are considered optional in the JPEG standard.
-cjpeg emits a warning message when you give such a quality value, because some
-other JPEG programs may be unable to decode the resulting file.  Use -baseline
-if you need to ensure compatibility at low quality values.)
-
-The -progressive switch creates a "progressive JPEG" file.  In this type of
-JPEG file, the data is stored in multiple scans of increasing quality.  If the
-file is being transmitted over a slow communications link, the decoder can use
-the first scan to display a low-quality image very quickly, and can then
-improve the display with each subsequent scan.  The final image is exactly
-equivalent to a standard JPEG file of the same quality setting, and the total
-file size is about the same --- often a little smaller.  CAUTION: progressive
-JPEG is not yet widely implemented, so many decoders will be unable to view a
-progressive JPEG file at all.
-
-Switches for advanced users:
-
-	-dct int	Use integer DCT method (default).
-	-dct fast	Use fast integer DCT (less accurate).
-	-dct float	Use floating-point DCT method.
-			The float method is very slightly more accurate than
-			the int method, but is much slower unless your machine
-			has very fast floating-point hardware.  Also note that
-			results of the floating-point method may vary slightly
-			across machines, while the integer methods should give
-			the same results everywhere.  The fast integer method
-			is much less accurate than the other two.
-
-	-restart N	Emit a JPEG restart marker every N MCU rows, or every
-			N MCU blocks if "B" is attached to the number.
-			-restart 0 (the default) means no restart markers.
-
-	-smooth N	Smooth the input image to eliminate dithering noise.
-			N, ranging from 1 to 100, indicates the strength of
-			smoothing.  0 (the default) means no smoothing.
-
-	-maxmemory N	Set limit for amount of memory to use in processing
-			large images.  Value is in thousands of bytes, or
-			millions of bytes if "M" is attached to the number.
-			For example, -max 4m selects 4000000 bytes.  If more
-			space is needed, temporary files will be used.
-
-	-verbose	Enable debug printout.  More -v's give more printout.
-	or  -debug	Also, version information is printed at startup.
-
-The -restart option inserts extra markers that allow a JPEG decoder to
-resynchronize after a transmission error.  Without restart markers, any damage
-to a compressed file will usually ruin the image from the point of the error
-to the end of the image; with restart markers, the damage is usually confined
-to the portion of the image up to the next restart marker.  Of course, the
-restart markers occupy extra space.  We recommend -restart 1 for images that
-will be transmitted across unreliable networks such as Usenet.
-
-The -smooth option filters the input to eliminate fine-scale noise.  This is
-often useful when converting dithered images to JPEG: a moderate smoothing
-factor of 10 to 50 gets rid of dithering patterns in the input file, resulting
-in a smaller JPEG file and a better-looking image.  Too large a smoothing
-factor will visibly blur the image, however.
-
-Switches for wizards:
-
-	-baseline	Force baseline-compatible quantization tables to be
-			generated.  This clamps quantization values to 8 bits
-			even at low quality settings.  (This switch is poorly
-			named, since it does not ensure that the output is
-			actually baseline JPEG.  For example, you can use
-			-baseline and -progressive together.)
-
-	-qtables file	Use the quantization tables given in the specified
-			text file.
-
-	-qslots N[,...] Select which quantization table to use for each color
-			component.
-
-	-sample HxV[,...]  Set JPEG sampling factors for each color component.
-
-	-scans file	Use the scan script given in the specified text file.
-
-The "wizard" switches are intended for experimentation with JPEG.  If you
-don't know what you are doing, DON'T USE THEM.  These switches are documented
-further in the file wizard.doc.
-
-
-DJPEG DETAILS
-
-The basic command line switches for djpeg are:
-
-	-colors N	Reduce image to at most N colors.  This reduces the
-	or -quantize N	number of colors used in the output image, so that it
-			can be displayed on a colormapped display or stored in
-			a colormapped file format.  For example, if you have
-			an 8-bit display, you'd need to reduce to 256 or fewer
-			colors.  (-colors is the recommended name, -quantize
-			is provided only for backwards compatibility.)
-
-	-fast		Select recommended processing options for fast, low
-			quality output.  (The default options are chosen for
-			highest quality output.)  Currently, this is equivalent
-			to "-dct fast -nosmooth -onepass -dither ordered".
-
-	-grayscale	Force gray-scale output even if JPEG file is color.
-			Useful for viewing on monochrome displays; also,
-			djpeg runs noticeably faster in this mode.
-
-	-scale M/N	Scale the output image by a factor M/N.  Currently
-			the scale factor must be 1/1, 1/2, 1/4, or 1/8.
-			Scaling is handy if the image is larger than your
-			screen; also, djpeg runs much faster when scaling
-			down the output.
-
-	-bmp		Select BMP output format (Windows flavor).  8-bit
-			colormapped format is emitted if -colors or -grayscale
-			is specified, or if the JPEG file is gray-scale;
-			otherwise, 24-bit full-color format is emitted.
-
-	-gif		Select GIF output format.  Since GIF does not support
-			more than 256 colors, -colors 256 is assumed (unless
-			you specify a smaller number of colors).  If you
-			specify -fast, the default number of colors is 216.
-
-	-os2		Select BMP output format (OS/2 1.x flavor).  8-bit
-			colormapped format is emitted if -colors or -grayscale
-			is specified, or if the JPEG file is gray-scale;
-			otherwise, 24-bit full-color format is emitted.
-
-	-pnm		Select PBMPLUS (PPM/PGM) output format (this is the
-			default format).  PGM is emitted if the JPEG file is
-			gray-scale or if -grayscale is specified; otherwise
-			PPM is emitted.
-
-	-rle		Select RLE output format.  (Requires URT library.)
-
-	-targa		Select Targa output format.  Gray-scale format is
-			emitted if the JPEG file is gray-scale or if
-			-grayscale is specified; otherwise, colormapped format
-			is emitted if -colors is specified; otherwise, 24-bit
-			full-color format is emitted.
-
-Switches for advanced users:
-
-	-dct int	Use integer DCT method (default).
-	-dct fast	Use fast integer DCT (less accurate).
-	-dct float	Use floating-point DCT method.
-			The float method is very slightly more accurate than
-			the int method, but is much slower unless your machine
-			has very fast floating-point hardware.  Also note that
-			results of the floating-point method may vary slightly
-			across machines, while the integer methods should give
-			the same results everywhere.  The fast integer method
-			is much less accurate than the other two.
-
-	-dither fs	Use Floyd-Steinberg dithering in color quantization.
-	-dither ordered	Use ordered dithering in color quantization.
-	-dither none	Do not use dithering in color quantization.
-			By default, Floyd-Steinberg dithering is applied when
-			quantizing colors; this is slow but usually produces
-			the best results.  Ordered dither is a compromise
-			between speed and quality; no dithering is fast but
-			usually looks awful.  Note that these switches have
-			no effect unless color quantization is being done.
-			Ordered dither is only available in -onepass mode.
-
-	-map FILE	Quantize to the colors used in the specified image
-			file.  This is useful for producing multiple files
-			with identical color maps, or for forcing a predefined
-			set of colors to be used.  The FILE must be a GIF
-			or PPM file.  This option overrides -colors and
-			-onepass.
-
-	-nosmooth	Use a faster, lower-quality upsampling routine.
-
-	-onepass	Use one-pass instead of two-pass color quantization.
-			The one-pass method is faster and needs less memory,
-			but it produces a lower-quality image.  -onepass is
-			ignored unless you also say -colors N.  Also,
-			the one-pass method is always used for gray-scale
-			output (the two-pass method is no improvement then).
-
-	-maxmemory N	Set limit for amount of memory to use in processing
-			large images.  Value is in thousands of bytes, or
-			millions of bytes if "M" is attached to the number.
-			For example, -max 4m selects 4000000 bytes.  If more
-			space is needed, temporary files will be used.
-
-	-verbose	Enable debug printout.  More -v's give more printout.
-	or  -debug	Also, version information is printed at startup.
-
-
-HINTS FOR CJPEG
-
-Color GIF files are not the ideal input for JPEG; JPEG is really intended for
-compressing full-color (24-bit) images.  In particular, don't try to convert
-cartoons, line drawings, and other images that have only a few distinct
-colors.  GIF works great on these, JPEG does not.  If you want to convert a
-GIF to JPEG, you should experiment with cjpeg's -quality and -smooth options
-to get a satisfactory conversion.  -smooth 10 or so is often helpful.
-
-Avoid running an image through a series of JPEG compression/decompression
-cycles.  Image quality loss will accumulate; after ten or so cycles the image
-may be noticeably worse than it was after one cycle.  It's best to use a
-lossless format while manipulating an image, then convert to JPEG format when
-you are ready to file the image away.
-
-The -optimize option to cjpeg is worth using when you are making a "final"
-version for posting or archiving.  It's also a win when you are using low
-quality settings to make very small JPEG files; the percentage improvement
-is often a lot more than it is on larger files.  (At present, -optimize
-mode is always selected when generating progressive JPEG files.)
-
-GIF input files are no longer supported, to avoid the Unisys LZW patent.
-Use a Unisys-licensed program if you need to read a GIF file.  (Conversion
-of GIF files to JPEG is usually a bad idea anyway.)
-
-
-HINTS FOR DJPEG
-
-To get a quick preview of an image, use the -grayscale and/or -scale switches.
-"-grayscale -scale 1/8" is the fastest case.
-
-Several options are available that trade off image quality to gain speed.
-"-fast" turns on the recommended settings.
-
-"-dct fast" and/or "-nosmooth" gain speed at a small sacrifice in quality.
-When producing a color-quantized image, "-onepass -dither ordered" is fast but
-much lower quality than the default behavior.  "-dither none" may give
-acceptable results in two-pass mode, but is seldom tolerable in one-pass mode.
-
-If you are fortunate enough to have very fast floating point hardware,
-"-dct float" may be even faster than "-dct fast".  But on most machines
-"-dct float" is slower than "-dct int"; in this case it is not worth using,
-because its theoretical accuracy advantage is too small to be significant
-in practice.
-
-Two-pass color quantization requires a good deal of memory; on MS-DOS machines
-it may run out of memory even with -maxmemory 0.  In that case you can still
-decompress, with some loss of image quality, by specifying -onepass for
-one-pass quantization.
-
-To avoid the Unisys LZW patent, djpeg produces uncompressed GIF files.  These
-are larger than they should be, but are readable by standard GIF decoders.
-
-
-HINTS FOR BOTH PROGRAMS
-
-If more space is needed than will fit in the available main memory (as
-determined by -maxmemory), temporary files will be used.  (MS-DOS versions
-will try to get extended or expanded memory first.)  The temporary files are
-often rather large: in typical cases they occupy three bytes per pixel, for
-example 3*800*600 = 1.44Mb for an 800x600 image.  If you don't have enough
-free disk space, leave out -progressive and -optimize (for cjpeg) or specify
--onepass (for djpeg).
-
-On MS-DOS, the temporary files are created in the directory named by the TMP
-or TEMP environment variable, or in the current directory if neither of those
-exist.  Amiga implementations put the temp files in the directory named by
-JPEGTMP:, so be sure to assign JPEGTMP: to a disk partition with adequate free
-space.
-
-The default memory usage limit (-maxmemory) is set when the software is
-compiled.  If you get an "insufficient memory" error, try specifying a smaller
--maxmemory value, even -maxmemory 0 to use the absolute minimum space.  You
-may want to recompile with a smaller default value if this happens often.
-
-On machines that have "environment" variables, you can define the environment
-variable JPEGMEM to set the default memory limit.  The value is specified as
-described for the -maxmemory switch.  JPEGMEM overrides the default value
-specified when the program was compiled, and itself is overridden by an
-explicit -maxmemory switch.
-
-On MS-DOS machines, -maxmemory is the amount of main (conventional) memory to
-use.  (Extended or expanded memory is also used if available.)  Most
-DOS-specific versions of this software do their own memory space estimation
-and do not need you to specify -maxmemory.
-
-
-JPEGTRAN
-
-jpegtran performs various useful transformations of JPEG files.
-It can translate the coded representation from one variant of JPEG to another,
-for example from baseline JPEG to progressive JPEG or vice versa.  It can also
-perform some rearrangements of the image data, for example turning an image
-from landscape to portrait format by rotation.
-
-jpegtran works by rearranging the compressed data (DCT coefficients), without
-ever fully decoding the image.  Therefore, its transformations are lossless:
-there is no image degradation at all, which would not be true if you used
-djpeg followed by cjpeg to accomplish the same conversion.  But by the same
-token, jpegtran cannot perform lossy operations such as changing the image
-quality.
-
-jpegtran uses a command line syntax similar to cjpeg or djpeg.
-On Unix-like systems, you say:
-	jpegtran [switches] [inputfile] >outputfile
-On most non-Unix systems, you say:
-	jpegtran [switches] inputfile outputfile
-where both the input and output files are JPEG files.
-
-To specify the coded JPEG representation used in the output file,
-jpegtran accepts a subset of the switches recognized by cjpeg:
-	-optimize	Perform optimization of entropy encoding parameters.
-	-progressive	Create progressive JPEG file.
-	-restart N	Emit a JPEG restart marker every N MCU rows, or every
-			N MCU blocks if "B" is attached to the number.
-	-scans file	Use the scan script given in the specified text file.
-See the previous discussion of cjpeg for more details about these switches.
-If you specify none of these switches, you get a plain baseline-JPEG output
-file.  The quality setting and so forth are determined by the input file.
-
-The image can be losslessly transformed by giving one of these switches:
-	-flip horizontal	Mirror image horizontally (left-right).
-	-flip vertical		Mirror image vertically (top-bottom).
-	-rotate 90		Rotate image 90 degrees clockwise.
-	-rotate 180		Rotate image 180 degrees.
-	-rotate 270		Rotate image 270 degrees clockwise (or 90 ccw).
-	-transpose		Transpose image (across UL-to-LR axis).
-	-transverse		Transverse transpose (across UR-to-LL axis).
-
-The transpose transformation has no restrictions regarding image dimensions.
-The other transformations operate rather oddly if the image dimensions are not
-a multiple of the iMCU size (usually 8 or 16 pixels), because they can only
-transform complete blocks of DCT coefficient data in the desired way.
-
-jpegtran's default behavior when transforming an odd-size image is designed
-to preserve exact reversibility and mathematical consistency of the
-transformation set.  As stated, transpose is able to flip the entire image
-area.  Horizontal mirroring leaves any partial iMCU column at the right edge
-untouched, but is able to flip all rows of the image.  Similarly, vertical
-mirroring leaves any partial iMCU row at the bottom edge untouched, but is
-able to flip all columns.  The other transforms can be built up as sequences
-of transpose and flip operations; for consistency, their actions on edge
-pixels are defined to be the same as the end result of the corresponding
-transpose-and-flip sequence.
-
-For practical use, you may prefer to discard any untransformable edge pixels
-rather than having a strange-looking strip along the right and/or bottom edges
-of a transformed image.  To do this, add the -trim switch:
-	-trim		Drop non-transformable edge blocks.
-Obviously, a transformation with -trim is not reversible, so strictly speaking
-jpegtran with this switch is not lossless.  Also, the expected mathematical
-equivalences between the transformations no longer hold.  For example,
-"-rot 270 -trim" trims only the bottom edge, but "-rot 90 -trim" followed by
-"-rot 180 -trim" trims both edges.
-
-Another not-strictly-lossless transformation switch is:
-	-grayscale	Force grayscale output.
-This option discards the chrominance channels if the input image is YCbCr
-(ie, a standard color JPEG), resulting in a grayscale JPEG file.  The
-luminance channel is preserved exactly, so this is a better method of reducing
-to grayscale than decompression, conversion, and recompression.  This switch
-is particularly handy for fixing a monochrome picture that was mistakenly
-encoded as a color JPEG.  (In such a case, the space savings from getting rid
-of the near-empty chroma channels won't be large; but the decoding time for
-a grayscale JPEG is substantially less than that for a color JPEG.)
-
-jpegtran also recognizes these switches that control what to do with "extra"
-markers, such as comment blocks:
-	-copy none	Copy no extra markers from source file.  This setting
-			suppresses all comments and other excess baggage
-			present in the source file.
-	-copy comments	Copy only comment markers.  This setting copies
-			comments from the source file, but discards
-			any other inessential data. 
-	-copy all	Copy all extra markers.  This setting preserves
-			miscellaneous markers found in the source file, such
-			as JFIF thumbnails and Photoshop settings.  In some
-			files these extra markers can be sizable.
-The default behavior is -copy comments.  (Note: in IJG releases v6 and v6a,
-jpegtran always did the equivalent of -copy none.)
-
-Additional switches recognized by jpegtran are:
-	-outfile filename
-	-maxmemory N
-	-verbose
-	-debug
-These work the same as in cjpeg or djpeg.
-
-
-THE COMMENT UTILITIES
-
-The JPEG standard allows "comment" (COM) blocks to occur within a JPEG file.
-Although the standard doesn't actually define what COM blocks are for, they
-are widely used to hold user-supplied text strings.  This lets you add
-annotations, titles, index terms, etc to your JPEG files, and later retrieve
-them as text.  COM blocks do not interfere with the image stored in the JPEG
-file.  The maximum size of a COM block is 64K, but you can have as many of
-them as you like in one JPEG file.
-
-We provide two utility programs to display COM block contents and add COM
-blocks to a JPEG file.
-
-rdjpgcom searches a JPEG file and prints the contents of any COM blocks on
-standard output.  The command line syntax is
-	rdjpgcom [-verbose] [inputfilename]
-The switch "-verbose" (or just "-v") causes rdjpgcom to also display the JPEG
-image dimensions.  If you omit the input file name from the command line,
-the JPEG file is read from standard input.  (This may not work on some
-operating systems, if binary data can't be read from stdin.)
-
-wrjpgcom adds a COM block, containing text you provide, to a JPEG file.
-Ordinarily, the COM block is added after any existing COM blocks, but you
-can delete the old COM blocks if you wish.  wrjpgcom produces a new JPEG
-file; it does not modify the input file.  DO NOT try to overwrite the input
-file by directing wrjpgcom's output back into it; on most systems this will
-just destroy your file.
-
-The command line syntax for wrjpgcom is similar to cjpeg's.  On Unix-like
-systems, it is
-	wrjpgcom [switches] [inputfilename]
-The output file is written to standard output.  The input file comes from
-the named file, or from standard input if no input file is named.
-
-On most non-Unix systems, the syntax is
-	wrjpgcom [switches] inputfilename outputfilename
-where both input and output file names must be given explicitly.
-
-wrjpgcom understands three switches:
-	-replace		 Delete any existing COM blocks from the file.
-	-comment "Comment text"	 Supply new COM text on command line.
-	-cfile name		 Read text for new COM block from named file.
-(Switch names can be abbreviated.)  If you have only one line of comment text
-to add, you can provide it on the command line with -comment.  The comment
-text must be surrounded with quotes so that it is treated as a single
-argument.  Longer comments can be read from a text file.
-
-If you give neither -comment nor -cfile, then wrjpgcom will read the comment
-text from standard input.  (In this case an input image file name MUST be
-supplied, so that the source JPEG file comes from somewhere else.)  You can
-enter multiple lines, up to 64KB worth.  Type an end-of-file indicator
-(usually control-D or control-Z) to terminate the comment text entry.
-
-wrjpgcom will not add a COM block if the provided comment string is empty.
-Therefore -replace -comment "" can be used to delete all COM blocks from a
-file.
-
-These utility programs do not depend on the IJG JPEG library.  In
-particular, the source code for rdjpgcom is intended as an illustration of
-the minimum amount of code required to parse a JPEG file header correctly.
diff --git a/jpeg/wizard.doc b/jpeg/wizard.doc
deleted file mode 100644
index 54170b227df0..000000000000
--- a/jpeg/wizard.doc
+++ /dev/null
@@ -1,211 +0,0 @@
-Advanced usage instructions for the Independent JPEG Group's JPEG software
-==========================================================================
-
-This file describes cjpeg's "switches for wizards".
-
-The "wizard" switches are intended for experimentation with JPEG by persons
-who are reasonably knowledgeable about the JPEG standard.  If you don't know
-what you are doing, DON'T USE THESE SWITCHES.  You'll likely produce files
-with worse image quality and/or poorer compression than you'd get from the
-default settings.  Furthermore, these switches must be used with caution
-when making files intended for general use, because not all JPEG decoders
-will support unusual JPEG parameter settings.
-
-
-Quantization Table Adjustment
------------------------------
-
-Ordinarily, cjpeg starts with a default set of tables (the same ones given
-as examples in the JPEG standard) and scales them up or down according to
-the -quality setting.  The details of the scaling algorithm can be found in
-jcparam.c.  At very low quality settings, some quantization table entries
-can get scaled up to values exceeding 255.  Although 2-byte quantization
-values are supported by the IJG software, this feature is not in baseline
-JPEG and is not supported by all implementations.  If you need to ensure
-wide compatibility of low-quality files, you can constrain the scaled
-quantization values to no more than 255 by giving the -baseline switch.
-Note that use of -baseline will result in poorer quality for the same file
-size, since more bits than necessary are expended on higher AC coefficients.
-
-You can substitute a different set of quantization values by using the
--qtables switch:
-
-	-qtables file	Use the quantization tables given in the named file.
-
-The specified file should be a text file containing decimal quantization
-values.  The file should contain one to four tables, each of 64 elements.
-The tables are implicitly numbered 0,1,etc. in order of appearance.  Table
-entries appear in normal array order (NOT in the zigzag order in which they
-will be stored in the JPEG file).
-
-Quantization table files are free format, in that arbitrary whitespace can
-appear between numbers.  Also, comments can be included: a comment starts
-with '#' and extends to the end of the line.  Here is an example file that
-duplicates the default quantization tables:
-
-	# Quantization tables given in JPEG spec, section K.1
-
-	# This is table 0 (the luminance table):
-	  16  11  10  16  24  40  51  61
-	  12  12  14  19  26  58  60  55
-	  14  13  16  24  40  57  69  56
-	  14  17  22  29  51  87  80  62
-	  18  22  37  56  68 109 103  77
-	  24  35  55  64  81 104 113  92
-	  49  64  78  87 103 121 120 101
-	  72  92  95  98 112 100 103  99
-
-	# This is table 1 (the chrominance table):
-	  17  18  24  47  99  99  99  99
-	  18  21  26  66  99  99  99  99
-	  24  26  56  99  99  99  99  99
-	  47  66  99  99  99  99  99  99
-	  99  99  99  99  99  99  99  99
-	  99  99  99  99  99  99  99  99
-	  99  99  99  99  99  99  99  99
-	  99  99  99  99  99  99  99  99
-
-If the -qtables switch is used without -quality, then the specified tables
-are used exactly as-is.  If both -qtables and -quality are used, then the
-tables taken from the file are scaled in the same fashion that the default
-tables would be scaled for that quality setting.  If -baseline appears, then
-the quantization values are constrained to the range 1-255.
-
-By default, cjpeg will use quantization table 0 for luminance components and
-table 1 for chrominance components.  To override this choice, use the -qslots
-switch:
-
-	-qslots N[,...]		Select which quantization table to use for
-				each color component.
-
-The -qslots switch specifies a quantization table number for each color
-component, in the order in which the components appear in the JPEG SOF marker.
-For example, to create a separate table for each of Y,Cb,Cr, you could
-provide a -qtables file that defines three quantization tables and say
-"-qslots 0,1,2".  If -qslots gives fewer table numbers than there are color
-components, then the last table number is repeated as necessary.
-
-
-Sampling Factor Adjustment
---------------------------
-
-By default, cjpeg uses 2:1 horizontal and vertical downsampling when
-compressing YCbCr data, and no downsampling for all other color spaces.
-You can override this default with the -sample switch:
-
-	-sample HxV[,...]	Set JPEG sampling factors for each color
-				component.
-
-The -sample switch specifies the JPEG sampling factors for each color
-component, in the order in which they appear in the JPEG SOF marker.
-If you specify fewer HxV pairs than there are components, the remaining
-components are set to 1x1 sampling.  For example, the default YCbCr setting
-is equivalent to "-sample 2x2,1x1,1x1", which can be abbreviated to
-"-sample 2x2".
-
-There are still some JPEG decoders in existence that support only 2x1
-sampling (also called 4:2:2 sampling).  Compatibility with such decoders can
-be achieved by specifying "-sample 2x1".  This is not recommended unless
-really necessary, since it increases file size and encoding/decoding time
-with very little quality gain.
-
-
-Multiple Scan / Progression Control
------------------------------------
-
-By default, cjpeg emits a single-scan sequential JPEG file.  The
--progressive switch generates a progressive JPEG file using a default series
-of progression parameters.  You can create multiple-scan sequential JPEG
-files or progressive JPEG files with custom progression parameters by using
-the -scans switch:
-
-	-scans file	Use the scan sequence given in the named file.
-
-The specified file should be a text file containing a "scan script".
-The script specifies the contents and ordering of the scans to be emitted.
-Each entry in the script defines one scan.  A scan definition specifies
-the components to be included in the scan, and for progressive JPEG it also
-specifies the progression parameters Ss,Se,Ah,Al for the scan.  Scan
-definitions are separated by semicolons (';').  A semicolon after the last
-scan definition is optional.
-
-Each scan definition contains one to four component indexes, optionally
-followed by a colon (':') and the four progressive-JPEG parameters.  The
-component indexes denote which color component(s) are to be transmitted in
-the scan.  Components are numbered in the order in which they appear in the
-JPEG SOF marker, with the first component being numbered 0.  (Note that these
-indexes are not the "component ID" codes assigned to the components, just
-positional indexes.)
-
-The progression parameters for each scan are:
-	Ss	Zigzag index of first coefficient included in scan
-	Se	Zigzag index of last coefficient included in scan
-	Ah	Zero for first scan of a coefficient, else Al of prior scan
-	Al	Successive approximation low bit position for scan
-If the progression parameters are omitted, the values 0,63,0,0 are used,
-producing a sequential JPEG file.  cjpeg automatically determines whether
-the script represents a progressive or sequential file, by observing whether
-Ss and Se values other than 0 and 63 appear.  (The -progressive switch is
-not needed to specify this; in fact, it is ignored when -scans appears.)
-The scan script must meet the JPEG restrictions on progression sequences.
-(cjpeg checks that the spec's requirements are obeyed.)
-
-Scan script files are free format, in that arbitrary whitespace can appear
-between numbers and around punctuation.  Also, comments can be included: a
-comment starts with '#' and extends to the end of the line.  For additional
-legibility, commas or dashes can be placed between values.  (Actually, any
-single punctuation character other than ':' or ';' can be inserted.)  For
-example, the following two scan definitions are equivalent:
-	0 1 2: 0 63 0 0;
-	0,1,2 : 0-63, 0,0 ;
-
-Here is an example of a scan script that generates a partially interleaved
-sequential JPEG file:
-
-	0;			# Y only in first scan
-	1 2;			# Cb and Cr in second scan
-
-Here is an example of a progressive scan script using only spectral selection
-(no successive approximation):
-
-	# Interleaved DC scan for Y,Cb,Cr:
-	0,1,2: 0-0,   0, 0 ;
-	# AC scans:
-	0:     1-2,   0, 0 ;	# First two Y AC coefficients
-	0:     3-5,   0, 0 ;	# Three more
-	1:     1-63,  0, 0 ;	# All AC coefficients for Cb
-	2:     1-63,  0, 0 ;	# All AC coefficients for Cr
-	0:     6-9,   0, 0 ;	# More Y coefficients
-	0:     10-63, 0, 0 ;	# Remaining Y coefficients
-
-Here is an example of a successive-approximation script.  This is equivalent
-to the default script used by "cjpeg -progressive" for YCbCr images:
-
-	# Initial DC scan for Y,Cb,Cr (lowest bit not sent)
-	0,1,2: 0-0,   0, 1 ;
-	# First AC scan: send first 5 Y AC coefficients, minus 2 lowest bits:
-	0:     1-5,   0, 2 ;
-	# Send all Cr,Cb AC coefficients, minus lowest bit:
-	# (chroma data is usually too small to be worth subdividing further;
-	#  but note we send Cr first since eye is least sensitive to Cb)
-	2:     1-63,  0, 1 ;
-	1:     1-63,  0, 1 ;
-	# Send remaining Y AC coefficients, minus 2 lowest bits:
-	0:     6-63,  0, 2 ;
-	# Send next-to-lowest bit of all Y AC coefficients:
-	0:     1-63,  2, 1 ;
-	# At this point we've sent all but the lowest bit of all coefficients.
-	# Send lowest bit of DC coefficients
-	0,1,2: 0-0,   1, 0 ;
-	# Send lowest bit of AC coefficients
-	2:     1-63,  1, 0 ;
-	1:     1-63,  1, 0 ;
-	# Y AC lowest bit scan is last; it's usually the largest scan
-	0:     1-63,  1, 0 ;
-
-It may be worth pointing out that this script is tuned for quality settings
-of around 50 to 75.  For lower quality settings, you'd probably want to use
-a script with fewer stages of successive approximation (otherwise the
-initial scans will be really bad).  For higher quality settings, you might
-want to use more stages of successive approximation (so that the initial
-scans are not too large).
-- 
2.11.4.GIT