Initial softfloat port from Bochs to 86box, currently selectable only on QT.

2023-04-29 18:56:57 +02:00
parent 071c05e65f
commit 7a53e1de45
44 changed files with 16934 additions and 115 deletions
--- a/src/cpu/softfloat/CMakeLists.txt
+++ b/src/cpu/softfloat/CMakeLists.txt
@@ -0,0 +1,17 @@
+#
+# 86Box    A hypervisor and IBM PC system emulator that specializes in
+#          running old operating systems and software designed for IBM
+#          PC systems and compatibles from 1981 through fairly recent
+#          system designs based on the PCI bus.
+#
+#          This file is part of the 86Box distribution.
+#
+#          CMake build script for the "softfloat" object library.
+#
+# Authors: David Hrdlička, <hrdlickadavid@outlook.com>
+#
+#          Copyright 2020-2021 David Hrdlička.
+#
+
+add_library(softfloat OBJECT f2xm1.cc fpatan.cc fprem.cc fsincos.cc fyl2x.cc poly.cc softfloat.cc softfloat16.cc
+		softfloat-muladd.cc softfloat-round-pack.cc softfloat-specialize.cc softfloatx80.cc)
--- a/src/cpu/softfloat/config.h
+++ b/src/cpu/softfloat/config.h
@@ -0,0 +1,46 @@
+#include <stdint.h>
+/* Fixed-width integer aliases and literal/inline helper macros for the SoftFloat sources. */
+typedef int8_t flag;
+typedef uint8_t uint8;
+typedef int8_t int8;
+typedef uint16_t uint16;
+typedef int16_t int16;
+typedef uint32_t uint32;
+typedef int32_t int32;
+typedef uint64_t uint64;
+typedef int64_t int64;
+
+/*----------------------------------------------------------------------------
+| Each of the following `typedef's defines a type that holds integers
+| of _exactly_ the number of bits specified.  For instance, for most
+| implementations of C, `bits16' and `sbits16' should be `typedef'ed to
+| `unsigned short int' and `signed short int' (or `short int'), respectively.
+*----------------------------------------------------------------------------*/
+typedef uint8_t bits8;
+typedef int8_t sbits8;
+typedef uint16_t bits16;
+typedef int16_t sbits16;
+typedef uint32_t bits32;
+typedef int32_t sbits32;
+typedef uint64_t bits64;
+typedef int64_t sbits64;
+/* Bit<N>u / Bit<N>s spellings of the same fixed-width types (naming presumably inherited from Bochs). */
+typedef uint8_t Bit8u;
+typedef int8_t Bit8s;
+typedef uint16_t Bit16u;
+typedef int16_t Bit16s;
+typedef uint32_t Bit32u;
+typedef int32_t Bit32s;
+typedef uint64_t Bit64u;
+typedef int64_t Bit64s;
+
+/*----------------------------------------------------------------------------
+| The `BX_CONST64' macro takes as its argument a textual integer literal and
+| marks the literal as having a 64-bit integer type by pasting the `LL'
+| (`long long') suffix onto it, e.g. BX_CONST64(0x3fff000000000000) expands
+| to 0x3fff000000000000LL.  (Older SoftFloat text apparently called this
+| macro `LIT64'.)  `BX_CPP_INLINE' expands to `static __inline' and is meant
+| to prefix small helper functions.
+*----------------------------------------------------------------------------*/
+#define BX_CONST64(a) a##LL
+#define BX_CPP_INLINE static __inline
--- a/src/cpu/softfloat/f2xm1.cc
+++ b/src/cpu/softfloat/f2xm1.cc
@@ -0,0 +1,182 @@
+/*============================================================================
+This source file is an extension to the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b, written for Bochs (x86 achitecture simulator)
+floating point emulation.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Written for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#define FLOAT128
+
+#include "softfloatx80.h"
+#include "softfloat-round-pack.h"
+
+static const floatx80 floatx80_negone  = packFloatx80(1, 0x3fff, BX_CONST64(0x8000000000000000)); /* -1.0 */
+static const floatx80 floatx80_neghalf = packFloatx80(1, 0x3ffe, BX_CONST64(0x8000000000000000)); /* -0.5 */
+static const float128 float128_ln2     = /* ln(2) as a float128 */
+    packFloat128(BX_CONST64(0x3ffe62e42fefa39e), BX_CONST64(0xf35793c7673007e6));
+/* 128-bit significand of ln(2) used by f2xm1's tiny-argument path; only the low word depends on BETTER_THAN_PENTIUM. */
+#ifdef BETTER_THAN_PENTIUM
+
+#define LN2_SIG_HI BX_CONST64(0xb17217f7d1cf79ab)
+#define LN2_SIG_LO BX_CONST64(0xc9e3b39800000000)  /* 96 bit precision */
+
+#else
+
+#define LN2_SIG_HI BX_CONST64(0xb17217f7d1cf79ab)
+#define LN2_SIG_LO BX_CONST64(0xc000000000000000)  /* 67-bit precision */
+
+#endif
+
+#define EXP_ARR_SIZE 15
+/* Series coefficients 1/k! for k = 1..15 in float128 format; each entry's trailing comment gives k. */
+static float128 exp_arr[EXP_ARR_SIZE] =
+{
+    PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /*  1 */
+    PACK_FLOAT_128(0x3ffe000000000000, 0x0000000000000000), /*  2 */
+    PACK_FLOAT_128(0x3ffc555555555555, 0x5555555555555555), /*  3 */
+    PACK_FLOAT_128(0x3ffa555555555555, 0x5555555555555555), /*  4 */
+    PACK_FLOAT_128(0x3ff8111111111111, 0x1111111111111111), /*  5 */
+    PACK_FLOAT_128(0x3ff56c16c16c16c1, 0x6c16c16c16c16c17), /*  6 */
+    PACK_FLOAT_128(0x3ff2a01a01a01a01, 0xa01a01a01a01a01a), /*  7 */
+    PACK_FLOAT_128(0x3fefa01a01a01a01, 0xa01a01a01a01a01a), /*  8 */
+    PACK_FLOAT_128(0x3fec71de3a556c73, 0x38faac1c88e50017), /*  9 */
+    PACK_FLOAT_128(0x3fe927e4fb7789f5, 0xc72ef016d3ea6679), /* 10 */
+    PACK_FLOAT_128(0x3fe5ae64567f544e, 0x38fe747e4b837dc7), /* 11 */
+    PACK_FLOAT_128(0x3fe21eed8eff8d89, 0x7b544da987acfe85), /* 12 */
+    PACK_FLOAT_128(0x3fde6124613a86d0, 0x97ca38331d23af68), /* 13 */
+    PACK_FLOAT_128(0x3fda93974a8c07c9, 0xd20badf145dfa3e5), /* 14 */
+    PACK_FLOAT_128(0x3fd6ae7f3e733b81, 0xf11d8656b0ee8cb0)  /* 15 */
+};
+/* Polynomial evaluator defined outside this file (presumably poly.cc - TODO confirm). */
+extern float128 EvalPoly(float128 x, float128 *arr, int n, struct float_status_t *status);
+
+/* Approximates e^x - 1 as x * EvalPoly(x, exp_arr, EXP_ARR_SIZE); required -1 < x < 1 */
+static float128 poly_exp(float128 x, struct float_status_t *status)
+{
+/*
+    //               2     3     4     5     6     7     8     9
+    //  x           x     x     x     x     x     x     x     x
+    // e - 1 ~ x + --- + --- + --- + --- + --- + --- + --- + --- + ...
+    //              2!    3!    4!    5!    6!    7!    8!    9!
+    //
+    //                     2     3     4     5     6     7     8
+    //              x     x     x     x     x     x     x     x
+    //   = x [ 1 + --- + --- + --- + --- + --- + --- + --- + --- + ... ]
+    //              2!    3!    4!    5!    6!    7!    8!    9!
+    //
+    //           8                          8
+    //          --       2k                --        2k+1
+    //   p(x) = >  C  * x           q(x) = >  C   * x
+    //          --  2k                     --  2k+1
+    //          k=0                        k=0
+    //
+    //    x
+    //   e  - 1 ~ x * [ p(x) + x * q(x) ]
+    //
+*/
+    float128 t = EvalPoly(x, exp_arr, EXP_ARR_SIZE, status); /* presumably the bracketed series above - EvalPoly is external */
+    return float128_mul(t, x, status); /* multiply by the x factored out of the series */
+}
+
+// =================================================
+//                                  x
+// F2XM1                   Compute 2  - 1
+// =================================================
+
+//
+// Uses the following identities:
+//
+// 1. ----------------------------------------------------------
+//      x    x*ln(2)
+//     2  = e
+//
+// 2. ----------------------------------------------------------
+//                      2     3     4     5           n
+//      x        x     x     x     x     x           x
+//     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
+//               1!    2!    3!    4!    5!          n!
+//
+// f2xm1: returns 2^a - 1 for |a| < 1 and for a == -1; other finite |a| >= 1 come back unchanged. Flags go to `status'.
+floatx80 f2xm1(floatx80 a, struct float_status_t *status)
+{
+/*----------------------------------------------------------------------------
+| The pattern for a default generated extended double-precision NaN.
+*----------------------------------------------------------------------------*/
+    const floatx80 floatx80_default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+    Bit64u zSig0, zSig1, zSig2;
+
+    // handle unsupported extended double-precision floating encodings
+    if (floatx80_is_unsupported(a))
+    {
+        float_raise(status, float_flag_invalid);
+        return floatx80_default_nan;
+    }
+
+    Bit64u aSig = extractFloatx80Frac(a);
+    Bit32s aExp = extractFloatx80Exp(a);
+    int aSign = extractFloatx80Sign(a);
+    // exponent 0x7FFF: NaN (fraction bits below the top bit set) or infinity
+    if (aExp == 0x7FFF) {
+        if ((Bit64u) (aSig<<1))
+            return propagateFloatx80NaNOne(a, status);
+        // -infinity yields -1, +infinity is returned unchanged
+        return (aSign) ? floatx80_negone : a;
+    }
+
+    if (aExp == 0) {
+        if (aSig == 0) return a; // zero is returned unchanged
+        float_raise(status, float_flag_denormal | float_flag_inexact);
+        normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
+        // denormal and very small arguments: the result is a * ln(2), formed from a 192-bit significand product
+    tiny_argument:
+        mul128By64To192(LN2_SIG_HI, LN2_SIG_LO, aSig, &zSig0, &zSig1, &zSig2);
+        if (0 < (Bit64s) zSig0) { // top bit of the product is clear: shift left once and lower the exponent
+            shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
+            --aExp;
+        }
+        return
+            roundAndPackFloatx80(80, aSign, aExp, zSig0, zSig1, status);
+    }
+
+    float_raise(status, float_flag_inexact);
+
+    if (aExp < 0x3FFF)
+    {
+        if (aExp < FLOATX80_EXP_BIAS-68)
+            goto tiny_argument;
+
+        /* ******************************** */
+        /* using float128 for approximation */
+        /* ******************************** */
+
+        float128 x = floatx80_to_float128(a, status);
+        x = float128_mul(x, float128_ln2, status); // x = a * ln(2)
+        x = poly_exp(x, status);                   // x = e^(a*ln(2)) - 1
+        return float128_to_floatx80(x, status);
+    }
+    else
+    {
+        if (a.exp == 0xBFFF && ! (aSig<<1)) // sign set, exponent 0x3FFF, no fraction bits: a == -1.0
+           return floatx80_neghalf;
+        // any other argument with exponent >= 0x3FFF is passed through unchanged
+        return a;
+    }
+}
--- a/src/cpu/softfloat/fpatan.cc
+++ b/src/cpu/softfloat/fpatan.cc
@@ -0,0 +1,288 @@
+/*============================================================================
+This source file is an extension to the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b, written for Bochs (x86 achitecture simulator)
+floating point emulation.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Written for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#define FLOAT128
+
+#include "softfloatx80.h"
+#include "softfloat-round-pack.h"
+#include "fpu_constant.h"
+
+#define FPATAN_ARR_SIZE 11
+/* Constants used by fpatan's argument reduction and quadrant fix-up. */
+static const float128 float128_one = /* 1.0 */
+        packFloat128(BX_CONST64(0x3fff000000000000), BX_CONST64(0x0000000000000000));
+static const float128 float128_sqrt3 = /* sqrt(3) */
+        packFloat128(BX_CONST64(0x3fffbb67ae8584ca), BX_CONST64(0xa73b25742d7078b8));
+static const floatx80 floatx80_pi  = /* pi in extended double-precision */
+        packFloatx80(0, 0x4000, BX_CONST64(0xc90fdaa22168c235));
+/* pi/2, pi/4 and pi/6 in float128 format */
+static const float128 float128_pi2 =
+        packFloat128(BX_CONST64(0x3fff921fb54442d1), BX_CONST64(0x8469898CC5170416));
+static const float128 float128_pi4 =
+        packFloat128(BX_CONST64(0x3ffe921fb54442d1), BX_CONST64(0x8469898CC5170416));
+static const float128 float128_pi6 =
+        packFloat128(BX_CONST64(0x3ffe0c152382d736), BX_CONST64(0x58465BB32E0F580F));
+/* Arctangent series coefficients (-1)^k / (2k+1) for k = 0..10; each trailing comment gives 2k+1. */
+static float128 atan_arr[FPATAN_ARR_SIZE] =
+{
+    PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /*  1 */
+    PACK_FLOAT_128(0xbffd555555555555, 0x5555555555555555), /*  3 */
+    PACK_FLOAT_128(0x3ffc999999999999, 0x999999999999999a), /*  5 */
+    PACK_FLOAT_128(0xbffc249249249249, 0x2492492492492492), /*  7 */
+    PACK_FLOAT_128(0x3ffbc71c71c71c71, 0xc71c71c71c71c71c), /*  9 */
+    PACK_FLOAT_128(0xbffb745d1745d174, 0x5d1745d1745d1746), /* 11 */
+    PACK_FLOAT_128(0x3ffb3b13b13b13b1, 0x3b13b13b13b13b14), /* 13 */
+    PACK_FLOAT_128(0xbffb111111111111, 0x1111111111111111), /* 15 */
+    PACK_FLOAT_128(0x3ffae1e1e1e1e1e1, 0xe1e1e1e1e1e1e1e2), /* 17 */
+    PACK_FLOAT_128(0xbffaaf286bca1af2, 0x86bca1af286bca1b), /* 19 */
+    PACK_FLOAT_128(0x3ffa861861861861, 0x8618618618618618)  /* 21 */
+};
+/* Odd-power polynomial evaluator defined outside this file (presumably poly.cc - TODO confirm). */
+extern float128 OddPoly(float128 x, float128 *arr, int n, struct float_status_t *status);
+
+/* Approximates atan(x1) by handing the atan_arr coefficients to OddPoly; intended for |x| < 1/4 */
+static float128 poly_atan(float128 x1, struct float_status_t *status)
+{
+/*
+    //                 3     5     7     9     11     13     15     17
+    //                x     x     x     x     x      x      x      x
+    // atan(x) ~ x - --- + --- - --- + --- - ---- + ---- - ---- + ----
+    //                3     5     7     9     11     13     15     17
+    //
+    //                 2     4     6     8     10     12     14     16
+    //                x     x     x     x     x      x      x      x
+    //   = x * [ 1 - --- + --- - --- + --- - ---- + ---- - ---- + ---- ]
+    //                3     5     7     9     11     13     15     17
+    //
+    //           5                          5
+    //          --       4k                --        4k+2
+    //   p(x) = >  C  * x           q(x) = >  C   * x
+    //          --  2k                     --  2k+1
+    //          k=0                        k=0
+    //
+    //                            2
+    //    atan(x) ~ x * [ p(x) + x * q(x) ]
+    //
+*/
+    return OddPoly(x1, atan_arr, FPATAN_ARR_SIZE, status); /* presumably evaluates the series above - OddPoly is external */
+}
+
+// =================================================
+// FPATAN                  Compute arctan(b/a)
+//                         (quadrant from signs)
+// =================================================
+
+//
+// Uses the following identities:
+//
+// 1. ----------------------------------------------------------
+//
+//   atan(-x) = -atan(x)
+//
+// 2. ----------------------------------------------------------
+//
+//                             x + y
+//   atan(x) + atan(y) = atan -------, xy < 1
+//                             1-xy
+//
+//                             x + y
+//   atan(x) + atan(y) = atan ------- + PI, x > 0, xy > 1
+//                             1-xy
+//
+//                             x + y
+//   atan(x) + atan(y) = atan ------- - PI, x < 0, xy > 1
+//                             1-xy
+//
+// 3. ----------------------------------------------------------
+//
+//   atan(x) = atan(INF) + atan(- 1/x)
+//
+//                           x-1
+//   atan(x) = PI/4 + atan( ----- )
+//                           x+1
+//
+//                           x * sqrt(3) - 1
+//   atan(x) = PI/6 + atan( ----------------- )
+//                             x + sqrt(3)
+//
+// 4. ----------------------------------------------------------
+//                   3     5     7     9                 2n+1
+//                  x     x     x     x              n  x
+//   atan(x) = x - --- + --- - --- + --- - ... + (-1)  ------ + ...
+//                  3     5     7     9                 2n+1
+//
+// fpatan: returns the angle arctan(b/a), placed in the quadrant given by the signs of a and b; flags go to `status'.
+floatx80 fpatan(floatx80 a, floatx80 b, struct float_status_t *status)
+{
+/*----------------------------------------------------------------------------
+| The pattern for a default generated extended double-precision NaN.
+*----------------------------------------------------------------------------*/
+    const floatx80 floatx80_default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+
+    // handle unsupported extended double-precision floating encodings
+    if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) {
+        float_raise(status, float_flag_invalid);
+        return floatx80_default_nan;
+    }
+
+    Bit64u aSig = extractFloatx80Frac(a);
+    Bit32s aExp = extractFloatx80Exp(a);
+    int aSign = extractFloatx80Sign(a);
+    Bit64u bSig = extractFloatx80Frac(b);
+    Bit32s bExp = extractFloatx80Exp(b);
+    int bSign = extractFloatx80Sign(b);
+
+    int zSign = aSign ^ bSign;
+    // b is NaN or infinity
+    if (bExp == 0x7FFF)
+    {
+        if ((Bit64u) (bSig<<1))
+            return propagateFloatx80NaN(a, b, status);
+
+        if (aExp == 0x7FFF) {
+            if ((Bit64u) (aSig<<1))
+                return propagateFloatx80NaN(a, b, status);
+
+            if (aSign) {   /* return 3PI/4 */
+                return roundAndPackFloatx80(80, bSign,
+                        FLOATX80_3PI4_EXP, FLOAT_3PI4_HI, FLOAT_3PI4_LO, status);
+            }
+            else {         /* return  PI/4 */
+                return roundAndPackFloatx80(80, bSign,
+                        FLOATX80_PI4_EXP, FLOAT_PI_HI, FLOAT_PI_LO, status);
+            }
+        }
+
+        if (aSig && (aExp == 0))
+            float_raise(status, float_flag_denormal);
+
+        /* return PI/2 */
+        return roundAndPackFloatx80(80, bSign, FLOATX80_PI2_EXP, FLOAT_PI_HI, FLOAT_PI_LO, status);
+    }
+    if (aExp == 0x7FFF) // a is NaN or infinity (b is finite here)
+    {
+        if ((Bit64u) (aSig<<1))
+            return propagateFloatx80NaN(a, b, status);
+
+        if (bSig && (bExp == 0))
+            float_raise(status, float_flag_denormal);
+
+return_PI_or_ZERO:
+        // negative a gives PI, non-negative a gives zero; both results carry b's sign
+        if (aSign) {   /* return PI */
+            return roundAndPackFloatx80(80, bSign, FLOATX80_PI_EXP, FLOAT_PI_HI, FLOAT_PI_LO, status);
+        } else {       /* return  0 */
+            return packFloatx80(bSign, 0, 0);
+        }
+    }
+    if (bExp == 0) // b is zero or denormal
+    {
+        if (bSig == 0) {
+             if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
+             goto return_PI_or_ZERO;
+        }
+
+        float_raise(status, float_flag_denormal);
+        normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
+    }
+    if (aExp == 0) // a is zero or denormal
+    {
+        if (aSig == 0)   /* return PI/2 */
+            return roundAndPackFloatx80(80, bSign, FLOATX80_PI2_EXP, FLOAT_PI_HI, FLOAT_PI_LO, status);
+
+        float_raise(status, float_flag_denormal);
+        normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
+    }
+
+    float_raise(status, float_flag_inexact);
+
+    /* |a| = |b| ==> return PI/4 */
+    if (aSig == bSig && aExp == bExp)
+        return roundAndPackFloatx80(80, bSign, FLOATX80_PI4_EXP, FLOAT_PI_HI, FLOAT_PI_LO, status);
+
+    /* ******************************** */
+    /* using float128 for approximation */
+    /* ******************************** */
+    // a and b repacked as float128 values; the first argument (presumably the sign) is 0 for both
+    float128 a128 = normalizeRoundAndPackFloat128(0, aExp-0x10, aSig, 0, status);
+    float128 b128 = normalizeRoundAndPackFloat128(0, bExp-0x10, bSig, 0, status);
+    float128 x;
+    int swap = 0, add_pi6 = 0, add_pi4 = 0;
+    // divide the smaller magnitude by the larger one; `swap' records that x = a/b rather than b/a
+    if (aExp > bExp || (aExp == bExp && aSig > bSig))
+    {
+        x = float128_div(b128, a128, status);
+    }
+    else {
+        x = float128_div(a128, b128, status);
+        swap = 1;
+    }
+
+    Bit32s xExp = extractFloat128Exp(x);
+    // very small quotient: range reduction and the polynomial are skipped and x itself is used
+    if (xExp <= FLOATX80_EXP_BIAS-40)
+        goto approximation_completed;
+
+    if (x.hi >= BX_CONST64(0x3ffe800000000000))        // 3/4 < x < 1
+    {
+        /*
+        arctan(x) = arctan((x-1)/(x+1)) + pi/4
+        */
+        float128 t1 = float128_sub(x, float128_one, status);
+        float128 t2 = float128_add(x, float128_one, status);
+        x = float128_div(t1, t2, status);
+        add_pi4 = 1;
+    }
+    else
+    {
+        /* argument correction */
+        if (xExp >= 0x3FFD)                     // 1/4 < x < 3/4
+        {
+            /*
+            arctan(x) = arctan((x*sqrt(3)-1)/(x+sqrt(3))) + pi/6
+            */
+            float128 t1 = float128_mul(x, float128_sqrt3, status);
+            float128 t2 = float128_add(x, float128_sqrt3, status);
+            x = float128_sub(t1, float128_one, status);
+            x = float128_div(x, t2, status);
+            add_pi6 = 1;
+        }
+    }
+
+    x = poly_atan(x, status);
+    if (add_pi6) x = float128_add(x, float128_pi6, status);
+    if (add_pi4) x = float128_add(x, float128_pi4, status);
+
+approximation_completed:
+    if (swap) x = float128_sub(float128_pi2, x, status); // x was a/b: the wanted angle is PI/2 - x
+    floatx80 result = float128_to_floatx80(x, status);
+    if (zSign) floatx80_chs(result); // NOTE(review): return value discarded - relies on floatx80_chs negating `result' in place; confirm
+    int rSign = extractFloatx80Sign(result);
+    if (!bSign && rSign) // b is non-negative but the result came out negative: add PI
+        return floatx80_add(result, floatx80_pi, status);
+    if (bSign && !rSign) // b is negative but the result came out non-negative: subtract PI
+        return floatx80_sub(result, floatx80_pi, status);
+    return result;
+}
--- a/src/cpu/softfloat/fprem.cc
+++ b/src/cpu/softfloat/fprem.cc
@@ -0,0 +1,196 @@
+/*============================================================================
+This source file is an extension to the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b, written for Bochs (x86 achitecture simulator)
+floating point emulation.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Written for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#include "softfloatx80.h"
+#include "softfloat-round-pack.h"
+#define USE_estimateDiv128To64
+#include "softfloat-macros.h"
+
+/* executes single exponent reduction cycle: divides (aSig0 << expDiff) by bSig, returns the quotient and stores the remainder via zSig0/zSig1 */
+static Bit64u remainder_kernel(Bit64u aSig0, Bit64u bSig, int expDiff, Bit64u *zSig0, Bit64u *zSig1)
+{
+    Bit64u term0, term1;
+    Bit64u aSig1 = 0;
+    /* aSig1:aSig0 is treated as one 128-bit value; aSig1 starts at 0 and receives the bits shifted out of aSig0 */
+    shortShift128Left(aSig1, aSig0, expDiff, &aSig1, &aSig0);
+    Bit64u q = estimateDiv128To64(aSig1, aSig0, bSig); /* quotient estimate; corrected downwards by the loop below */
+    mul64To128(bSig, q, &term0, &term1);
+    sub128(aSig1, aSig0, term0, term1, zSig1, zSig0); /* remainder = dividend - bSig * q; note *zSig1 takes the first (presumably high) word */
+    while ((Bit64s)(*zSig1) < 0) { /* remainder went negative: the estimate was too large */
+        --q;
+        add128(*zSig1, *zSig0, 0, bSig, zSig1, zSig0);
+    }
+    return q;
+}
+/* Shared remainder worker: stores the remainder of a by b in *r and the quotient in *q. Returns -1 when *r is a NaN, 1 after a partial reduction (expDiff >= 64), 0 otherwise. */
+static int do_fprem(floatx80 a, floatx80 b, floatx80 *r, Bit64u *q, int rounding_mode, struct float_status_t *status)
+{
+/*----------------------------------------------------------------------------
+| The pattern for a default generated extended double-precision NaN.
+*----------------------------------------------------------------------------*/
+    const floatx80 floatx80_default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+
+    Bit32s aExp, bExp, zExp, expDiff;
+    Bit64u aSig0, aSig1, bSig;
+    int aSign;
+    *q = 0;
+
+    // handle unsupported extended double-precision floating encodings
+    if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
+    {
+        float_raise(status, float_flag_invalid);
+        *r = floatx80_default_nan;
+        return -1;
+    }
+
+    aSig0 = extractFloatx80Frac(a);
+    aExp = extractFloatx80Exp(a);
+    aSign = extractFloatx80Sign(a);
+    bSig = extractFloatx80Frac(b);
+    bExp = extractFloatx80Exp(b);
+    // a is NaN or infinity: the result is a NaN in every case
+    if (aExp == 0x7FFF) {
+        if ((Bit64u) (aSig0<<1) || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1))) {
+            *r = propagateFloatx80NaN(a, b, status);
+            return -1;
+        }
+        float_raise(status, float_flag_invalid);
+        *r = floatx80_default_nan;
+        return -1;
+    }
+    if (bExp == 0x7FFF) { // b is NaN or infinity (a is finite here)
+        if ((Bit64u) (bSig<<1)) {
+            *r = propagateFloatx80NaN(a, b, status);
+            return -1;
+        }
+        if (aExp == 0 && aSig0) { // denormal a with infinite b
+            float_raise(status, float_flag_denormal);
+            normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+            *r = (a.fraction & BX_CONST64(0x8000000000000000)) ?
+                    packFloatx80(aSign, aExp, aSig0) : a;
+            return 0;
+        }
+        *r = a; // finite a with infinite b: a is returned as the remainder
+        return 0;
+
+    }
+    if (bExp == 0) {
+        if (bSig == 0) { // b is zero: invalid operation
+            float_raise(status, float_flag_invalid);
+            *r = floatx80_default_nan;
+            return -1;
+        }
+        float_raise(status, float_flag_denormal);
+        normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
+    }
+    if (aExp == 0) {
+        if (aSig0 == 0) { // a is zero: returned unchanged
+            *r = a;
+            return 0;
+        }
+        float_raise(status, float_flag_denormal);
+        normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+    }
+    expDiff = aExp - bExp;
+    aSig1 = 0;
+
+    Bit32u overflow = 0;
+    // exponent gap of 64 or more: do one partial reduction step and report it through the return value
+    if (expDiff >= 64) {
+        int n = (expDiff & 0x1f) | 0x20; // step size, always in the range 32..63
+        remainder_kernel(aSig0, bSig, n, &aSig0, &aSig1); // quotient of this step is discarded, *q stays 0
+        zExp = aExp - n;
+        overflow = 1;
+    }
+    else {
+        zExp = bExp;
+
+        if (expDiff < 0) {
+            if (expDiff < -1) { // a's exponent is at least 2 below b's: a is returned as the remainder
+               *r = (a.fraction & BX_CONST64(0x8000000000000000)) ?
+                    packFloatx80(aSign, aExp, aSig0) : a;
+               return 0;
+            }
+            shift128Right(aSig0, 0, 1, &aSig0, &aSig1); // expDiff == -1: shift a right one bit to line up with b's exponent
+            expDiff = 0;
+        }
+
+        if (expDiff > 0) {
+            *q = remainder_kernel(aSig0, bSig, expDiff, &aSig0, &aSig1);
+        }
+        else {
+            if (bSig <= aSig0) { // aligned exponents: at most one subtraction of b is needed
+               aSig0 -= bSig;
+               *q = 1;
+            }
+        }
+        // nearest-even mode only: if the remainder exceeds b/2 (or equals b/2 with an odd quotient), bump the quotient and flip the sign
+        if (rounding_mode == float_round_nearest_even)
+        {
+            Bit64u term0, term1;
+            shift128Right(bSig, 0, 1, &term0, &term1); // term0:term1 = b/2
+
+            if (! lt128(aSig0, aSig1, term0, term1))
+            {
+               int lt = lt128(term0, term1, aSig0, aSig1);
+               int eq = eq128(aSig0, aSig1, term0, term1);
+
+               if ((eq && ((*q) & 1)) || lt) {
+                  aSign = !aSign;
+                  ++(*q);
+               }
+               if (lt) sub128(bSig, 0, aSig0, aSig1, &aSig0, &aSig1); // magnitude becomes b - remainder
+            }
+        }
+    }
+
+    *r = normalizeRoundAndPackFloatx80(80, aSign, zExp, aSig0, aSig1, status);
+    return overflow;
+}
+
+/*----------------------------------------------------------------------------
+| Stores in `*r' the remainder of the extended double-precision value `a' with
+| respect to `b', computed by do_fprem with the quotient rounded to nearest
+| even as in the IEC/IEEE standard; `*q' and the return value are do_fprem's.
+*----------------------------------------------------------------------------*/
+
+int floatx80_ieee754_remainder(floatx80 a, floatx80 b, floatx80 *r, Bit64u *q, struct float_status_t *status)
+{
+    return do_fprem(a, b, r, q, float_round_nearest_even, status);
+}
+
+/*----------------------------------------------------------------------------
+| Stores in `*r' the remainder of the extended double-precision value `a'
+| with respect to `b'.  Unlike the previous function, this one does not
+| compute the remainder specified in the IEC/IEEE Standard for Binary
+| Floating-Point Arithmetic: do_fprem is called with float_round_to_zero, so
+| its round-to-nearest-even correction of the quotient is skipped.  `*q' and
+| the return value are passed through from do_fprem.
+*----------------------------------------------------------------------------*/
+
+int floatx80_remainder(floatx80 a, floatx80 b, floatx80 *r, Bit64u *q, struct float_status_t *status)
+{
+    return do_fprem(a, b, r, q, float_round_to_zero, status);
+}
--- a/src/cpu/softfloat/fpu_constant.h
+++ b/src/cpu/softfloat/fpu_constant.h
@@ -0,0 +1,82 @@
+/*============================================================================
+This source file is an extension to the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b, written for Bochs (x86 achitecture simulator)
+floating point emulation.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+#ifndef _FPU_CONSTANTS_H_
+#define _FPU_CONSTANTS_H_
+
+#include "config.h"
+
+// Pentium CPU uses only 68-bit precision M_PI approximation; enable the define below for the full 128-bit fractions
+//#define BETTER_THAN_PENTIUM
+
+/*============================================================================
+ * Written for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+//////////////////////////////
+// PI, PI/2, PI/4 constants
+//////////////////////////////
+// PI, PI/2 and PI/4 share one fraction (FLOAT_PI_HI:FLOAT_PI_LO) and differ only in the exponent macro used
+#define FLOATX80_PI_EXP  (0x4000)
+
+// 128-bit PI fraction
+#ifdef BETTER_THAN_PENTIUM
+#define FLOAT_PI_HI (BX_CONST64(0xc90fdaa22168c234))
+#define FLOAT_PI_LO (BX_CONST64(0xc4c6628b80dc1cd1))
+#else
+#define FLOAT_PI_HI (BX_CONST64(0xc90fdaa22168c234))
+#define FLOAT_PI_LO (BX_CONST64(0xC000000000000000))
+#endif
+
+#define FLOATX80_PI2_EXP  (0x3FFF)
+#define FLOATX80_PI4_EXP  (0x3FFE)
+
+//////////////////////////////
+// 3PI/4 constant
+//////////////////////////////
+
+#define FLOATX80_3PI4_EXP (0x4000)
+
+// 128-bit 3PI/4 fraction
+#ifdef BETTER_THAN_PENTIUM
+#define FLOAT_3PI4_HI (BX_CONST64(0x96cbe3f9990e91a7))
+#define FLOAT_3PI4_LO (BX_CONST64(0x9394c9e8a0a5159c))
+#else
+#define FLOAT_3PI4_HI (BX_CONST64(0x96cbe3f9990e91a7))
+#define FLOAT_3PI4_LO (BX_CONST64(0x9000000000000000))
+#endif
+
+//////////////////////////////
+// 1/LN2 constant
+//////////////////////////////
+
+#define FLOAT_LN2INV_EXP  (0x3FFF)
+
+// 128-bit 1/LN2 fraction
+#ifdef BETTER_THAN_PENTIUM
+#define FLOAT_LN2INV_HI (BX_CONST64(0xb8aa3b295c17f0bb))
+#define FLOAT_LN2INV_LO (BX_CONST64(0xbe87fed0691d3e89))
+#else
+#define FLOAT_LN2INV_HI (BX_CONST64(0xb8aa3b295c17f0bb))
+#define FLOAT_LN2INV_LO (BX_CONST64(0xC000000000000000))
+#endif
+
+#endif /* _FPU_CONSTANTS_H_ */
--- a/src/cpu/softfloat/fsincos.cc
+++ b/src/cpu/softfloat/fsincos.cc
@@ -0,0 +1,441 @@
+/*============================================================================
+This source file is an extension to the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b, written for Bochs (x86 architecture simulator)
+floating point emulation.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Written for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#define FLOAT128
+
+#define USE_estimateDiv128To64
+#include "softfloatx80.h"
+#include "softfloat-round-pack.h"
+#include "fpu_constant.h"
+
+static const floatx80 floatx80_one = packFloatx80(0, 0x3fff, BX_CONST64(0x8000000000000000)); // 1.0: sign 0, biased exponent 0x3fff, only bit 63 of the significand set
+
+/* Reduce a trigonometric function argument using the 128-bit precision
+   M_PI approximation FLOAT_PI_HI:FLOAT_PI_LO.
+
+   aSig0   64-bit significand of the argument
+   Exp     left-shift count applied to the significand before dividing;
+           the only caller in this file passes a value greater than 0
+   zSig0   out: on return, the more significant 64 bits of the remainder
+   zSig1   out: on return, the less significant 64 bits of the remainder
+
+   Returns the quotient q after the correction loop.
+
+   While the function runs, *zSig1 temporarily holds the top word of the
+   192-bit intermediate remainder; it is overwritten with the lowest word
+   (term2) just before returning.
+
+   NOTE(review): word order above assumes the usual SoftFloat convention
+   that the first word argument of the 128/192-bit helpers is the most
+   significant one - confirm against softfloat-macros. */
+static Bit64u argument_reduction_kernel(Bit64u aSig0, int Exp, Bit64u *zSig0, Bit64u *zSig1)
+{
+    Bit64u term0, term1, term2;
+    Bit64u aSig1 = 0;
+
+    shortShift128Left(aSig1, aSig0, Exp, &aSig1, &aSig0); // widen the significand to 128 bits, shifted left by Exp
+    Bit64u q = estimateDiv128To64(aSig1, aSig0, FLOAT_PI_HI); // quotient estimate from the high word of the constant only
+    mul128By64To192(FLOAT_PI_HI, FLOAT_PI_LO, q, &term0, &term1, &term2); // q times the full 128-bit constant
+    sub128(aSig1, aSig0, term0, term1, zSig1, zSig0);
+    while ((Bit64s)(*zSig1) < 0) { // estimate was too large: step q down and add the constant back
+        --q;
+        add192(*zSig1, *zSig0, term2, 0, FLOAT_PI_HI, FLOAT_PI_LO, zSig1, zSig0, &term2);
+    }
+    *zSig1 = term2; // hand back the lowest word in place of the top word
+    return q;
+}
+/* Reduce the 128-bit significand *aSig0:*aSig1 modulo the constant
+   FLOAT_PI_HI:FLOAT_PI_LO and fold the remainder into the lower half of
+   that period.
+
+   expDiff  exponent of the argument relative to the scale of the result;
+            the callers in this file only pass values from -1 to 62
+   zSign    in/out: inverted when the remainder lies above half of the
+            constant, or equals it while the quotient is odd
+   aSig0    in/out: more significant word; holds the argument significand
+            on entry
+   aSig1    in/out: less significant word
+
+   Returns the quotient, incremented whenever *zSign was inverted, modulo 4.
+
+   NOTE(review): at the callers' scale (biased exponent FLOATX80_EXP_BIAS)
+   the constant presumably stands for pi/2, cf. FLOATX80_PI2_EXP, which
+   would make the return value a quadrant number - confirm. */
+static int reduce_trig_arg(int expDiff, int *zSign, Bit64u *aSig0, Bit64u *aSig1)
+{
+    Bit64u term0, term1, q = 0;
+
+    if (expDiff < 0) { // bring the significand to the scale of exponent difference 0
+        shift128Right(*aSig0, 0, 1, aSig0, aSig1);
+        expDiff = 0;
+    }
+    if (expDiff > 0) {
+        q = argument_reduction_kernel(*aSig0, expDiff, aSig0, aSig1);
+    }
+    else {
+        if (FLOAT_PI_HI <= *aSig0) { // at most one multiple of the constant fits
+            *aSig0 -= FLOAT_PI_HI;
+            q = 1;
+        }
+    }
+
+    shift128Right(FLOAT_PI_HI, FLOAT_PI_LO, 1, &term0, &term1); // term0:term1 = half of the constant
+    if (! lt128(*aSig0, *aSig1, term0, term1))
+    {
+        int lt = lt128(term0, term1, *aSig0, *aSig1);
+        int eq = eq128(*aSig0, *aSig1, term0, term1);
+
+        if ((eq && (q & 1)) || lt) {
+            *zSign = !(*zSign);
+            ++q;
+        }
+        if (lt) sub128(FLOAT_PI_HI, FLOAT_PI_LO, *aSig0, *aSig1, aSig0, aSig1); // remainder = constant - remainder
+    }
+
+    return (int)(q & 3);
+}
+
+#define SIN_ARR_SIZE 11 // number of coefficients in sin_arr
+#define COS_ARR_SIZE 11 // number of coefficients in cos_arr
+/* sin_arr[k] = (-1)^k / (2k+1)!, the coefficient of x^(2k+1) in the Taylor
+   series of sin(x); the comment after each entry gives that power of x. */
+static float128 sin_arr[SIN_ARR_SIZE] =
+{
+    PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /*  1 */
+    PACK_FLOAT_128(0xbffc555555555555, 0x5555555555555555), /*  3 */
+    PACK_FLOAT_128(0x3ff8111111111111, 0x1111111111111111), /*  5 */
+    PACK_FLOAT_128(0xbff2a01a01a01a01, 0xa01a01a01a01a01a), /*  7 */
+    PACK_FLOAT_128(0x3fec71de3a556c73, 0x38faac1c88e50017), /*  9 */
+    PACK_FLOAT_128(0xbfe5ae64567f544e, 0x38fe747e4b837dc7), /* 11 */
+    PACK_FLOAT_128(0x3fde6124613a86d0, 0x97ca38331d23af68), /* 13 */
+    PACK_FLOAT_128(0xbfd6ae7f3e733b81, 0xf11d8656b0ee8cb0), /* 15 */
+    PACK_FLOAT_128(0x3fce952c77030ad4, 0xa6b2605197771b00), /* 17 */
+    PACK_FLOAT_128(0xbfc62f49b4681415, 0x724ca1ec3b7b9675), /* 19 */
+    PACK_FLOAT_128(0x3fbd71b8ef6dcf57, 0x18bef146fcee6e45)  /* 21 */
+};
+/* cos_arr[k] = (-1)^k / (2k)!, the coefficient of x^(2k) in the Taylor
+   series of cos(x); the comment after each entry gives that power of x. */
+static float128 cos_arr[COS_ARR_SIZE] =
+{
+    PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /*  0 */
+    PACK_FLOAT_128(0xbffe000000000000, 0x0000000000000000), /*  2 */
+    PACK_FLOAT_128(0x3ffa555555555555, 0x5555555555555555), /*  4 */
+    PACK_FLOAT_128(0xbff56c16c16c16c1, 0x6c16c16c16c16c17), /*  6 */
+    PACK_FLOAT_128(0x3fefa01a01a01a01, 0xa01a01a01a01a01a), /*  8 */
+    PACK_FLOAT_128(0xbfe927e4fb7789f5, 0xc72ef016d3ea6679), /* 10 */
+    PACK_FLOAT_128(0x3fe21eed8eff8d89, 0x7b544da987acfe85), /* 12 */
+    PACK_FLOAT_128(0xbfda93974a8c07c9, 0xd20badf145dfa3e5), /* 14 */
+    PACK_FLOAT_128(0x3fd2ae7f3e733b81, 0xf11d8656b0ee8cb0), /* 16 */
+    PACK_FLOAT_128(0xbfca6827863b97d9, 0x77bb004886a2c2ab), /* 18 */
+    PACK_FLOAT_128(0x3fc1e542ba402022, 0x507a9cad2bf8f0bb)  /* 20 */
+};
+
+extern float128 OddPoly(float128 x, float128 *arr, int n, struct float_status_t *status);
+
+// Polynomial approximation of sin(x), intended for 0 <= x <= pi/4.
+//
+// The sine is approximated by the first SIN_ARR_SIZE terms of its Taylor
+// series, written so that only even powers remain inside the bracket:
+//
+//   sin(x) ~ x * [ C0 + C1*x^2 + C2*x^4 + ... + C10*x^20 ]
+//
+// where Ck = sin_arr[k]. The evaluation itself is delegated to OddPoly(),
+// which receives the coefficient table and its length.
+BX_CPP_INLINE float128 poly_sin(float128 x, struct float_status_t *status)
+{
+    float128 approx = OddPoly(x, sin_arr, SIN_ARR_SIZE, status);
+    return approx;
+}
+
+extern float128 EvenPoly(float128 x, float128 *arr, int n, struct float_status_t *status);
+
+// Polynomial approximation of cos(x), intended for 0 <= x <= pi/4.
+//
+// The cosine is approximated by the first COS_ARR_SIZE terms of its Taylor
+// series, a polynomial in even powers of x:
+//
+//   cos(x) ~ C0 + C1*x^2 + C2*x^4 + ... + C10*x^20
+//
+// where Ck = cos_arr[k]. The evaluation itself is delegated to EvenPoly(),
+// which receives the coefficient table and its length.
+BX_CPP_INLINE float128 poly_cos(float128 x, struct float_status_t *status)
+{
+    float128 approx = EvenPoly(x, cos_arr, COS_ARR_SIZE, status);
+    return approx;
+}
+
+// Stores the same value `a` into every requested output.
+// An output pointer equal to 0 means that result is not wanted and is skipped.
+BX_CPP_INLINE void sincos_invalid(floatx80 *sin_a, floatx80 *cos_a, floatx80 a)
+{
+    if (sin_a != 0) {
+        *sin_a = a;
+    }
+    if (cos_a != 0) {
+        *cos_a = a;
+    }
+}
+
+// Results for an argument so small that sin(a) is taken as a itself and
+// cos(a) as 1.0. An output pointer equal to 0 is skipped.
+BX_CPP_INLINE void sincos_tiny_argument(floatx80 *sin_a, floatx80 *cos_a, floatx80 a)
+{
+    if (sin_a != 0) {
+        *sin_a = a;
+    }
+    if (cos_a != 0) {
+        *cos_a = floatx80_one;
+    }
+}
+
+// Evaluates the sine or cosine polynomial on the reduced argument `r` and
+// applies the sign that belongs to the selected quadrant.
+//
+//   neg       non-zero if a sine result has to be negated; ignored when the
+//             cosine polynomial is selected
+//   r         reduced argument
+//   quotient  bit 0 selects the cosine polynomial (otherwise sine),
+//             bit 1 toggles the sign of the result
+//
+// Returns the approximation converted to extended double precision.
+static floatx80 sincos_approximation(int neg, float128 r, Bit64u quotient, struct float_status_t *status)
+{
+    const int use_cosine = (quotient & 0x1) != 0;
+    const int toggle_sign = (quotient & 0x2) != 0;
+
+    float128 approx = use_cosine ? poly_cos(r, status) : poly_sin(r, status);
+    floatx80 result = float128_to_floatx80(approx, status);
+
+    int negative = use_cosine ? 0 : (neg != 0);
+    if (toggle_sign) {
+        negative = !negative;
+    }
+
+    if (negative) {
+        // floatx80_chs is relied upon to flip the sign of `result` in place
+        floatx80_chs(result);
+    }
+
+    return result;
+}
+
+// =================================================
+// FSINCOS               Compute sin(x) and cos(x)
+// =================================================
+
+//
+// Uses the following identities:
+// ----------------------------------------------------------
+//
+//  sin(-x) = -sin(x)
+//  cos(-x) =  cos(x)
+//
+//  sin(x+y) = sin(x)*cos(y)+cos(x)*sin(y)
+//  cos(x+y) = cos(x)*cos(y)-sin(x)*sin(y)
+//
+//  sin(x+ pi/2)  =  cos(x)
+//  sin(x+ pi)    = -sin(x)
+//  sin(x+3pi/2)  = -cos(x)
+//  sin(x+2pi)    =  sin(x)
+//
+
+int fsincos(floatx80 a, floatx80 *sin_a, floatx80 *cos_a, struct float_status_t *status)
+{
+/*----------------------------------------------------------------------------
+| The pattern for a default generated extended double-precision NaN.
+*----------------------------------------------------------------------------*/
+    const floatx80 floatx80_default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+
+    Bit64u aSig0, aSig1 = 0;
+    Bit32s aExp, zExp, expDiff;
+    int aSign, zSign;
+    int q = 0;
+
+    // handle unsupported extended double-precision floating encodings
+    if (floatx80_is_unsupported(a)) {
+        goto invalid;
+    }
+
+    aSig0 = extractFloatx80Frac(a);
+    aExp = extractFloatx80Exp(a);
+    aSign = extractFloatx80Sign(a);
+
+    /* invalid argument */
+    if (aExp == 0x7FFF) {
+        if ((Bit64u) (aSig0<<1)) {
+            sincos_invalid(sin_a, cos_a, propagateFloatx80NaNOne(a, status));
+            return 0;
+        }
+
+    invalid:
+        float_raise(status, float_flag_invalid);
+        sincos_invalid(sin_a, cos_a, floatx80_default_nan);
+        return 0;
+    }
+
+    if (aExp == 0) {
+        if (aSig0 == 0) {
+            sincos_tiny_argument(sin_a, cos_a, a);
+            return 0;
+        }
+
+        float_raise(status, float_flag_denormal);
+
+        /* handle pseudo denormals */
+        if (! (aSig0 & BX_CONST64(0x8000000000000000)))
+        {
+            float_raise(status, float_flag_inexact);
+            if (sin_a)
+                float_raise(status, float_flag_underflow);
+            sincos_tiny_argument(sin_a, cos_a, a);
+            return 0;
+        }
+
+        normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+    }
+
+    zSign = aSign;
+    zExp = FLOATX80_EXP_BIAS;
+    expDiff = aExp - zExp;
+
+    /* argument is out-of-range */
+    if (expDiff >= 63)
+        return -1;
+
+    float_raise(status, float_flag_inexact);
+
+    if (expDiff < -1) {    // doesn't require reduction
+        if (expDiff <= -68) {
+            a = packFloatx80(aSign, aExp, aSig0);
+            sincos_tiny_argument(sin_a, cos_a, a);
+            return 0;
+        }
+        zExp = aExp;
+    }
+    else {
+        q = reduce_trig_arg(expDiff, &zSign, &aSig0, &aSig1);
+    }
+
+    /* **************************** */
+    /* argument reduction completed */
+    /* **************************** */
+
+    /* using float128 for approximation */
+    float128 r = normalizeRoundAndPackFloat128(0, zExp-0x10, aSig0, aSig1, status);
+
+    if (aSign) q = -q;
+    if (sin_a) *sin_a = sincos_approximation(zSign, r,   q, status);
+    if (cos_a) *cos_a = sincos_approximation(zSign, r, q+1, status);
+
+    return 0;
+}
+
+// Replaces *a with its sine by calling fsincos() with only the sine output
+// requested (the cosine pointer is 0). Returns the fsincos() result code.
+int fsin(floatx80 *a, struct float_status_t *status)
+{
+    const floatx80 arg = *a;
+    return fsincos(arg, a, 0, status);
+}
+
+// Replaces *a with its cosine by calling fsincos() with only the cosine
+// output requested (the sine pointer is 0). Returns the fsincos() result code.
+int fcos(floatx80 *a, struct float_status_t *status)
+{
+    const floatx80 arg = *a;
+    return fsincos(arg, 0, a, status);
+}
+
+// =================================================
+// FPTAN                 Compute tan(x)
+// =================================================
+
+//
+// Uses the following identities:
+//
+// 1. ----------------------------------------------------------
+//
+//  sin(-x) = -sin(x)
+//  cos(-x) =  cos(x)
+//
+//  sin(x+y) = sin(x)*cos(y)+cos(x)*sin(y)
+//  cos(x+y) = cos(x)*cos(y)-sin(x)*sin(y)
+//
+//  sin(x+ pi/2)  =  cos(x)
+//  sin(x+ pi)    = -sin(x)
+//  sin(x+3pi/2)  = -cos(x)
+//  sin(x+2pi)    =  sin(x)
+//
+// 2. ----------------------------------------------------------
+//
+//           sin(x)
+//  tan(x) = ------
+//           cos(x)
+//
+
+int ftan(floatx80 *a, struct float_status_t *status)
+{
+/*----------------------------------------------------------------------------
+| The pattern for a default generated extended double-precision NaN.
+*----------------------------------------------------------------------------*/
+    const floatx80 floatx80_default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+
+    Bit64u aSig0, aSig1 = 0;
+    Bit32s aExp, zExp, expDiff;
+    int aSign, zSign;
+    int q = 0;
+
+    // handle unsupported extended double-precision floating encodings
+    if (floatx80_is_unsupported(*a)) {
+        goto invalid;
+    }
+
+    aSig0 = extractFloatx80Frac(*a);
+    aExp = extractFloatx80Exp(*a);
+    aSign = extractFloatx80Sign(*a);
+
+    /* invalid argument */
+    if (aExp == 0x7FFF) {
+        if ((Bit64u) (aSig0<<1))
+        {
+            *a = propagateFloatx80NaNOne(*a, status);
+            return 0;
+        }
+
+    invalid:
+        float_raise(status, float_flag_invalid);
+        *a = floatx80_default_nan;
+        return 0;
+    }
+
+    if (aExp == 0) {
+        if (aSig0 == 0) return 0;
+        float_raise(status, float_flag_denormal);
+        /* handle pseudo denormals */
+        if (! (aSig0 & BX_CONST64(0x8000000000000000)))
+        {
+            float_raise(status, float_flag_inexact | float_flag_underflow);
+            return 0;
+        }
+        normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+    }
+
+    zSign = aSign;
+    zExp = FLOATX80_EXP_BIAS;
+    expDiff = aExp - zExp;
+
+    /* argument is out-of-range */
+    if (expDiff >= 63)
+        return -1;
+
+    float_raise(status, float_flag_inexact);
+
+    if (expDiff < -1) {    // doesn't require reduction
+        if (expDiff <= -68) {
+            *a = packFloatx80(aSign, aExp, aSig0);
+            return 0;
+        }
+        zExp = aExp;
+    }
+    else {
+        q = reduce_trig_arg(expDiff, &zSign, &aSig0, &aSig1);
+    }
+
+    /* **************************** */
+    /* argument reduction completed */
+    /* **************************** */
+
+    /* using float128 for approximation */
+    float128 r = normalizeRoundAndPackFloat128(0, zExp-0x10, aSig0, aSig1, status);
+
+    float128 sin_r = poly_sin(r, status);
+    float128 cos_r = poly_cos(r, status);
+
+    if (q & 0x1) {
+        r = float128_div(cos_r, sin_r, status);
+        zSign = ! zSign;
+    } else {
+        r = float128_div(sin_r, cos_r, status);
+    }
+
+    *a = float128_to_floatx80(r, status);
+    if (zSign)
+        floatx80_chs(*a);
+
+    return 0;
+}
--- a/src/cpu/softfloat/fyl2x.cc
+++ b/src/cpu/softfloat/fyl2x.cc
@@ -0,0 +1,363 @@
+/*============================================================================
+This source file is an extension to the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b, written for Bochs (x86 architecture simulator)
+floating point emulation.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Written for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#define FLOAT128
+
+#include "softfloatx80.h"
+#include "softfloat-round-pack.h"
+#include "fpu_constant.h"
+
+static const floatx80 floatx80_one = // 1.0 in extended double precision
+    packFloatx80(0, 0x3fff, BX_CONST64(0x8000000000000000));
+
+static const float128 float128_one = // 1.0 in quadruple precision
+    packFloat128(BX_CONST64(0x3fff000000000000), BX_CONST64(0x0000000000000000));
+static const float128 float128_two = // 2.0 in quadruple precision
+    packFloat128(BX_CONST64(0x4000000000000000), BX_CONST64(0x0000000000000000));
+
+static const float128 float128_ln2inv2 = // 2/ln(2): scales the output of poly_ln() to a base-2 logarithm
+    packFloat128(BX_CONST64(0x400071547652b82f), BX_CONST64(0xe1777d0ffda0d23a));
+
+#define SQRT2_HALF_SIG 	BX_CONST64(0xb504f333f9de6484) // 64-bit significand of sqrt(2)/2
+
+extern float128 OddPoly(float128 x, float128 *arr, int n, struct float_status_t *status);
+
+#define L2_ARR_SIZE 9 // number of coefficients in ln_arr
+/* ln_arr[k] = 1 / (2k+1), the coefficient of u^(2k+1) in the series used by
+   poly_ln(); the comment after each entry gives that power of u. */
+static float128 ln_arr[L2_ARR_SIZE] =
+{
+    PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /*  1 */
+    PACK_FLOAT_128(0x3ffd555555555555, 0x5555555555555555), /*  3 */
+    PACK_FLOAT_128(0x3ffc999999999999, 0x999999999999999a), /*  5 */
+    PACK_FLOAT_128(0x3ffc249249249249, 0x2492492492492492), /*  7 */
+    PACK_FLOAT_128(0x3ffbc71c71c71c71, 0xc71c71c71c71c71c), /*  9 */
+    PACK_FLOAT_128(0x3ffb745d1745d174, 0x5d1745d1745d1746), /* 11 */
+    PACK_FLOAT_128(0x3ffb3b13b13b13b1, 0x3b13b13b13b13b14), /* 13 */
+    PACK_FLOAT_128(0x3ffb111111111111, 0x1111111111111111), /* 15 */
+    PACK_FLOAT_128(0x3ffae1e1e1e1e1e1, 0xe1e1e1e1e1e1e1e2)  /* 17 */
+};
+
+// Series approximation of 1/2 * ln((1+u)/(1-u)).
+//
+//                  1+u            3       5             17
+//   1/2 * ln ( ------- ) ~ u + u /3 + u /5 + ... + u  /17
+//                  1-u
+//
+//                        = u * [ C0 + C1*u^2 + C2*u^4 + ... + C8*u^16 ]
+//
+// where Ck = ln_arr[k] (L2_ARR_SIZE terms). The evaluation itself is
+// delegated to OddPoly(), which receives the coefficient table and its
+// length.
+static float128 poly_ln(float128 x1, struct float_status_t *status)
+{
+    float128 approx = OddPoly(x1, ln_arr, L2_ARR_SIZE, status);
+    return approx;
+}
+
+/* Approximates log2(x) in quadruple precision.
+ * Required: sqrt(2)/2 < x < sqrt(2).
+ *
+ * Substitutes u = (x-1)/(x+1), evaluates poly_ln(u) and scales the result
+ * by float128_ln2inv2.
+ */
+static float128 poly_l2(float128 x, struct float_status_t *status)
+{
+    const float128 sum = float128_add(x, float128_one, status);
+    const float128 diff = float128_sub(x, float128_one, status);
+    const float128 u = float128_div(diff, sum, status);
+    const float128 series = poly_ln(u, status);
+
+    return float128_mul(series, float128_ln2inv2, status);
+}
+
+/* Approximates log2(x + 1) in quadruple precision.
+ *
+ * Substitutes u = x/(x+2), evaluates poly_ln(u) and scales the result by
+ * float128_ln2inv2.
+ */
+static float128 poly_l2p1(float128 x, struct float_status_t *status)
+{
+    const float128 denominator = float128_add(x, float128_two, status);
+    const float128 u = float128_div(x, denominator, status);
+    const float128 series = poly_ln(u, status);
+
+    return float128_mul(series, float128_ln2inv2, status);
+}
+
+// =================================================
+// FYL2X                   Compute y * log (x)
+//                                        2
+// =================================================
+
+//
+// Uses the following identities:
+//
+// 1. ----------------------------------------------------------
+//              ln(x)
+//   log (x) = -------,  ln (x*y) = ln(x) + ln(y)
+//      2       ln(2)
+//
+// 2. ----------------------------------------------------------
+//                1+u             x-1
+//   ln (x) = ln -----, when u = -----
+//                1-u             x+1
+//
+// 3. ----------------------------------------------------------
+//                        3     5     7           2n+1
+//       1+u             u     u     u           u
+//   ln ----- = 2 [ u + --- + --- + --- + ... + ------ + ... ]
+//       1-u             3     5     7           2n+1
+//
+
+floatx80 fyl2x(floatx80 a, floatx80 b, struct float_status_t *status)
+{
+/*----------------------------------------------------------------------------
+| The pattern for a default generated extended double-precision NaN.
+*----------------------------------------------------------------------------*/
+    const floatx80 floatx80_default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+
+    // handle unsupported extended double-precision floating encodings
+    if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) {
+invalid:
+        float_raise(status, float_flag_invalid);
+        return floatx80_default_nan;
+    }
+
+    Bit64u aSig = extractFloatx80Frac(a);
+    Bit32s aExp = extractFloatx80Exp(a);
+    int aSign = extractFloatx80Sign(a);
+    Bit64u bSig = extractFloatx80Frac(b);
+    Bit32s bExp = extractFloatx80Exp(b);
+    int bSign = extractFloatx80Sign(b);
+
+    int zSign = bSign ^ 1;
+
+    if (aExp == 0x7FFF) {
+        if ((Bit64u) (aSig<<1)
+             || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
+        {
+            return propagateFloatx80NaN(a, b, status);
+        }
+        if (aSign) goto invalid;
+        else {
+            if (bExp == 0) {
+                if (bSig == 0) goto invalid;
+                float_raise(status, float_flag_denormal);
+            }
+            return packFloatx80(bSign, 0x7FFF, BX_CONST64(0x8000000000000000));
+        }
+    }
+    if (bExp == 0x7FFF)
+    {
+        if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
+        if (aSign && (Bit64u)(aExp | aSig)) goto invalid;
+        if (aSig && (aExp == 0))
+            float_raise(status, float_flag_denormal);
+        if (aExp < 0x3FFF) {
+            return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
+        }
+        if (aExp == 0x3FFF && ((Bit64u) (aSig<<1) == 0)) goto invalid;
+        return packFloatx80(bSign, 0x7FFF, BX_CONST64(0x8000000000000000));
+    }
+    if (aExp == 0) {
+        if (aSig == 0) {
+            if ((bExp | bSig) == 0) goto invalid;
+            float_raise(status, float_flag_divbyzero);
+            return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
+        }
+        if (aSign) goto invalid;
+        float_raise(status, float_flag_denormal);
+        normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
+    }
+    if (aSign) goto invalid;
+    if (bExp == 0) {
+        if (bSig == 0) {
+            if (aExp < 0x3FFF) return packFloatx80(zSign, 0, 0);
+            return packFloatx80(bSign, 0, 0);
+        }
+        float_raise(status, float_flag_denormal);
+        normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
+    }
+    if (aExp == 0x3FFF && ((Bit64u) (aSig<<1) == 0))
+        return packFloatx80(bSign, 0, 0);
+
+    float_raise(status, float_flag_inexact);
+
+    int ExpDiff = aExp - 0x3FFF;
+    aExp = 0;
+    if (aSig >= SQRT2_HALF_SIG) {
+        ExpDiff++;
+        aExp--;
+    }
+
+    /* ******************************** */
+    /* using float128 for approximation */
+    /* ******************************** */
+
+    Bit64u zSig0, zSig1;
+    shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1);
+    float128 x = packFloat128Four(0, aExp+0x3FFF, zSig0, zSig1);
+    x = poly_l2(x, status);
+    x = float128_add(x, int64_to_float128((Bit64s) ExpDiff), status);
+    return floatx80_128_mul(b, x, status);
+}
+
+// =================================================
+// FYL2XP1                 Compute y * log (x + 1)
+//                                        2
+// =================================================
+
+//
+// Uses the following identities:
+//
+// 1. ----------------------------------------------------------
+//              ln(x)
+//   log (x) = -------
+//      2       ln(2)
+//
+// 2. ----------------------------------------------------------
+//                  1+u              x
+//   ln (x+1) = ln -----, when u = -----
+//                  1-u             x+2
+//
+// 3. ----------------------------------------------------------
+//                        3     5     7           2n+1
+//       1+u             u     u     u           u
+//   ln ----- = 2 [ u + --- + --- + --- + ... + ------ + ... ]
+//       1-u             3     5     7           2n+1
+//
+
+floatx80 fyl2xp1(floatx80 a, floatx80 b, struct float_status_t *status)
+{
+/*----------------------------------------------------------------------------
+| The pattern for a default generated extended double-precision NaN.
+*----------------------------------------------------------------------------*/
+    const floatx80 floatx80_default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+
+    Bit32s aExp, bExp;
+    Bit64u aSig, bSig, zSig0, zSig1, zSig2;
+    int aSign, bSign;
+
+    // handle unsupported extended double-precision floating encodings
+    if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) {
+invalid:
+        float_raise(status, float_flag_invalid);
+        return floatx80_default_nan;
+    }
+
+    aSig = extractFloatx80Frac(a);
+    aExp = extractFloatx80Exp(a);
+    aSign = extractFloatx80Sign(a);
+    bSig = extractFloatx80Frac(b);
+    bExp = extractFloatx80Exp(b);
+    bSign = extractFloatx80Sign(b);
+    int zSign = aSign ^ bSign;
+
+    if (aExp == 0x7FFF) {
+        if ((Bit64u) (aSig<<1)
+             || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
+        {
+            return propagateFloatx80NaN(a, b, status);
+        }
+        if (aSign) goto invalid;
+        else {
+            if (bExp == 0) {
+                if (bSig == 0) goto invalid;
+                float_raise(status, float_flag_denormal);
+            }
+            return packFloatx80(bSign, 0x7FFF, BX_CONST64(0x8000000000000000));
+        }
+    }
+    if (bExp == 0x7FFF)
+    {
+        if ((Bit64u) (bSig<<1))
+            return propagateFloatx80NaN(a, b, status);
+
+        if (aExp == 0) {
+            if (aSig == 0) goto invalid;
+            float_raise(status, float_flag_denormal);
+        }
+
+        return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
+    }
+    if (aExp == 0) {
+        if (aSig == 0) {
+            if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
+            return packFloatx80(zSign, 0, 0);
+        }
+        float_raise(status, float_flag_denormal);
+        normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
+    }
+    if (bExp == 0) {
+        if (bSig == 0) return packFloatx80(zSign, 0, 0);
+        float_raise(status, float_flag_denormal);
+        normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
+    }
+
+    float_raise(status, float_flag_inexact);
+
+    if (aSign && aExp >= 0x3FFF)
+        return a;
+
+    if (aExp >= 0x3FFC) // big argument
+    {
+        return fyl2x(floatx80_add(a, floatx80_one, status), b, status);
+    }
+
+    // handle tiny argument
+    if (aExp < FLOATX80_EXP_BIAS-70)
+    {
+        // first order approximation, return (a*b)/ln(2)
+        Bit32s zExp = aExp + FLOAT_LN2INV_EXP - 0x3FFE;
+
+	mul128By64To192(FLOAT_LN2INV_HI, FLOAT_LN2INV_LO, aSig, &zSig0, &zSig1, &zSig2);
+        if (0 < (Bit64s) zSig0) {
+            shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
+            --zExp;
+        }
+
+        zExp = zExp + bExp - 0x3FFE;
+	mul128By64To192(zSig0, zSig1, bSig, &zSig0, &zSig1, &zSig2);
+        if (0 < (Bit64s) zSig0) {
+            shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
+            --zExp;
+        }
+
+        return
+            roundAndPackFloatx80(80, aSign ^ bSign, zExp, zSig0, zSig1, status);
+    }
+
+    /* ******************************** */
+    /* using float128 for approximation */
+    /* ******************************** */
+
+    shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1);
+    float128 x = packFloat128Four(aSign, aExp, zSig0, zSig1);
+    x = poly_l2p1(x, status);
+    return floatx80_128_mul(b, x, status);
+}
--- a/src/cpu/softfloat/poly.cc
+++ b/src/cpu/softfloat/poly.cc
@@ -0,0 +1,89 @@
+/*============================================================================
+This source file is an extension to the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b, written for Bochs (x86 architecture simulator)
+floating point emulation.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Written for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#define FLOAT128
+
+#include <assert.h>
+#include "softfloat.h"
+
+//                            2         3         4               n
+// f(x) ~ C + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x)
+//         0    1         2         3         4               n
+//
+//          --       2k                --        2k+1
+//   p(x) = >  C  * x           q(x) = >  C   * x
+//          --  2k                     --  2k+1
+//
+//   f(x) ~ [ p(x) + x * q(x) ]
+//
+
+// Evaluates the polynomial
+//
+//   f(x) = arr[0] + arr[1]*x + arr[2]*x^2 + ... + arr[n-1]*x^(n-1)
+//
+// with Horner's scheme, starting from the highest coefficient.
+//
+//   x       point at which the polynomial is evaluated
+//   arr     coefficient table, lowest power first; must hold n entries
+//   n       number of coefficients, must be at least 1
+//   status  floating-point status handed to every multiply and add
+//
+// Returns the accumulated value. For n == 1 this is arr[0] and no
+// arithmetic is performed.
+float128 EvalPoly(float128 x, float128 *arr, int n, struct float_status_t *status)
+{
+    assert(arr != 0 && n > 0);
+
+    float128 r = arr[--n];
+
+    // Pre-tested loop: a single-coefficient table must not enter the body,
+    // otherwise arr[--n] would read the element before the table.
+    while (n > 0) {
+        r = float128_mul(r, x, status);
+        r = float128_add(r, arr[--n], status);
+    }
+
+    return r;
+}
+
+//                  2         4         6         8               2n
+// f(x) ~ C + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x)
+//         0    1         2         3         4               n
+//
+//          --       4k                --        4k+2
+//   p(x) = >  C  * x           q(x) = >  C   * x
+//          --  2k                     --  2k+1
+//
+//                    2
+//   f(x) ~ [ p(x) + x * q(x) ]
+//
+
+// Evaluates a polynomial in even powers of x,
+//
+//   f(x) = arr[0] + arr[1]*x^2 + arr[2]*x^4 + ... + arr[n-1]*x^(2(n-1)),
+//
+// by passing x*x to EvalPoly() together with the same coefficient table.
+float128 EvenPoly(float128 x, float128 *arr, int n, struct float_status_t *status)
+{
+    const float128 x_squared = float128_mul(x, x, status);
+    return EvalPoly(x_squared, arr, n, status);
+}
+
+//                        3         5         7         9               2n+1
+// f(x) ~ (C * x) + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x)
+//          0         1         2         3         4               n
+//                        2         4         6         8               2n
+//      = x * [ C + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x)
+//               0    1         2         3         4               n
+//
+//          --       4k                --        4k+2
+//   p(x) = >  C  * x           q(x) = >  C   * x
+//          --  2k                     --  2k+1
+//
+//                        2
+//   f(x) ~ x * [ p(x) + x * q(x) ]
+//
+
+// Evaluates a polynomial in odd powers of x,
+//
+//   f(x) = arr[0]*x + arr[1]*x^3 + arr[2]*x^5 + ... + arr[n-1]*x^(2(n-1)+1),
+//
+// as x multiplied by the even-power polynomial returned by EvenPoly().
+float128 OddPoly(float128 x, float128 *arr, int n, struct float_status_t *status)
+{
+    const float128 even_part = EvenPoly(x, arr, n, status);
+    return float128_mul(x, even_part, status);
+}
--- a/src/cpu/softfloat/softfloat-compare.h
+++ b/src/cpu/softfloat/softfloat-compare.h
@@ -0,0 +1,496 @@
+/*============================================================================
+This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+
+Written by John R. Hauser.  This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704.  Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.  The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Adapted for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#ifndef _SOFTFLOAT_COMPARE_H_
+#define _SOFTFLOAT_COMPARE_H_
+
+#include "softfloat.h"
+
+// ======= float32 ======= //
+
+typedef int (*float32_compare_method)(float32, float32, struct float_status_t *status); // pointer type matching the float32 predicate signature (a, b, status) -> int
+
+// 0x00
+BX_CPP_INLINE int float32_eq_ordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero only when float32_compare_quiet() yields float_relation_equal. */
+   return float32_compare_quiet(a, b, status) == float_relation_equal;
+}
+
+// 0x01
+BX_CPP_INLINE int float32_lt_ordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero only when float32_compare_two() yields float_relation_less. */
+   return float32_compare_two(a, b, status) == float_relation_less;
+}
+
+// 0x02
+BX_CPP_INLINE int float32_le_ordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   const int outcome = float32_compare_two(a, b, status); /* accept "equal" or "less" */
+   return (outcome == float_relation_equal) || (outcome == float_relation_less);
+}
+
+// 0x03
+BX_CPP_INLINE int float32_unordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero only when float32_compare_quiet() yields float_relation_unordered. */
+   return float32_compare_quiet(a, b, status) == float_relation_unordered;
+}
+
+// 0x04
+BX_CPP_INLINE int float32_neq_unordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero unless float32_compare_quiet() yields float_relation_equal. */
+   return float32_compare_quiet(a, b, status) != float_relation_equal;
+}
+
+// 0x05
+BX_CPP_INLINE int float32_nlt_unordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero unless float32_compare_two() yields float_relation_less. */
+   return float32_compare_two(a, b, status) != float_relation_less;
+}
+
+// 0x06
+BX_CPP_INLINE int float32_nle_unordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   const int outcome = float32_compare_two(a, b, status); /* reject both "less" and "equal" */
+   return !((outcome == float_relation_less) || (outcome == float_relation_equal));
+}
+
+// 0x07
+BX_CPP_INLINE int float32_ordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero unless float32_compare_quiet() yields float_relation_unordered. */
+   return float32_compare_quiet(a, b, status) != float_relation_unordered;
+}
+
+// 0x08
+BX_CPP_INLINE int float32_eq_unordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   const int outcome = float32_compare_quiet(a, b, status); /* accept "unordered" or "equal" */
+   return (outcome == float_relation_unordered) || (outcome == float_relation_equal);
+}
+
+// 0x09
+BX_CPP_INLINE int float32_nge_unordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   const int outcome = float32_compare_two(a, b, status); /* accept "unordered" or "less" */
+   return (outcome == float_relation_unordered) || (outcome == float_relation_less);
+}
+
+// 0x0a
+BX_CPP_INLINE int float32_ngt_unordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero unless float32_compare_two() yields float_relation_greater. */
+   return float32_compare_two(a, b, status) != float_relation_greater;
+}
+
+// 0x0b
+BX_CPP_INLINE int float32_false_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   (void) float32_compare_quiet(a, b, status); /* result ignored; the call itself is kept */
+   return 0;                                   /* constant predicate */
+}
+
+// 0x0c
+BX_CPP_INLINE int float32_neq_ordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   const int outcome = float32_compare_quiet(a, b, status); /* reject both "equal" and "unordered" */
+   return !((outcome == float_relation_equal) || (outcome == float_relation_unordered));
+}
+
+// 0x0d
+BX_CPP_INLINE int float32_ge_ordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   const int outcome = float32_compare_two(a, b, status); /* accept "equal" or "greater" */
+   return (outcome == float_relation_equal) || (outcome == float_relation_greater);
+}
+
+// 0x0e
+BX_CPP_INLINE int float32_gt_ordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero only when float32_compare_two() yields float_relation_greater. */
+   return float32_compare_two(a, b, status) == float_relation_greater;
+}
+
+// 0x0f
+BX_CPP_INLINE int float32_true_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   (void) float32_compare_quiet(a, b, status); /* result ignored; the call itself is kept */
+   return 1;                                   /* constant predicate */
+}
+
+// 0x10
+BX_CPP_INLINE int float32_eq_ordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero only when float32_compare_two() yields float_relation_equal. */
+   return float32_compare_two(a, b, status) == float_relation_equal;
+}
+
+// 0x11
+BX_CPP_INLINE int float32_lt_ordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero only when float32_compare_quiet() yields float_relation_less. */
+   return float32_compare_quiet(a, b, status) == float_relation_less;
+}
+
+// 0x12
+BX_CPP_INLINE int float32_le_ordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   const int outcome = float32_compare_quiet(a, b, status); /* accept "equal" or "less" */
+   return (outcome == float_relation_equal) || (outcome == float_relation_less);
+}
+
+// 0x13
+BX_CPP_INLINE int float32_unordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero only when float32_compare_two() yields float_relation_unordered. */
+   return float32_compare_two(a, b, status) == float_relation_unordered;
+}
+
+// 0x14
+BX_CPP_INLINE int float32_neq_unordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero unless float32_compare_two() yields float_relation_equal. */
+   return float32_compare_two(a, b, status) != float_relation_equal;
+}
+
+// 0x15
+BX_CPP_INLINE int float32_nlt_unordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero unless float32_compare_quiet() yields float_relation_less. */
+   return float32_compare_quiet(a, b, status) != float_relation_less;
+}
+
+// 0x16
+BX_CPP_INLINE int float32_nle_unordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   const int outcome = float32_compare_quiet(a, b, status); /* reject both "less" and "equal" */
+   return !((outcome == float_relation_less) || (outcome == float_relation_equal));
+}
+
+// 0x17
+BX_CPP_INLINE int float32_ordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero unless float32_compare_two() yields float_relation_unordered. */
+   return float32_compare_two(a, b, status) != float_relation_unordered;
+}
+
+// 0x18
+BX_CPP_INLINE int float32_eq_unordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   const int outcome = float32_compare_two(a, b, status); /* accept "unordered" or "equal" */
+   return (outcome == float_relation_unordered) || (outcome == float_relation_equal);
+}
+
+// 0x19
+BX_CPP_INLINE int float32_nge_unordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   const int outcome = float32_compare_quiet(a, b, status); /* accept "unordered" or "less" */
+   return (outcome == float_relation_unordered) || (outcome == float_relation_less);
+}
+
+// 0x1a
+BX_CPP_INLINE int float32_ngt_unordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero unless float32_compare_quiet() yields float_relation_greater. */
+   return float32_compare_quiet(a, b, status) != float_relation_greater;
+}
+
+// 0x1b
+BX_CPP_INLINE int float32_false_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   (void) float32_compare_two(a, b, status); /* result ignored; the call itself is kept */
+   return 0;                                 /* constant predicate */
+}
+
+// 0x1c
+BX_CPP_INLINE int float32_neq_ordered_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   const int outcome = float32_compare_two(a, b, status); /* reject both "equal" and "unordered" */
+   return !((outcome == float_relation_equal) || (outcome == float_relation_unordered));
+}
+
+// 0x1d
+BX_CPP_INLINE int float32_ge_ordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   const int outcome = float32_compare_quiet(a, b, status); /* accept "equal" or "greater" */
+   return (outcome == float_relation_equal) || (outcome == float_relation_greater);
+}
+
+// 0x1e
+BX_CPP_INLINE int float32_gt_ordered_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+   /* Nonzero only when float32_compare_quiet() yields float_relation_greater. */
+   return float32_compare_quiet(a, b, status) == float_relation_greater;
+}
+
+// 0x1f
+BX_CPP_INLINE int float32_true_signalling(float32 a, float32 b, struct float_status_t *status)
+{
+   (void) float32_compare_two(a, b, status); /* result ignored; the call itself is kept */
+   return 1;                                 /* constant predicate */
+}
+
+// ======= float64 ======= //
+
+typedef int (*float64_compare_method)(float64, float64, struct float_status_t *status); // pointer type matching the float64 predicate signature (a, b, status) -> int
+
+// 0x00
+BX_CPP_INLINE int float64_eq_ordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero only when float64_compare_quiet() yields float_relation_equal. */
+   return float64_compare_quiet(a, b, status) == float_relation_equal;
+}
+
+// 0x01
+BX_CPP_INLINE int float64_lt_ordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero only when float64_compare_two() yields float_relation_less. */
+   return float64_compare_two(a, b, status) == float_relation_less;
+}
+
+// 0x02
+BX_CPP_INLINE int float64_le_ordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   const int outcome = float64_compare_two(a, b, status); /* accept "equal" or "less" */
+   return (outcome == float_relation_equal) || (outcome == float_relation_less);
+}
+
+// 0x03
+BX_CPP_INLINE int float64_unordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero only when float64_compare_quiet() yields float_relation_unordered. */
+   return float64_compare_quiet(a, b, status) == float_relation_unordered;
+}
+
+// 0x04
+BX_CPP_INLINE int float64_neq_unordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero unless float64_compare_quiet() yields float_relation_equal. */
+   return float64_compare_quiet(a, b, status) != float_relation_equal;
+}
+
+// 0x05
+BX_CPP_INLINE int float64_nlt_unordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero unless float64_compare_two() yields float_relation_less. */
+   return float64_compare_two(a, b, status) != float_relation_less;
+}
+
+// 0x06
+BX_CPP_INLINE int float64_nle_unordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   const int outcome = float64_compare_two(a, b, status); /* reject both "less" and "equal" */
+   return !((outcome == float_relation_less) || (outcome == float_relation_equal));
+}
+
+// 0x07
+BX_CPP_INLINE int float64_ordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero unless float64_compare_quiet() yields float_relation_unordered. */
+   return float64_compare_quiet(a, b, status) != float_relation_unordered;
+}
+
+// 0x08
+BX_CPP_INLINE int float64_eq_unordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   const int outcome = float64_compare_quiet(a, b, status); /* accept "unordered" or "equal" */
+   return (outcome == float_relation_unordered) || (outcome == float_relation_equal);
+}
+
+// 0x09
+BX_CPP_INLINE int float64_nge_unordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   const int outcome = float64_compare_two(a, b, status); /* accept "unordered" or "less" */
+   return (outcome == float_relation_unordered) || (outcome == float_relation_less);
+}
+
+// 0x0a
+BX_CPP_INLINE int float64_ngt_unordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero unless float64_compare_two() yields float_relation_greater. */
+   return float64_compare_two(a, b, status) != float_relation_greater;
+}
+
+// 0x0b
+BX_CPP_INLINE int float64_false_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   (void) float64_compare_quiet(a, b, status); /* result ignored; the call itself is kept */
+   return 0;                                   /* constant predicate */
+}
+
+// 0x0c
+BX_CPP_INLINE int float64_neq_ordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   const int outcome = float64_compare_quiet(a, b, status); /* reject both "equal" and "unordered" */
+   return !((outcome == float_relation_equal) || (outcome == float_relation_unordered));
+}
+
+// 0x0d
+BX_CPP_INLINE int float64_ge_ordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   const int outcome = float64_compare_two(a, b, status); /* accept "equal" or "greater" */
+   return (outcome == float_relation_equal) || (outcome == float_relation_greater);
+}
+
+// 0x0e
+BX_CPP_INLINE int float64_gt_ordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero only when float64_compare_two() yields float_relation_greater. */
+   return float64_compare_two(a, b, status) == float_relation_greater;
+}
+
+// 0x0f
+BX_CPP_INLINE int float64_true_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   (void) float64_compare_quiet(a, b, status); /* result ignored; the call itself is kept */
+   return 1;                                   /* constant predicate */
+}
+
+// 0x10
+BX_CPP_INLINE int float64_eq_ordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero only when float64_compare_two() yields float_relation_equal. */
+   return float64_compare_two(a, b, status) == float_relation_equal;
+}
+
+// 0x11
+BX_CPP_INLINE int float64_lt_ordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero only when float64_compare_quiet() yields float_relation_less. */
+   return float64_compare_quiet(a, b, status) == float_relation_less;
+}
+
+// 0x12
+BX_CPP_INLINE int float64_le_ordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   const int outcome = float64_compare_quiet(a, b, status); /* accept "equal" or "less" */
+   return (outcome == float_relation_equal) || (outcome == float_relation_less);
+}
+
+// 0x13
+BX_CPP_INLINE int float64_unordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero only when float64_compare_two() yields float_relation_unordered. */
+   return float64_compare_two(a, b, status) == float_relation_unordered;
+}
+
+// 0x14
+BX_CPP_INLINE int float64_neq_unordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero unless float64_compare_two() yields float_relation_equal. */
+   return float64_compare_two(a, b, status) != float_relation_equal;
+}
+
+// 0x15
+BX_CPP_INLINE int float64_nlt_unordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero unless float64_compare_quiet() yields float_relation_less. */
+   return float64_compare_quiet(a, b, status) != float_relation_less;
+}
+
+// 0x16
+BX_CPP_INLINE int float64_nle_unordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   const int outcome = float64_compare_quiet(a, b, status); /* reject both "less" and "equal" */
+   return !((outcome == float_relation_less) || (outcome == float_relation_equal));
+}
+
+// 0x17
+BX_CPP_INLINE int float64_ordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero unless float64_compare_two() yields float_relation_unordered. */
+   return float64_compare_two(a, b, status) != float_relation_unordered;
+}
+
+// 0x18
+BX_CPP_INLINE int float64_eq_unordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   const int outcome = float64_compare_two(a, b, status); /* accept "unordered" or "equal" */
+   return (outcome == float_relation_unordered) || (outcome == float_relation_equal);
+}
+
+// 0x19
+BX_CPP_INLINE int float64_nge_unordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   const int outcome = float64_compare_quiet(a, b, status); /* accept "unordered" or "less" */
+   return (outcome == float_relation_unordered) || (outcome == float_relation_less);
+}
+
+// 0x1a
+BX_CPP_INLINE int float64_ngt_unordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero unless float64_compare_quiet() yields float_relation_greater. */
+   return float64_compare_quiet(a, b, status) != float_relation_greater;
+}
+
+// 0x1b
+BX_CPP_INLINE int float64_false_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   (void) float64_compare_two(a, b, status); /* result ignored; the call itself is kept */
+   return 0;                                 /* constant predicate */
+}
+
+// 0x1c
+BX_CPP_INLINE int float64_neq_ordered_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   const int outcome = float64_compare_two(a, b, status); /* reject both "equal" and "unordered" */
+   return !((outcome == float_relation_equal) || (outcome == float_relation_unordered));
+}
+
+// 0x1d
+BX_CPP_INLINE int float64_ge_ordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   const int outcome = float64_compare_quiet(a, b, status); /* accept "equal" or "greater" */
+   return (outcome == float_relation_equal) || (outcome == float_relation_greater);
+}
+
+// 0x1e
+BX_CPP_INLINE int float64_gt_ordered_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+   /* Nonzero only when float64_compare_quiet() yields float_relation_greater. */
+   return float64_compare_quiet(a, b, status) == float_relation_greater;
+}
+
+// 0x1f
+BX_CPP_INLINE int float64_true_signalling(float64 a, float64 b, struct float_status_t *status)
+{
+   (void) float64_compare_two(a, b, status); /* result ignored; the call itself is kept */
+   return 1;                                 /* constant predicate */
+}
+
+#endif
--- a/src/cpu/softfloat/softfloat-macros.h
+++ b/src/cpu/softfloat/softfloat-macros.h
@@ -0,0 +1,686 @@
+/*============================================================================
+This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b.
+
+Written by John R. Hauser.  This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704.  Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.  The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Adapted for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#ifndef _SOFTFLOAT_MACROS_H_
+#define _SOFTFLOAT_MACROS_H_
+
+/*----------------------------------------------------------------------------
+| Shifts `a' right by the number of bits given in `count'.  If any nonzero
+| bits are shifted off, they are ``jammed'' into the least significant bit of
+| the result by setting the least significant bit to 1.  The value of `count'
+| can be arbitrarily large; in particular, if `count' is greater than 16, the
+| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit16u shift16RightJamming(Bit16u a, int count)
+{
+    Bit16u z;
+    // z = a >> count, with bit 0 also set ("jammed") if any discarded bit of `a` was 1.
+    if (count == 0) {
+        z = a;
+    }
+    else if (count < 16) { // `a` is promoted to int for `<<`: truncate back to 16 bits before the test
+        z = (a>>count) | ((Bit16u)(a<<((-count) & 15)) != 0);
+    }
+    else {
+        z = (a != 0);
+    }
+
+    return z;
+}
+
+/*----------------------------------------------------------------------------
+| Shifts `a' right by the number of bits given in `count'.  If any nonzero
+| bits are shifted off, they are ``jammed'' into the least significant bit of
+| the result by setting the least significant bit to 1.  The value of `count'
+| can be arbitrarily large; in particular, if `count' is greater than 32, the
+| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit32u shift32RightJamming(Bit32u a, int count)
+{
+    // Right shift with a "sticky" low bit: bit 0 of the result also records
+    // whether any 1 bits were shifted out of `a`.
+    if (count == 0) {
+        return a;                   // nothing is shifted out
+    }
+    if (count >= 32) {
+        return (a != 0);            // every bit of `a` is shifted out
+    }
+
+    // For 0 < count < 32, (-count) & 31 equals 32 - count, so the left shift
+    // keeps exactly the low bits that the right shift discards.
+    Bit32u lostBits = a<<((-count) & 31);
+    return (a>>count) | (lostBits != 0);
+}
+
+/*----------------------------------------------------------------------------
+| Shifts `a' right by the number of bits given in `count'.  If any nonzero
+| bits are shifted off, they are ``jammed'' into the least significant bit of
+| the result by setting the least significant bit to 1.  The value of `count'
+| can be arbitrarily large; in particular, if `count' is greater than 64, the
+| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit64u shift64RightJamming(Bit64u a, int count)
+{
+    // Shift `a` right by `count`, folding any discarded 1 bits into bit 0 of
+    // the result.
+    if (count == 0) {
+        return a;                   // no bits are discarded
+    }
+    if (count >= 64) {
+        return (a != 0);            // the whole value is discarded
+    }
+
+    // For 0 < count < 64, (-count) & 63 equals 64 - count: shifting left by
+    // that amount isolates the bits the right shift drops.
+    Bit64u lostBits = a << ((-count) & 63);
+    return (a>>count) | (lostBits != 0);
+}
+
+/*----------------------------------------------------------------------------
+| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
+| _plus_ the number of bits given in `count'.  The shifted result is at most
+| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
+| bits shifted off form a second 64-bit result as follows:  The _last_ bit
+| shifted off is the most-significant bit of the extra result, and the other
+| 63 bits of the extra result are all zero if and only if _all_but_the_last_
+| bits shifted off were all zero.  This extra result is stored in the location
+| pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
+|     (This routine makes more sense if `a0' and `a1' are considered to form
+| a fixed-point value with binary point between `a0' and `a1'.  This fixed-
+| point value is shifted right by the number of bits given in `count', and
+| the integer part of the result is returned at the location pointed to by
+| `z0Ptr'.  The fractional part of the result may be slightly corrupted as
+| described above, and is returned at the location pointed to by `z1Ptr'.)
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void shift64ExtraRightJamming(Bit64u a0, Bit64u a1, int count, Bit64u *z0Ptr, Bit64u *z1Ptr)
+{
+    // `whole` is a0 shifted right by `count`. `extra` receives the bits leaving a0,
+    // with a1 reduced to one sticky bit in bit 0 (for count == 0 it is simply a1).
+    Bit64u whole = 0;
+    Bit64u extra;
+
+    if (count == 0) {
+        whole = a0;
+        extra = a1;
+    }
+    else if (count < 64) {
+        // (-count) & 63 is 64 - count for this range
+        whole = a0>>count;
+        extra = (a0<<((-count) & 63)) | (a1 != 0);
+    }
+    else if (count == 64) {
+        extra = a0 | (a1 != 0);     // a0 moves entirely into the extra word
+    }
+    else {
+        extra = ((a0 | a1) != 0);   // only "was anything nonzero" survives
+    }
+    *z1Ptr = extra;
+    *z0Ptr = whole;
+}
+
+/*----------------------------------------------------------------------------
+| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
+| value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
+| any carry out is lost.  The result is broken into two 64-bit pieces which
+| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void add128(Bit64u a0, Bit64u a1, Bit64u b0, Bit64u b1, Bit64u *z0Ptr, Bit64u *z1Ptr)
+{
+    const Bit64u carry = ((a1 + b1) < a1); // low-half sum wrapped modulo 2^64 => carry into the high half
+    *z1Ptr = a1 + b1;
+    *z0Ptr = a0 + b0 + carry;
+}
+
+/*----------------------------------------------------------------------------
+| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
+| 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
+| 2^128, so any borrow out (carry out) is lost.  The result is broken into two
+| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
+| `z1Ptr'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void sub128(Bit64u a0, Bit64u a1, Bit64u b0, Bit64u b1, Bit64u *z0Ptr, Bit64u *z1Ptr)
+{
+    const Bit64u borrow = (a1 < b1); // low half would underflow => take 1 from the high half
+    *z1Ptr = a1 - b1;
+    *z0Ptr = a0 - b0 - borrow;
+}
+
+/*----------------------------------------------------------------------------
+| Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
+| into two 64-bit pieces which are stored at the locations pointed to by
+| `z0Ptr' and `z1Ptr'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void mul64To128(Bit64u a, Bit64u b, Bit64u *z0Ptr, Bit64u *z1Ptr)
+{
+    // Schoolbook multiplication on 32-bit halves: a = aHi:aLo, b = bHi:bLo.
+    const Bit64u aLo = (Bit32u) a, aHi = a>>32;
+    const Bit64u bLo = (Bit32u) b, bHi = b>>32;
+
+    Bit64u low  = aLo * bLo;            // contributes at bit 0
+    Bit64u high = aHi * bHi;            // contributes at bit 64
+    const Bit64u crossA = aLo * bHi;    // both cross terms contribute at bit 32
+    const Bit64u crossB = aHi * bLo;
+    Bit64u cross = crossA + crossB;     // may wrap modulo 2^64
+
+    // If the cross sum wrapped, the lost carry is worth 2^32 in the high word.
+    high += (((Bit64u) (cross < crossB))<<32) + (cross>>32);
+    cross <<= 32;
+    low += cross;
+    high += (low < cross);              // carry out of the low word
+
+    *z1Ptr = low;
+    *z0Ptr = high;
+}
+
+/*----------------------------------------------------------------------------
+| Returns an approximation to the 64-bit integer quotient obtained by dividing
+| `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
+| divisor `b' must be at least 2^63.  If q is the exact quotient truncated
+| toward zero, the approximation returned lies between q and q + 2 inclusive.
+| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
+| unsigned integer is returned.
+*----------------------------------------------------------------------------*/
+
+#ifdef USE_estimateDiv128To64
+static Bit64u estimateDiv128To64(Bit64u a0, Bit64u a1, Bit64u b)
+{
+    Bit64u b0, b1;
+    Bit64u rem0, rem1, term0, term1;
+    Bit64u z;
+    // Two rounds of 32-bit quotient estimation: the upper half of z first, then the lower half.
+    if (b <= a0) return BX_CONST64(0xFFFFFFFFFFFFFFFF); // quotient needs more than 64 bits: saturate
+    b0 = b>>32; // upper 32 bits of the divisor
+    z = (b0<<32 <= a0) ? BX_CONST64(0xFFFFFFFF00000000) : (a0 / b0)<<32; // upper-half estimate, clamped
+    mul64To128(b, z, &term0, &term1);
+    sub128(a0, a1, term0, term1, &rem0, &rem1); // rem = a - b*z
+    while (((Bit64s) rem0) < 0) { // negative remainder: the estimate was too large
+        z -= BX_CONST64(0x100000000);
+        b1 = b<<32;
+        add128(rem0, rem1, b0, b1, &rem0, &rem1); // b0:b1 is b * 2^32, matching the step taken off z
+    }
+    rem0 = (rem0<<32) | (rem1>>32); // bits 95..32 of the remainder
+    z |= (b0<<32 <= rem0) ? 0xFFFFFFFF : rem0 / b0; // lower-half estimate, clamped
+    return z;
+}
+#endif
+
+/*----------------------------------------------------------------------------
+| Returns an approximation to the square root of the 32-bit significand given
+| by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
+| `aExp' (the least significant bit) is 1, the integer returned approximates
+| 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
+| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
+| case, the approximation returned lies strictly within +/-2 of the exact
+| value.
+*----------------------------------------------------------------------------*/
+
+#ifdef USE_estimateSqrt32
+static Bit32u estimateSqrt32(Bit16s aExp, Bit32u a)
+{
+    static const Bit16u sqrtOddAdjustments[] = { // corrections used when bit 0 of aExp is 1
+        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
+        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
+    };
+    static const Bit16u sqrtEvenAdjustments[] = { // corrections used when bit 0 of aExp is 0
+        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
+        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
+    };
+    Bit32u z;
+    // Bits 30..27 of `a` select one of the 16 entries in the adjustment tables.
+    int index = (a>>27) & 15;
+    if (aExp & 1) {
+        z = 0x4000 + (a>>17) - sqrtOddAdjustments[index]; // table-corrected first guess
+        z = ((a / z)<<14) + (z<<15); // refine the guess using a / z, rescaled
+        a >>= 1;
+    }
+    else {
+        z = 0x8000 + (a>>17) - sqrtEvenAdjustments[index]; // table-corrected first guess
+        z = a / z + z;
+        z = (0x20000 <= z) ? 0xFFFF8000 : (z<<15); // clamp: z<<15 would not fit in 32 bits
+        if (z <= a) return (Bit32u) (((Bit32s) a)>>1);
+    }
+    return ((Bit32u) ((((Bit64u) a)<<31) / z)) + (z>>1); // final step: a*2^31/z + z/2
+}
+#endif
+
+static const int countLeadingZeros8[] = { // entry i = number of leading zero bits in the 8-bit value i (8 for i == 0)
+  8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+#ifdef FLOAT16
+
+/*----------------------------------------------------------------------------
+| Returns the number of leading 0 bits before the most-significant 1 bit of
+| `a'.  If `a' is zero, 16 is returned.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int countLeadingZeros16(Bit16u a)
+{
+    // If the upper byte is empty, move the lower byte up and start the tally at 8.
+    int zeros = 0;
+    if (a < 0x100) {
+        a <<= 8;
+        zeros = 8;
+    }
+    return zeros + countLeadingZeros8[a>>8]; // the table resolves the remaining byte
+}
+
+#endif
+
+/*----------------------------------------------------------------------------
+| Returns the number of leading 0 bits before the most-significant 1 bit of
+| `a'.  If `a' is zero, 32 is returned.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int countLeadingZeros32(Bit32u a)
+{
+    // Move the leading 1 bit of `a` into the top byte, tallying the bits skipped.
+    int zeros = 0;
+    if (a < 0x10000) {       // upper 16 bits are all zero
+        a <<= 16;
+        zeros = 16;
+    }
+    if (a < 0x1000000) {     // top byte is (still) zero
+        a <<= 8;
+        zeros += 8;
+    }
+    return zeros + countLeadingZeros8[a>>24]; // the table resolves the top byte
+}
+
+/*----------------------------------------------------------------------------
+| Returns the number of leading 0 bits before the most-significant 1 bit of
+| `a'.  If `a' is zero, 64 is returned.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int countLeadingZeros64(Bit64u a)
+{
+    // Count within whichever 32-bit half holds the most-significant 1 bit.
+    const Bit32u upperHalf = (Bit32u)(a>>32);
+    if (upperHalf != 0) {
+        return countLeadingZeros32(upperHalf);
+    }
+
+    // Upper half is empty: its 32 zero bits are added to the lower half's count
+    // (so a == 0 yields 32 + 32).
+    return 32 + countLeadingZeros32((Bit32u)(a));
+}
+
+#ifdef FLOATX80
+
+/*----------------------------------------------------------------------------
+| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
+| number of bits given in `count'.  Any bits shifted off are lost.  The value
+| of `count' can be arbitrarily large; in particular, if `count' is greater
+| than 128, the result will be 0.  The result is broken into two 64-bit pieces
+| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void shift128Right(Bit64u a0, Bit64u a1, int count, Bit64u *z0Ptr, Bit64u *z1Ptr)
+{
+    // Start from the "everything shifted out" result and fill in what survives.
+    Bit64u high = 0, low = 0;
+
+    if (count == 0) {
+        high = a0;
+        low = a1;
+    }
+    else if (count < 64) {
+        // bits leaving a0 enter the top of the low word; (-count) & 63 == 64 - count
+        high = a0>>count;
+        low = (a1>>count) | (a0<<((-count) & 63));
+    }
+    else if (count < 128) {
+        low = a0>>(count & 63);   // only part of a0 remains, now in the low word
+    }
+    *z1Ptr = low;
+    *z0Ptr = high;
+}
+
+/*----------------------------------------------------------------------------
+| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
+| number of bits given in `count'.  If any nonzero bits are shifted off, they
+| are ``jammed'' into the least significant bit of the result by setting the
+| least significant bit to 1.  The value of `count' can be arbitrarily large;
+| in particular, if `count' is greater than 128, the result will be either
+| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
+| nonzero.  The result is broken into two 64-bit pieces which are stored at
+| the locations pointed to by `z0Ptr' and `z1Ptr'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void shift128RightJamming(Bit64u a0, Bit64u a1, int count, Bit64u *z0Ptr, Bit64u *z1Ptr)
+{
+    /* `hi' stays zero for every shift of 64 bits or more. */
+    Bit64u hi = 0, lo;
+    const int carryShift = (-count) & 63;
+
+    if (count == 0) {
+        hi = a0;
+        lo = a1;
+    }
+    else if (count < 64) {
+        /* Sticky bit: anything shifted out of the bottom of a1. */
+        Bit64u sticky = ((a1 << carryShift) != 0);
+        hi = a0 >> count;
+        lo = (a0 << carryShift) | (a1 >> count) | sticky;
+    }
+    else if (count == 64) {
+        /* Exactly one word: a0 becomes the low word, all of a1 is jammed. */
+        lo = a0 | (a1 != 0);
+    }
+    else if (count < 128) {
+        /* Sticky bit: the discarded low part of a0 together with all of a1. */
+        Bit64u sticky = (((a0 << carryShift) | a1) != 0);
+        lo = (a0 >> (count & 63)) | sticky;
+    }
+    else {
+        /* Everything is shifted out; only the sticky bit survives. */
+        lo = ((a0 | a1) != 0);
+    }
+    *z1Ptr = lo;
+    *z0Ptr = hi;
+}
+
+/*----------------------------------------------------------------------------
+| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
+| number of bits given in `count'.  Any bits shifted off are lost.  The value
+| of `count' must be less than 64.  The result is broken into two 64-bit
+| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void shortShift128Left(Bit64u a0, Bit64u a1, int count, Bit64u *z0Ptr, Bit64u *z1Ptr)
+{
+    Bit64u hi = a0;
+    Bit64u lo = a1;
+
+    /* A zero count is a plain copy; treating it apart avoids the
+     * (a1 >> 64)-style shift the general formula would otherwise need. */
+    if (count != 0) {
+        hi = (a0 << count) | (a1 >> ((-count) & 63));
+        lo = a1 << count;
+    }
+    *z1Ptr = lo;
+    *z0Ptr = hi;
+}
+
+/*----------------------------------------------------------------------------
+| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
+| 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
+| modulo 2^192, so any carry out is lost.  The result is broken into three
+| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
+| `z1Ptr', and `z2Ptr'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void add192(
+     Bit64u a0,
+     Bit64u a1,
+     Bit64u a2,
+     Bit64u b0,
+     Bit64u b1,
+     Bit64u b2,
+     Bit64u *z0Ptr,
+     Bit64u *z1Ptr,
+     Bit64u *z2Ptr
+)
+{
+    /* Add limb by limb from least to most significant.  A carry is
+     * detected by the wrapped sum being smaller than an addend. */
+    Bit64u lo = a2 + b2;
+    Bit64u carryFromLo = (lo < a2);
+
+    Bit64u mid = a1 + b1;
+    Bit64u carryFromMid = (mid < a1);
+    mid += carryFromLo;
+    /* Absorbing the low carry may itself wrap the middle limb. */
+    carryFromMid += (mid < carryFromLo);
+
+    Bit64u hi = a0 + b0 + carryFromMid;
+
+    *z2Ptr = lo;
+    *z1Ptr = mid;
+    *z0Ptr = hi;
+}
+
+/*----------------------------------------------------------------------------
+| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
+| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
+| Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
+| result is broken into three 64-bit pieces which are stored at the locations
+| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void sub192(
+     Bit64u a0,
+     Bit64u a1,
+     Bit64u a2,
+     Bit64u b0,
+     Bit64u b1,
+     Bit64u b2,
+     Bit64u *z0Ptr,
+     Bit64u *z1Ptr,
+     Bit64u *z2Ptr
+)
+{
+    /* Per-limb differences first; borrows are resolved afterwards. */
+    Bit64u lo  = a2 - b2;
+    Bit64u mid = a1 - b1;
+    Bit64u hi  = a0 - b0;
+    const Bit64u borrowFromMid = (a2 < b2);    /* low limb underflowed */
+    const Bit64u borrowFromHi  = (a1 < b1);    /* middle limb underflowed */
+
+    /* Taking the low borrow out of `mid' underflows again when mid is
+     * smaller than that borrow, which costs the high limb one more. */
+    hi -= borrowFromHi + (mid < borrowFromMid);
+    mid -= borrowFromMid;
+
+    *z2Ptr = lo;
+    *z1Ptr = mid;
+    *z0Ptr = hi;
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
+| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
+| Otherwise, returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int eq128(Bit64u a0, Bit64u a1, Bit64u b0, Bit64u b1)
+{
+    /* Equal exactly when no bit differs in either 64-bit half. */
+    return ((a0 ^ b0) | (a1 ^ b1)) == 0;
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
+| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
+| Otherwise, returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int le128(Bit64u a0, Bit64u a1, Bit64u b0, Bit64u b1)
+{
+    /* The high words decide unless they tie; then the low words do. */
+    if (a0 != b0) {
+        return a0 < b0;
+    }
+    return a1 <= b1;
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
+| than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
+| returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int lt128(Bit64u a0, Bit64u a1, Bit64u b0, Bit64u b1)
+{
+    /* The high words decide unless they tie; then the low words do. */
+    if (a0 != b0) {
+        return a0 < b0;
+    }
+    return a1 < b1;
+}
+
+#endif	/* FLOATX80 */
+
+/*----------------------------------------------------------------------------
+| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
+| `b' to obtain a 192-bit product.  The product is broken into three 64-bit
+| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
+| `z2Ptr'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void mul128By64To192(
+     Bit64u a0,
+     Bit64u a1,
+     Bit64u b,
+     Bit64u *z0Ptr,
+     Bit64u *z1Ptr,
+     Bit64u *z2Ptr
+)
+{
+    Bit64u top, middle, bottom;     /* the three words of the product */
+    Bit64u topLow;                  /* second output word of a0 * b */
+
+    /* Two partial products: a1*b (low) and a0*b (weighted by 2^64). */
+    mul64To128(a1, b, &middle, &bottom);
+    mul64To128(a0, b, &top, &topLow);
+    /* Fold the first output word of a1*b into a0*b, one word up. */
+    add128(top, topLow, 0, middle, &top, &middle);
+
+    *z2Ptr = bottom;
+    *z1Ptr = middle;
+    *z0Ptr = top;
+}
+
+#ifdef FLOAT128
+
+/*----------------------------------------------------------------------------
+| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
+| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
+| product.  The product is broken into four 64-bit pieces which are stored at
+| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void mul128To256(
+     Bit64u a0,
+     Bit64u a1,
+     Bit64u b0,
+     Bit64u b1,
+     Bit64u *z0Ptr,
+     Bit64u *z1Ptr,
+     Bit64u *z2Ptr,
+     Bit64u *z3Ptr
+)
+{
+    Bit64u w0, w1, w2, w3;          /* product words, w0 most significant */
+    Bit64u scratchA, scratchB;      /* temporaries for partial products */
+
+    /* a1*b1: w3 is final, w2 keeps accumulating below. */
+    mul64To128(a1, b1, &w2, &w3);
+
+    /* a1*b0, folded in together with the current w2. */
+    mul64To128(a1, b0, &w1, &scratchB);
+    add128(w1, scratchB, 0, w2, &w1, &w2);
+
+    /* a0*b0, folded in together with the current w1. */
+    mul64To128(a0, b0, &w0, &scratchA);
+    add128(w0, scratchA, 0, w1, &w0, &w1);
+
+    /* a0*b1, the remaining cross product: first into w2, then the part
+     * that spills over is added into w0:w1. */
+    mul64To128(a0, b1, &scratchA, &scratchB);
+    add128(scratchA, scratchB, 0, w2, &scratchA, &w2);
+    add128(w0, w1, 0, scratchA, &w0, &w1);
+
+    *z3Ptr = w3;
+    *z2Ptr = w2;
+    *z1Ptr = w1;
+    *z0Ptr = w0;
+}
+
+
+/*----------------------------------------------------------------------------
+| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
+| by 64 _plus_ the number of bits given in `count'.  The shifted result is
+| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
+| stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
+| off form a third 64-bit result as follows:  The _last_ bit shifted off is
+| the most-significant bit of the extra result, and the other 63 bits of the
+| extra result are all zero if and only if _all_but_the_last_ bits shifted off
+| were all zero.  This extra result is stored in the location pointed to by
+| `z2Ptr'.  The value of `count' can be arbitrarily large.
+|     (This routine makes more sense if `a0', `a1', and `a2' are considered
+| to form a fixed-point value with binary point between `a1' and `a2'.  This
+| fixed-point value is shifted right by the number of bits given in `count',
+| and the integer part of the result is returned at the locations pointed to
+| by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
+| corrupted as described above, and is returned at the location pointed to by
+| `z2Ptr'.)
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void shift128ExtraRightJamming(
+     Bit64u a0,
+     Bit64u a1,
+     Bit64u a2,
+     int count,
+     Bit64u *z0Ptr,
+     Bit64u *z1Ptr,
+     Bit64u *z2Ptr
+)
+{
+    const int carryShift = (-count) & 63;
+    Bit64u hi = 0, mid, extra;
+    Bit64u below = a2;      /* everything that ends up beneath `extra' */
+
+    /* No shift: pass all three words through, with no jamming at all. */
+    if (count == 0) {
+        *z2Ptr = a2;
+        *z1Ptr = a1;
+        *z0Ptr = a0;
+        return;
+    }
+
+    if (count < 64) {
+        /* The bits leaving a1 become the extra word. */
+        extra = a1 << carryShift;
+        mid = (a0 << carryShift) | (a1 >> count);
+        hi = a0 >> count;
+    }
+    else if (count == 64) {
+        /* Whole-word move: a1 is the extra word, a0 the low result word. */
+        extra = a1;
+        mid = a0;
+    }
+    else {
+        /* a1 now lies entirely beneath the extra word. */
+        below |= a1;
+        if (count < 128) {
+            extra = a0 << carryShift;
+            mid = a0 >> (count & 63);
+        }
+        else if (count == 128) {
+            extra = a0;
+            mid = 0;
+        }
+        else {
+            /* a0 is past the extra word too; only its stickiness is kept. */
+            extra = (a0 != 0);
+            mid = 0;
+        }
+    }
+    /* Jam anything lost beneath the extra word into its lowest bit. */
+    extra |= (below != 0);
+
+    *z2Ptr = extra;
+    *z1Ptr = mid;
+    *z0Ptr = hi;
+}
+
+#endif  /* FLOAT128 */
+
+#endif
--- a/src/cpu/softfloat/softfloat-muladd.cc
+++ b/src/cpu/softfloat/softfloat-muladd.cc
@@ -0,0 +1,558 @@
+/*============================================================================
+This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+
+Written by John R. Hauser.  This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704.  Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.  The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * This code is based on QEMU patch by Peter Maydell
+ * Adapted for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#include "softfloat.h"
+#include "softfloat-round-pack.h"
+
+/*----------------------------------------------------------------------------
+| Primitive arithmetic functions, including multi-word arithmetic, and
+| division and square root approximations. (Can be specialized to target
+| if desired).
+*----------------------------------------------------------------------------*/
+#include "softfloat-macros.h"
+
+/*----------------------------------------------------------------------------
+| Functions and definitions to determine:  (1) whether tininess for underflow
+| is detected before or after rounding by default, (2) what (if anything)
+| happens when exceptions are raised, (3) how signaling NaNs are distinguished
+| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
+| are propagated from function inputs to output.  These details are target-
+| specific.
+*----------------------------------------------------------------------------*/
+#include "softfloat-specialize.h"
+
+/*----------------------------------------------------------------------------
+| Takes three single-precision floating-point values `a', `b' and `c', one of
+| which is a NaN, and returns the appropriate NaN result.  If any of  `a',
+| `b' or `c' is a signaling NaN, the invalid exception is raised.
+| This routine takes no `infzero' input: float32_muladd() tests for NaN
+| operands before it tests for 0*inf or inf*0, so a NaN operand is propagated
+| by this routine even when a*b is such a product.
+*----------------------------------------------------------------------------*/
+
+static float32 propagateFloat32MulAddNaN(float32 a, float32 b, float32 c, struct float_status_t *status)
+{
+    const int aNaN = float32_is_nan(a);
+    const int bNaN = float32_is_nan(b);
+
+    const int aSignaling = float32_is_signaling_nan(a);
+    const int bSignaling = float32_is_signaling_nan(b);
+    const int cSignaling = float32_is_signaling_nan(c);
+
+    /* A signaling NaN in any operand raises the invalid exception. */
+    if (aSignaling || bSignaling || cSignaling) {
+        float_raise(status, float_flag_invalid);
+    }
+
+    /* First-operand-NaN priority: a, then b, otherwise c.  Bit 22 is set
+     * in whichever operand is returned. */
+    if (aSignaling || aNaN) {
+        return a | 0x00400000;
+    }
+    if (bSignaling || bNaN) {
+        return b | 0x00400000;
+    }
+    return c | 0x00400000;
+}
+
+/*----------------------------------------------------------------------------
+| Takes three double-precision floating-point values `a', `b' and `c', one of
+| which is a NaN, and returns the appropriate NaN result.  If any of  `a',
+| `b' or `c' is a signaling NaN, the invalid exception is raised.
+| This routine takes no `infzero' input: float64_muladd() tests for NaN
+| operands before it tests for 0*inf or inf*0, so a NaN operand is propagated
+| by this routine even when a*b is such a product.
+*----------------------------------------------------------------------------*/
+
+static float64 propagateFloat64MulAddNaN(float64 a, float64 b, float64 c, struct float_status_t *status)
+{
+    const int aNaN = float64_is_nan(a);
+    const int bNaN = float64_is_nan(b);
+
+    const int aSignaling = float64_is_signaling_nan(a);
+    const int bSignaling = float64_is_signaling_nan(b);
+    const int cSignaling = float64_is_signaling_nan(c);
+
+    /* A signaling NaN in any operand raises the invalid exception. */
+    if (aSignaling || bSignaling || cSignaling) {
+        float_raise(status, float_flag_invalid);
+    }
+
+    /* First-operand-NaN priority: a, then b, otherwise c.  Bit 51 is set
+     * in whichever operand is returned. */
+    if (aSignaling || aNaN) {
+        return a | BX_CONST64(0x0008000000000000);
+    }
+    if (bSignaling || bNaN) {
+        return b | BX_CONST64(0x0008000000000000);
+    }
+    return c | BX_CONST64(0x0008000000000000);
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of multiplying the single-precision floating-point values
+| `a' and `b' then adding 'c', with no intermediate rounding step after the
+| multiplication.  The operation is performed according to the IEC/IEEE
+| Standard for Binary Floating-Point Arithmetic 754-2008.
+| The flags argument allows the caller to select negation of the
+| addend, the intermediate product, or the final result. (The difference
+| between this and having the caller do a separate negation is that negating
+| externally will flip the sign bit on NaNs.)
+*----------------------------------------------------------------------------*/
+
+float32 float32_muladd(float32 a, float32 b, float32 c, int flags, struct float_status_t *status)
+{
+    /* Computes (a * b) + c with one rounding at the end.  `flags' may
+     * contain float_muladd_negate_c and/or float_muladd_negate_product;
+     * exceptions are reported through float_raise() on `status'.
+     */
+    int aSign, bSign, cSign, zSign;
+    Bit16s aExp, bExp, cExp, pExp, zExp;
+    Bit32u aSig, bSig, cSig;
+    int pInf, pZero, pSign;
+    Bit64u pSig64, cSig64, zSig64;
+    Bit32u pSig;
+    int shiftcount;
+
+    /* Split each operand into fraction, biased exponent and sign. */
+    aSig = extractFloat32Frac(a);
+    aExp = extractFloat32Exp(a);
+    aSign = extractFloat32Sign(a);
+    bSig = extractFloat32Frac(b);
+    bExp = extractFloat32Exp(b);
+    bSign = extractFloat32Sign(b);
+    cSig = extractFloat32Frac(c);
+    cExp = extractFloat32Exp(c);
+    cSign = extractFloat32Sign(c);
+
+    /* Any NaN operand is handled first, before the 0*inf test further
+     * down: the NaN routine picks the operand to return and raises
+     * invalid for signaling NaNs.  Consequently (0,inf,qnan) and
+     * (inf,0,qnan) propagate the NaN rather than producing the default
+     * NaN.
+     */
+    if (((aExp == 0xff) && aSig) ||
+        ((bExp == 0xff) && bSig) ||
+        ((cExp == 0xff) && cSig)) {
+        return propagateFloat32MulAddNaN(a, b, c, status);
+    }
+
+    /* When denormals-are-zeros is in effect, operands with a zero exponent
+     * field have their fraction cleared so they are treated as zeros. */
+    if (get_denormals_are_zeros(status)) {
+        if (aExp == 0) aSig = 0;
+        if (bExp == 0) bSig = 0;
+        if (cExp == 0) cSig = 0;
+    }
+
+    /* 0 * inf in either order: invalid operation, default NaN result. */
+    int infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
+                   (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
+
+    if (infzero) {
+        float_raise(status, float_flag_invalid);
+        return float32_default_nan;
+    }
+
+    if (flags & float_muladd_negate_c) {
+        cSign ^= 1;
+    }
+
+    /* Work out the sign and type of the product */
+    pSign = aSign ^ bSign;
+    if (flags & float_muladd_negate_product) {
+        pSign ^= 1;
+    }
+    pInf = (aExp == 0xff) || (bExp == 0xff);
+    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
+
+    /* c is an infinity (NaNs were dealt with above). */
+    if (cExp == 0xff) {
+        if (pInf && (pSign ^ cSign)) {
+            /* addition of opposite-signed infinities => InvalidOperation */
+            float_raise(status, float_flag_invalid);
+            return float32_default_nan;
+        }
+        /* Otherwise generate an infinity of the same sign */
+        if ((aSig && aExp == 0) || (bSig && bExp == 0)) {
+            float_raise(status, float_flag_denormal);
+        }
+        return packFloat32(cSign, 0xff, 0);
+    }
+
+    /* Infinite product with a finite c: the result is the product's
+     * infinity; any denormal operand is still reported. */
+    if (pInf) {
+        if ((aSig && aExp == 0) || (bSig && bExp == 0) || (cSig && cExp == 0)) {
+            float_raise(status, float_flag_denormal);
+        }
+        return packFloat32(pSign, 0xff, 0);
+    }
+
+    if (pZero) {
+        if (cExp == 0) {
+            if (cSig == 0) {
+                /* Adding two exact zeroes */
+                if (pSign == cSign) {
+                    zSign = pSign;
+                } else if (get_float_rounding_mode(status) == float_round_down) {
+                    zSign = 1;
+                } else {
+                    zSign = 0;
+                }
+                return packFloat32(zSign, 0, 0);
+            }
+            /* Exact zero plus a denormal */
+            float_raise(status, float_flag_denormal);
+            if (get_flush_underflow_to_zero(status)) {
+                float_raise(status, float_flag_underflow | float_flag_inexact);
+                return packFloat32(cSign, 0, 0);
+            }
+        }
+        /* Zero plus something non-zero */
+        return packFloat32(cSign, cExp, cSig);
+    }
+
+    /* Both factors are finite and nonzero here; a zero exponent field
+     * therefore means a subnormal, which is reported and normalized. */
+    if (aExp == 0) {
+        float_raise(status, float_flag_denormal);
+        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
+    }
+    if (bExp == 0) {
+        float_raise(status, float_flag_denormal);
+        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
+    }
+
+    /* Calculate the actual result a * b + c */
+
+    /* Multiply first; this is easy. */
+    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
+     * because we want the true exponent, not the "one-less-than"
+     * flavour that roundAndPackFloat32() takes.
+     */
+    pExp = aExp + bExp - 0x7e;
+    /* With the leading 1 at bit 23 made explicit, the shifts put aSig's
+     * leading bit at bit 30 and bSig's at bit 31, so the 64-bit product
+     * has its leading bit at position 61 or 62.  If bit 62 is clear the
+     * product is shifted up one place and the exponent compensated.
+     */
+    aSig = (aSig | 0x00800000) << 7;
+    bSig = (bSig | 0x00800000) << 8;
+    pSig64 = (Bit64u)aSig * bSig;
+    if ((Bit64s)(pSig64 << 1) >= 0) {
+        pSig64 <<= 1;
+        pExp--;
+    }
+
+    zSign = pSign;
+
+    /* Now pSig64 is the significand of the multiply, with the explicit bit in
+     * position 62.
+     */
+    if (cExp == 0) {
+        if (!cSig) {
+            /* Throw out the special case of c being an exact zero now */
+            pSig = (Bit32u) shift64RightJamming(pSig64, 32);
+            return roundAndPackFloat32(zSign, pExp - 1, pSig, status);
+        }
+        float_raise(status, float_flag_denormal);
+        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
+    }
+
+    /* Position c's significand like the product's: fraction shifted up
+     * by 39 with the explicit bit at position 62. */
+    cSig64 = (Bit64u)cSig << 39;
+    cSig64 |= BX_CONST64(0x4000000000000000);
+    int expDiff = pExp - cExp;
+
+    if (pSign == cSign) {
+        /* Addition */
+        if (expDiff > 0) {
+            /* scale c to match p */
+            cSig64 = shift64RightJamming(cSig64, expDiff);
+            zExp = pExp;
+        } else if (expDiff < 0) {
+            /* scale p to match c */
+            pSig64 = shift64RightJamming(pSig64, -expDiff);
+            zExp = cExp;
+        } else {
+            /* no scaling needed */
+            zExp = cExp;
+        }
+        /* Add significands and make sure explicit bit ends up in posn 62 */
+        zSig64 = pSig64 + cSig64;
+        if ((Bit64s)zSig64 < 0) {
+            zSig64 = shift64RightJamming(zSig64, 1);
+        } else {
+            zExp--;
+        }
+        /* Reduce to 32 bits, folding the discarded low half into a sticky
+         * bit, before handing over to roundAndPackFloat32(). */
+        zSig64 = shift64RightJamming(zSig64, 32);
+        return roundAndPackFloat32(zSign, zExp, zSig64, status);
+    } else {
+        /* Subtraction */
+        /* The larger magnitude is always the minuend; when that is c the
+         * result takes the opposite sign to the product. */
+        if (expDiff > 0) {
+            cSig64 = shift64RightJamming(cSig64, expDiff);
+            zSig64 = pSig64 - cSig64;
+            zExp = pExp;
+        } else if (expDiff < 0) {
+            pSig64 = shift64RightJamming(pSig64, -expDiff);
+            zSig64 = cSig64 - pSig64;
+            zExp = cExp;
+            zSign ^= 1;
+        } else {
+            zExp = pExp;
+            if (cSig64 < pSig64) {
+                zSig64 = pSig64 - cSig64;
+            } else if (pSig64 < cSig64) {
+                zSig64 = cSig64 - pSig64;
+                zSign ^= 1;
+            } else {
+                /* Exact zero */
+                return packFloat32(get_float_rounding_mode(status) == float_round_down, 0, 0);
+            }
+        }
+        --zExp;
+        /* Do the equivalent of normalizeRoundAndPackFloat32() but
+         * starting with the significand in a Bit64u.
+         */
+        /* zSig64 is nonzero with bit 63 clear (both inputs had it clear),
+         * so the leading-zero count is at least 1 and shiftcount >= 0. */
+        shiftcount = countLeadingZeros64(zSig64) - 1;
+        zSig64 <<= shiftcount;
+        zExp -= shiftcount;
+        zSig64 = shift64RightJamming(zSig64, 32);
+        return roundAndPackFloat32(zSign, zExp, zSig64, status);
+    }
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of multiplying the double-precision floating-point values
+| `a' and `b' then adding 'c', with no intermediate rounding step after the
+| multiplication.  The operation is performed according to the IEC/IEEE
+| Standard for Binary Floating-Point Arithmetic 754-2008.
+| The flags argument allows the caller to select negation of the
+| addend, the intermediate product, or the final result. (The difference
+| between this and having the caller do a separate negation is that negating
+| externally will flip the sign bit on NaNs.)
+*----------------------------------------------------------------------------*/
+
+float64 float64_muladd(float64 a, float64 b, float64 c, int flags, struct float_status_t *status)
+{
+    /* Computes (a * b) + c with one rounding at the end.  `flags' may
+     * contain float_muladd_negate_c and/or float_muladd_negate_product;
+     * exceptions are reported through float_raise() on `status'.
+     */
+    int aSign, bSign, cSign, zSign;
+    Bit16s aExp, bExp, cExp, pExp, zExp;
+    Bit64u aSig, bSig, cSig;
+    int pInf, pZero, pSign;
+    Bit64u pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
+    int shiftcount;
+
+    /* Split each operand into fraction, biased exponent and sign. */
+    aSig = extractFloat64Frac(a);
+    aExp = extractFloat64Exp(a);
+    aSign = extractFloat64Sign(a);
+    bSig = extractFloat64Frac(b);
+    bExp = extractFloat64Exp(b);
+    bSign = extractFloat64Sign(b);
+    cSig = extractFloat64Frac(c);
+    cExp = extractFloat64Exp(c);
+    cSign = extractFloat64Sign(c);
+
+    /* Any NaN operand is handled first, before the 0*inf test further
+     * down: the NaN routine picks the operand to return and raises
+     * invalid for signaling NaNs.  Consequently (0,inf,qnan) and
+     * (inf,0,qnan) propagate the NaN rather than producing the default
+     * NaN.
+     */
+    if (((aExp == 0x7ff) && aSig) ||
+        ((bExp == 0x7ff) && bSig) ||
+        ((cExp == 0x7ff) && cSig)) {
+        return propagateFloat64MulAddNaN(a, b, c, status);
+    }
+
+    /* When denormals-are-zeros is in effect, operands with a zero exponent
+     * field have their fraction cleared so they are treated as zeros. */
+    if (get_denormals_are_zeros(status)) {
+        if (aExp == 0) aSig = 0;
+        if (bExp == 0) bSig = 0;
+        if (cExp == 0) cSig = 0;
+    }
+
+    /* 0 * inf in either order: invalid operation, default NaN result. */
+    int infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
+                   (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
+
+    if (infzero) {
+        float_raise(status, float_flag_invalid);
+        return float64_default_nan;
+    }
+
+    if (flags & float_muladd_negate_c) {
+        cSign ^= 1;
+    }
+
+    /* Work out the sign and type of the product */
+    pSign = aSign ^ bSign;
+    if (flags & float_muladd_negate_product) {
+        pSign ^= 1;
+    }
+    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
+    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
+
+    /* c is an infinity (NaNs were dealt with above). */
+    if (cExp == 0x7ff) {
+        if (pInf && (pSign ^ cSign)) {
+            /* addition of opposite-signed infinities => InvalidOperation */
+            float_raise(status, float_flag_invalid);
+            return float64_default_nan;
+        }
+        /* Otherwise generate an infinity of the same sign */
+        if ((aSig && aExp == 0) || (bSig && bExp == 0)) {
+            float_raise(status, float_flag_denormal);
+        }
+        return packFloat64(cSign, 0x7ff, 0);
+    }
+
+    /* Infinite product with a finite c: the result is the product's
+     * infinity; any denormal operand is still reported. */
+    if (pInf) {
+        if ((aSig && aExp == 0) || (bSig && bExp == 0) || (cSig && cExp == 0)) {
+            float_raise(status, float_flag_denormal);
+        }
+        return packFloat64(pSign, 0x7ff, 0);
+    }
+
+    if (pZero) {
+        if (cExp == 0) {
+            if (cSig == 0) {
+                /* Adding two exact zeroes */
+                if (pSign == cSign) {
+                    zSign = pSign;
+                } else if (get_float_rounding_mode(status) == float_round_down) {
+                    zSign = 1;
+                } else {
+                    zSign = 0;
+                }
+                return packFloat64(zSign, 0, 0);
+            }
+            /* Exact zero plus a denormal */
+            float_raise(status, float_flag_denormal);
+            if (get_flush_underflow_to_zero(status)) {
+                float_raise(status, float_flag_underflow | float_flag_inexact);
+                return packFloat64(cSign, 0, 0);
+            }
+        }
+        /* Zero plus something non-zero */
+        return packFloat64(cSign, cExp, cSig);
+    }
+
+    /* Both factors are finite and nonzero here; a zero exponent field
+     * therefore means a subnormal, which is reported and normalized. */
+    if (aExp == 0) {
+        float_raise(status, float_flag_denormal);
+        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
+    }
+    if (bExp == 0) {
+        float_raise(status, float_flag_denormal);
+        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
+    }
+
+    /* Calculate the actual result a * b + c */
+
+    /* Multiply first; this is easy. */
+    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
+     * because we want the true exponent, not the "one-less-than"
+     * flavour that roundAndPackFloat64() takes.
+     */
+    pExp = aExp + bExp - 0x3fe;
+    /* With the leading 1 at bit 52 made explicit, the shifts put aSig's
+     * leading bit at bit 62 and bSig's at bit 63, so the 128-bit product
+     * has its leading bit at position 125 or 126.  If bit 126 is clear
+     * the product is shifted up one place and the exponent compensated.
+     */
+    aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
+    bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
+    mul64To128(aSig, bSig, &pSig0, &pSig1);
+    if ((Bit64s)(pSig0 << 1) >= 0) {
+        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
+        pExp--;
+    }
+
+    zSign = pSign;
+
+    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
+     * bit in position 126.
+     */
+    if (cExp == 0) {
+        if (!cSig) {
+            /* Throw out the special case of c being an exact zero now */
+            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
+            return roundAndPackFloat64(zSign, pExp - 1, pSig1, status);
+        }
+        float_raise(status, float_flag_denormal);
+        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
+    }
+
+    /* Position c's significand like the product's: [cSig0:cSig1] with the
+     * explicit bit at position 126 (bit 62 of cSig0) and a zero low word. */
+    cSig0 = cSig << 10;
+    cSig1 = 0;
+    cSig0 |= BX_CONST64(0x4000000000000000);
+    int expDiff = pExp - cExp;
+
+    if (pSign == cSign) {
+        /* Addition */
+        if (expDiff > 0) {
+            /* scale c to match p */
+            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
+            zExp = pExp;
+        } else if (expDiff < 0) {
+            /* scale p to match c */
+            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
+            zExp = cExp;
+        } else {
+            /* no scaling needed */
+            zExp = cExp;
+        }
+        /* Add significands and make sure explicit bit ends up in posn 126 */
+        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
+        if ((Bit64s)zSig0 < 0) {
+            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
+        } else {
+            zExp--;
+        }
+        /* Reduce to 64 bits, folding the discarded low word into a sticky
+         * bit, before handing over to roundAndPackFloat64(). */
+        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
+        return roundAndPackFloat64(zSign, zExp, zSig1, status);
+    } else {
+        /* Subtraction */
+        /* The larger magnitude is always the minuend; when that is c the
+         * result takes the opposite sign to the product. */
+        if (expDiff > 0) {
+            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
+            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
+            zExp = pExp;
+        } else if (expDiff < 0) {
+            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
+            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
+            zExp = cExp;
+            zSign ^= 1;
+        } else {
+            zExp = pExp;
+            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
+                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
+            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
+                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
+                zSign ^= 1;
+            } else {
+                /* Exact zero */
+                return packFloat64(get_float_rounding_mode(status) == float_round_down, 0, 0);
+            }
+        }
+        --zExp;
+        /* Do the equivalent of normalizeRoundAndPackFloat64() but
+         * starting with the significand in a pair of Bit64u.
+         */
+        if (zSig0) {
+            /* Bit 63 of zSig0 is clear (both inputs had it clear), so the
+             * leading-zero count is at least 1 and shiftcount >= 0. */
+            shiftcount = countLeadingZeros64(zSig0) - 1;
+            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
+            if (zSig1) {
+                /* keep a sticky bit for whatever remains in the low word */
+                zSig0 |= 1;
+            }
+            zExp -= shiftcount;
+        } else {
+            /* The whole difference lives in the low word. */
+            shiftcount = countLeadingZeros64(zSig1);
+            if (shiftcount == 0) {
+                /* Bit 63 of zSig1 is set, so the leading bit has to move
+                 * DOWN one place to reach bit 62.  The former
+                 * `zSig1 << (shiftcount - 1)' was a shift by -1 here
+                 * (undefined behaviour).  Shift right by one instead,
+                 * jamming the lost bit, for a net left shift of 63.
+                 */
+                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
+                zExp -= 63;
+            } else {
+                shiftcount--;
+                zSig0 = zSig1 << shiftcount;
+                zExp -= (shiftcount + 64);
+            }
+        }
+        return roundAndPackFloat64(zSign, zExp, zSig0, status);
+    }
+}
--- a/src/cpu/softfloat/softfloat-round-pack.cc
+++ b/src/cpu/softfloat/softfloat-round-pack.cc
@@ -0,0 +1,896 @@
+/*============================================================================
+This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+
+Written by John R. Hauser.  This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704.  Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.  The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+#define FLOAT128
+
+/*============================================================================
+ * Adapted for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#include "softfloat.h"
+#include "softfloat-round-pack.h"
+
+/*----------------------------------------------------------------------------
+| Primitive arithmetic functions, including multi-word arithmetic, and
+| division and square root approximations. (Can be specialized to target
+| if desired).
+*----------------------------------------------------------------------------*/
+#include "softfloat-macros.h"
+
+/*----------------------------------------------------------------------------
+| Functions and definitions to determine:  (1) whether tininess for underflow
+| is detected before or after rounding by default, (2) what (if anything)
+| happens when exceptions are raised, (3) how signaling NaNs are distinguished
+| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
+| are propagated from function inputs to output.  These details are target-
+| specific.
+*----------------------------------------------------------------------------*/
+#include "softfloat-specialize.h"
+
+/*----------------------------------------------------------------------------
+| Takes a 64-bit fixed-point value `exactAbsZ' with binary point between bits
+| 6 and 7, and returns the properly rounded 32-bit integer corresponding to
+| the input.  If `zSign' is 1, the input is negated before being converted to
+| an integer.  Bit 63 of `exactAbsZ' must be zero.  Ordinarily, the fixed-point
+| input is simply rounded to an integer, with the inexact exception raised if
+| the input cannot be represented exactly as an integer.  However, if the
+| fixed-point input is too large, the invalid exception is raised and the
+| integer indefinite value is returned.
+*----------------------------------------------------------------------------*/
+
+Bit32s roundAndPackInt32(int zSign, Bit64u exactAbsZ, struct float_status_t *status)
+{
+    const int mode = get_float_rounding_mode(status);
+    const int nearestEven = (mode == float_round_nearest_even);
+
+    /* Value added below the binary point (7 fraction bits) before the
+     * fraction is shifted out. */
+    int bump;
+    if (nearestEven) {
+        bump = 0x40;    /* one half of an integer step */
+    } else if (mode == float_round_to_zero) {
+        bump = 0;
+    } else if (mode == (zSign ? float_round_up : float_round_down)) {
+        bump = 0;       /* directed mode that points toward zero for this sign */
+    } else {
+        bump = 0x7F;    /* any non-zero fraction pushes the magnitude up */
+    }
+
+    const int fraction = (int)(exactAbsZ & 0x7F);
+    Bit64u magnitude = (exactAbsZ + bump) >> 7;
+    if (nearestEven && (fraction == 0x40)) {
+        /* Exact tie: clear the low bit so the result is even. */
+        magnitude &= ~(Bit64u) 1;
+    }
+
+    Bit32s z = (Bit32s) magnitude;
+    if (zSign) {
+        z = -z;
+    }
+
+    /* Not representable: the magnitude needs more than 32 bits, or the
+     * signed result does not carry the requested sign. */
+    const int signMismatch = z && ((z < 0) ^ zSign);
+    if ((magnitude >> 32) || signMismatch) {
+        float_raise(status, float_flag_invalid);
+        return (Bit32s)(int32_indefinite);
+    }
+
+    if (fraction) {
+        float_raise(status, float_flag_inexact);
+        /* Report whether rounding increased the magnitude. */
+        if ((magnitude << 7) > exactAbsZ) {
+            set_float_rounding_up(status);
+        }
+    }
+    return z;
+}
+
+/*----------------------------------------------------------------------------
+| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
+| `absZ1', with binary point between bits 63 and 64 (between the input words),
+| and returns the properly rounded 64-bit integer corresponding to the input.
+| If `zSign' is 1, the input is negated before being converted to an integer.
+| Ordinarily, the fixed-point input is simply rounded to an integer, with
+| the inexact exception raised if the input cannot be represented exactly as
+| an integer.  However, if the fixed-point input is too large, the invalid
+| exception is raised and the integer indefinite value is returned.
+*----------------------------------------------------------------------------*/
+
+Bit64s roundAndPackInt64(int zSign, Bit64u absZ0, Bit64u absZ1, struct float_status_t *status)
+{
+    const int mode = get_float_rounding_mode(status);
+    const int nearestEven = (mode == float_round_nearest_even);
+    const Bit64u truncated = absZ0;     /* integer part before rounding */
+
+    /* absZ1 holds the fraction; its top bit is the one-half position. */
+    int roundUp;
+    if (nearestEven) {
+        roundUp = ((Bit64s) absZ1 < 0);
+    } else if (mode == float_round_to_zero) {
+        roundUp = 0;
+    } else {
+        /* Directed rounding increments only when it moves away from zero
+         * for this sign and the fraction is non-zero. */
+        roundUp = (mode == (zSign ? float_round_down : float_round_up)) && absZ1;
+    }
+
+    if (roundUp) {
+        ++absZ0;
+        if (absZ0 == 0) {
+            /* The increment carried out of the 64-bit integer part. */
+            float_raise(status, float_flag_invalid);
+            return (Bit64s)(int64_indefinite);
+        }
+        if (nearestEven && ((Bit64u) (absZ1 << 1) == 0)) {
+            /* Fraction was exactly one half: keep the even integer. */
+            absZ0 &= ~(Bit64u) 1;
+        }
+    }
+
+    Bit64s z = absZ0;
+    if (zSign) {
+        z = -z;
+    }
+    /* A non-zero result whose sign disagrees with zSign did not fit. */
+    if (z && ((z < 0) ^ zSign)) {
+        float_raise(status, float_flag_invalid);
+        return (Bit64s)(int64_indefinite);
+    }
+
+    if (absZ1) {
+        float_raise(status, float_flag_inexact);
+        if (absZ0 > truncated) {
+            set_float_rounding_up(status);
+        }
+    }
+    return z;
+}
+
+/*----------------------------------------------------------------------------
+| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
+| `absZ1', with binary point between bits 63 and 64 (between the input words),
+| and returns the properly rounded 64-bit unsigned integer corresponding to the
+| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
+| with the inexact exception raised if the input cannot be represented exactly
+| as an integer. However, if the fixed-point input is too large, the invalid
+| exception is raised and the largest unsigned integer is returned.
+*----------------------------------------------------------------------------*/
+
+Bit64u roundAndPackUint64(int zSign, Bit64u absZ0, Bit64u absZ1, struct float_status_t *status)
+{
+    const int mode = get_float_rounding_mode(status);
+    const int nearestEven = (mode == float_round_nearest_even);
+
+    /* absZ1 holds the fraction; its top bit is the one-half position. */
+    int roundUp;
+    if (nearestEven) {
+        roundUp = ((Bit64s) absZ1 < 0);
+    } else if (mode == float_round_to_zero) {
+        roundUp = 0;
+    } else {
+        /* Directed rounding: increment only for a non-zero fraction when
+         * the mode moves away from zero for this sign. */
+        roundUp = absZ1 && (mode == (zSign ? float_round_down : float_round_up));
+    }
+
+    if (roundUp) {
+        ++absZ0;
+        if (absZ0 == 0) {
+            /* The increment wrapped past the largest 64-bit value. */
+            float_raise(status, float_flag_invalid);
+            return uint64_indefinite;
+        }
+        if (nearestEven && ((Bit64u) (absZ1 << 1) == 0)) {
+            /* Fraction was exactly one half: keep the even integer. */
+            absZ0 &= ~(Bit64u) 1;
+        }
+    }
+
+    /* A negative input is only acceptable when it rounds to zero. */
+    if (zSign && absZ0) {
+        float_raise(status, float_flag_invalid);
+        return uint64_indefinite;
+    }
+
+    if (absZ1) {
+        float_raise(status, float_flag_inexact);
+    }
+    return absZ0;
+}
+
+#ifdef FLOAT16
+
+/*----------------------------------------------------------------------------
+| Normalizes the subnormal half-precision floating-point value represented
+| by the denormalized significand `aSig'.  The normalized exponent and
+| significand are stored at the locations pointed to by `zExpPtr' and
+| `zSigPtr', respectively.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloat16Subnormal(Bit16u aSig, Bit16s *zExpPtr, Bit16u *zSigPtr)
+{
+    /* Left shift that leaves exactly 5 zero bits above the leading one. */
+    const int normShift = countLeadingZeros16(aSig) - 5;
+
+    *zSigPtr = (Bit16u) (aSig << normShift);
+    /* Each position shifted lowers the exponent by one, starting from 1. */
+    *zExpPtr = (Bit16s) (1 - normShift);
+}
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper half-precision floating-
+| point value corresponding to the abstract input.  Ordinarily, the abstract
+| value is simply rounded and packed into the half-precision format, with
+| the inexact exception raised if the abstract input cannot be represented
+| exactly.  However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned.  If the abstract value is too small, the input value is rounded to
+| a subnormal number, and the underflow and inexact exceptions are raised if
+| the abstract input cannot be represented exactly as a subnormal half-
+| precision floating-point number.
+|     The input significand `zSig' has its binary point between bits 14
+| and 13, which is 4 bits to the left of the usual location.  This shifted
+| significand must be normalized or smaller.  If `zSig' is not normalized,
+| `zExp' must be 0; in that case, the result returned is a subnormal number,
+| and it must not require rounding.  In the usual case that `zSig' is
+| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
+| The handling of underflow and overflow follows the IEC/IEEE Standard for
+| Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float16 roundAndPackFloat16(int zSign, Bit16s zExp, Bit16u zSig, struct float_status_t *status)
+{
+    Bit16s roundIncrement, roundBits, roundMask;
+
+    int roundingMode = get_float_rounding_mode(status);
+    int roundNearestEven = (roundingMode == float_round_nearest_even);
+    /* zSig carries 4 bits below the result's LSB: roundMask selects them and
+     * 8 is exactly half of the lowest retained bit (0x10). */
+    roundIncrement = 8;
+    roundMask = 0xF;
+
+    if (! roundNearestEven) {
+        if (roundingMode == float_round_to_zero) roundIncrement = 0;
+        else {
+            /* Directed rounding: any non-zero discarded bits round away from
+             * zero, unless the mode points toward zero for this sign. */
+            roundIncrement = roundMask;
+            if (zSign) {
+                if (roundingMode == float_round_up) roundIncrement = 0;
+            }
+            else {
+                if (roundingMode == float_round_down) roundIncrement = 0;
+            }
+        }
+    }
+    roundBits = zSig & roundMask;
+    /* The unsigned compare catches both large exponents and negative ones. */
+    if (0x1D <= (Bit16u) zExp) {
+        if ((0x1D < zExp)
+             || ((zExp == 0x1D) && ((Bit16s) (zSig + roundIncrement) < 0)))
+        {
+            float_raise(status, float_flag_overflow);
+            if (roundBits || float_exception_masked(status, float_flag_overflow)) {
+                float_raise(status, float_flag_inexact);
+            }
+            /* Infinity encoding, or one less than it (the largest finite
+             * encoding) when the rounding increment is zero. */
+            return packFloat16(zSign, 0x1F, 0) - (roundIncrement == 0);
+        }
+        if (zExp < 0) {
+            int isTiny = (zExp < -1) || (zSig + roundIncrement < 0x8000);
+            zSig = shift16RightJamming(zSig, -zExp);
+            zExp = 0;
+            roundBits = zSig & roundMask;
+            if (isTiny) {
+                if(get_flush_underflow_to_zero(status)) {
+                    float_raise(status, float_flag_underflow | float_flag_inexact);
+                    return packFloat16(zSign, 0, 0);
+                }
+                // signal the #P according to roundBits calculated AFTER denormalization
+                if (roundBits || !float_exception_masked(status, float_flag_underflow)) {
+                    float_raise(status, float_flag_underflow);
+                }
+            }
+        }
+    }
+    if (roundBits) float_raise(status, float_flag_inexact);
+    Bit16u zSigRound = ((zSig + roundIncrement) & ~roundMask) >> 4;
+    /* Ties-to-even: an exact tie is roundBits == 8 (the half-way pattern of
+     * the 4 discarded bits).  The previous comparison against 0x10 could
+     * never match because roundBits is at most 0xF, so ties always rounded
+     * up instead of to the even neighbour. */
+    zSigRound &= ~(((roundBits ^ 0x8) == 0) & roundNearestEven);
+    if (zSigRound == 0) zExp = 0;
+    return packFloat16(zSign, zExp, zSigRound);
+}
+
+#endif
+
+/*----------------------------------------------------------------------------
+| Normalizes the subnormal single-precision floating-point value represented
+| by the denormalized significand `aSig'.  The normalized exponent and
+| significand are stored at the locations pointed to by `zExpPtr' and
+| `zSigPtr', respectively.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloat32Subnormal(Bit32u aSig, Bit16s *zExpPtr, Bit32u *zSigPtr)
+{
+    /* Left shift that leaves exactly 8 zero bits above the leading one. */
+    const int normShift = countLeadingZeros32(aSig) - 8;
+
+    *zSigPtr = aSig << normShift;
+    /* Each position shifted lowers the exponent by one, starting from 1. */
+    *zExpPtr = (Bit16s) (1 - normShift);
+}
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper single-precision floating-
+| point value corresponding to the abstract input.  Ordinarily, the abstract
+| value is simply rounded and packed into the single-precision format, with
+| the inexact exception raised if the abstract input cannot be represented
+| exactly.  However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned.  If the abstract value is too small, the input value is rounded to
+| a subnormal number, and the underflow and inexact exceptions are raised if
+| the abstract input cannot be represented exactly as a subnormal single-
+| precision floating-point number.
+|     The input significand `zSig' has its binary point between bits 30
+| and 29, which is 7 bits to the left of the usual location.  This shifted
+| significand must be normalized or smaller.  If `zSig' is not normalized,
+| `zExp' must be 0; in that case, the result returned is a subnormal number,
+| and it must not require rounding.  In the usual case that `zSig' is
+| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
+| The handling of underflow and overflow follows the IEC/IEEE Standard for
+| Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float32 roundAndPackFloat32(int zSign, Bit16s zExp, Bit32u zSig, struct float_status_t *status)
+{
+    const Bit32s guardMask = 0x7F;      /* the 7 bits below the result's LSB */
+    const int mode = get_float_rounding_mode(status);
+    const int nearestEven = (mode == float_round_nearest_even);
+
+    /* Value added to zSig before the guard bits are dropped. */
+    Bit32s bump;
+    if (nearestEven) {
+        bump = 0x40;                    /* half of the lowest retained bit */
+    } else if (mode == float_round_to_zero) {
+        bump = 0;
+    } else if (mode == (zSign ? float_round_up : float_round_down)) {
+        bump = 0;                       /* directed mode pointing toward zero */
+    } else {
+        bump = guardMask;               /* any guard bit rounds away from zero */
+    }
+
+    Bit32s guardBits = zSig & guardMask;
+
+    /* The unsigned compare catches both large exponents and negative ones. */
+    if (0xFD <= (Bit16u) zExp) {
+        const int overflows = (0xFD < zExp)
+            || ((zExp == 0xFD) && ((Bit32s) (zSig + bump) < 0));
+        if (overflows) {
+            float_raise(status, float_flag_overflow);
+            if (guardBits || float_exception_masked(status, float_flag_overflow)) {
+                float_raise(status, float_flag_inexact);
+                if (bump != 0) {
+                    set_float_rounding_up(status);
+                }
+            }
+            /* Infinity encoding, or one less than it (the largest finite
+             * encoding) when the rounding increment is zero. */
+            return packFloat32(zSign, 0xFF, 0) - (bump == 0);
+        }
+
+        if (zExp < 0) {
+            const int isTiny = (zExp < -1) || (zSig + bump < 0x80000000);
+            if (isTiny && !float_exception_masked(status, float_flag_underflow)) {
+                float_raise(status, float_flag_underflow);
+                zExp += 192; // bias unmasked underflow
+            }
+            /* Still negative: denormalize and recompute the guard bits. */
+            if (zExp < 0) {
+                zSig = shift32RightJamming(zSig, -zExp);
+                zExp = 0;
+                guardBits = zSig & guardMask;
+                if (isTiny) {
+                    // masked underflow
+                    if (get_flush_underflow_to_zero(status)) {
+                        float_raise(status, float_flag_underflow | float_flag_inexact);
+                        return packFloat32(zSign, 0, 0);
+                    }
+                    if (guardBits) {
+                        float_raise(status, float_flag_underflow);
+                    }
+                }
+            }
+        }
+    }
+
+    Bit32u zSigRound = ((zSig + bump) & ~guardMask) >> 7;
+    if (nearestEven && (guardBits == 0x40)) {
+        /* Exact tie: clear the low bit so the significand is even. */
+        zSigRound &= ~(Bit32u) 1;
+    }
+    if (zSigRound == 0) {
+        zExp = 0;
+    }
+    if (guardBits) {
+        float_raise(status, float_flag_inexact);
+        if ((zSigRound << 7) > zSig) {
+            set_float_rounding_up(status);
+        }
+    }
+    return packFloat32(zSign, zExp, zSigRound);
+}
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper single-precision floating-
+| point value corresponding to the abstract input.  This routine is just like
+| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
+| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
+| floating-point exponent.
+*----------------------------------------------------------------------------*/
+
+float32 normalizeRoundAndPackFloat32(int zSign, Bit16s zExp, Bit32u zSig, struct float_status_t *status)
+{
+    /* Shift that moves the leading one of zSig up to bit 30. */
+    const int normShift = countLeadingZeros32(zSig) - 1;
+    const Bit16s normExp = (Bit16s) (zExp - normShift);
+    const Bit32u normSig = zSig << normShift;
+
+    return roundAndPackFloat32(zSign, normExp, normSig, status);
+}
+
+/*----------------------------------------------------------------------------
+| Normalizes the subnormal double-precision floating-point value represented
+| by the denormalized significand `aSig'.  The normalized exponent and
+| significand are stored at the locations pointed to by `zExpPtr' and
+| `zSigPtr', respectively.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloat64Subnormal(Bit64u aSig, Bit16s *zExpPtr, Bit64u *zSigPtr)
+{
+    /* Left shift that leaves exactly 11 zero bits above the leading one. */
+    const int normShift = countLeadingZeros64(aSig) - 11;
+
+    *zSigPtr = aSig << normShift;
+    /* Each position shifted lowers the exponent by one, starting from 1. */
+    *zExpPtr = (Bit16s) (1 - normShift);
+}
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper double-precision floating-
+| point value corresponding to the abstract input.  Ordinarily, the abstract
+| value is simply rounded and packed into the double-precision format, with
+| the inexact exception raised if the abstract input cannot be represented
+| exactly.  However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned.  If the abstract value is too small, the input value is rounded
+| to a subnormal number, and the underflow and inexact exceptions are raised
+| if the abstract input cannot be represented exactly as a subnormal double-
+| precision floating-point number.
+|     The input significand `zSig' has its binary point between bits 62
+| and 61, which is 10 bits to the left of the usual location.  This shifted
+| significand must be normalized or smaller.  If `zSig' is not normalized,
+| `zExp' must be 0; in that case, the result returned is a subnormal number,
+| and it must not require rounding.  In the usual case that `zSig' is
+| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
+| The handling of underflow and overflow follows the IEC/IEEE Standard for
+| Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float64 roundAndPackFloat64(int zSign, Bit16s zExp, Bit64u zSig, struct float_status_t *status)
+{
+    const Bit16s guardMask = 0x3FF;     /* the 10 bits below the result's LSB */
+    const int mode = get_float_rounding_mode(status);
+    const int nearestEven = (mode == float_round_nearest_even);
+
+    /* Value added to zSig before the guard bits are dropped. */
+    Bit16s bump;
+    if (nearestEven) {
+        bump = 0x200;                   /* half of the lowest retained bit */
+    } else if (mode == float_round_to_zero) {
+        bump = 0;
+    } else if (mode == (zSign ? float_round_up : float_round_down)) {
+        bump = 0;                       /* directed mode pointing toward zero */
+    } else {
+        bump = guardMask;               /* any guard bit rounds away from zero */
+    }
+
+    Bit16s guardBits = (Bit16s)(zSig & guardMask);
+
+    /* The unsigned compare catches both large exponents and negative ones. */
+    if (0x7FD <= (Bit16u) zExp) {
+        const int overflows = (0x7FD < zExp)
+            || ((zExp == 0x7FD) && ((Bit64s) (zSig + bump) < 0));
+        if (overflows) {
+            float_raise(status, float_flag_overflow);
+            if (guardBits || float_exception_masked(status, float_flag_overflow)) {
+                float_raise(status, float_flag_inexact);
+                if (bump != 0) {
+                    set_float_rounding_up(status);
+                }
+            }
+            /* Infinity encoding, or one less than it (the largest finite
+             * encoding) when the rounding increment is zero. */
+            return packFloat64(zSign, 0x7FF, 0) - (bump == 0);
+        }
+
+        if (zExp < 0) {
+            const int isTiny = (zExp < -1) || (zSig + bump < BX_CONST64(0x8000000000000000));
+            if (isTiny && !float_exception_masked(status, float_flag_underflow)) {
+                float_raise(status, float_flag_underflow);
+                zExp += 1536; // bias unmasked underflow
+            }
+            /* Still negative: denormalize and recompute the guard bits. */
+            if (zExp < 0) {
+                zSig = shift64RightJamming(zSig, -zExp);
+                zExp = 0;
+                guardBits = (Bit16s)(zSig & guardMask);
+                if (isTiny) {
+                    // masked underflow
+                    if (get_flush_underflow_to_zero(status)) {
+                        float_raise(status, float_flag_underflow | float_flag_inexact);
+                        return packFloat64(zSign, 0, 0);
+                    }
+                    if (guardBits) {
+                        float_raise(status, float_flag_underflow);
+                    }
+                }
+            }
+        }
+    }
+
+    Bit64u zSigRound = (zSig + bump) >> 10;
+    if (nearestEven && (guardBits == 0x200)) {
+        /* Exact tie: clear the low bit so the significand is even. */
+        zSigRound &= ~(Bit64u) 1;
+    }
+    if (zSigRound == 0) {
+        zExp = 0;
+    }
+    if (guardBits) {
+        float_raise(status, float_flag_inexact);
+        if ((zSigRound << 10) > zSig) {
+            set_float_rounding_up(status);
+        }
+    }
+    return packFloat64(zSign, zExp, zSigRound);
+}
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper double-precision floating-
+| point value corresponding to the abstract input.  This routine is just like
+| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
+| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
+| floating-point exponent.
+*----------------------------------------------------------------------------*/
+
+float64 normalizeRoundAndPackFloat64(int zSign, Bit16s zExp, Bit64u zSig, struct float_status_t *status)
+{
+    /* Shift that moves the leading one of zSig up to bit 62. */
+    const int normShift = countLeadingZeros64(zSig) - 1;
+    const Bit16s normExp = (Bit16s) (zExp - normShift);
+    const Bit64u normSig = zSig << normShift;
+
+    return roundAndPackFloat64(zSign, normExp, normSig, status);
+}
+
+#ifdef FLOATX80
+
+/*----------------------------------------------------------------------------
+| Normalizes the subnormal extended double-precision floating-point value
+| represented by the denormalized significand `aSig'.  The normalized exponent
+| and significand are stored at the locations pointed to by `zExpPtr' and
+| `zSigPtr', respectively.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloatx80Subnormal(Bit64u aSig, Bit32s *zExpPtr, Bit64u *zSigPtr)
+{
+    /* Left shift that moves the leading one of aSig into bit 63. */
+    const int normShift = countLeadingZeros64(aSig);
+
+    *zSigPtr = aSig << normShift;
+    /* Each position shifted lowers the exponent by one, starting from 1. */
+    *zExpPtr = 1 - normShift;
+}
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and extended significand formed by the concatenation of `zSig0' and `zSig1',
+| and returns the proper extended double-precision floating-point value
+| corresponding to the abstract input.  Ordinarily, the abstract value is
+| rounded and packed into the extended double-precision format, with the
+| inexact exception raised if the abstract input cannot be represented
+| exactly.  However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned.  If the abstract value is too small, the input value is rounded to
+| a subnormal number, and the underflow and inexact exceptions are raised if
+| the abstract input cannot be represented exactly as a subnormal extended
+| double-precision floating-point number.
+|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
+| number of bits as single or double precision, respectively.  Otherwise, the
+| result is rounded to the full precision of the extended double-precision
+| format.
+|     The input significand must be normalized or smaller.  If the input
+| significand is not normalized, `zExp' must be 0; in that case, the result
+| returned is a subnormal number, and it must not require rounding.  The
+| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
+| Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+floatx80 SoftFloatRoundAndPackFloatx80(int roundingPrecision,
+        int zSign, Bit32s zExp, Bit64u zSig0, Bit64u zSig1, struct float_status_t *status)
+{
+    /* Rounds the significand zSig0:zSig1 with exponent zExp and packs it with
+     * packFloatx80().  Two paths share this body: a reduced-precision path
+     * (roundingPrecision 64 or 32) that rounds inside zSig0, and the
+     * `precision80' path that rounds zSig0 using zSig1 as the discarded part.
+     * Both jump to the common `overflow' label on exponent overflow. */
+    Bit64u roundIncrement, roundMask, roundBits;
+    int increment;
+    Bit64u zSigExact; /* support rounding-up response */
+
+    Bit8u roundingMode = get_float_rounding_mode(status);
+    int roundNearestEven = (roundingMode == float_round_nearest_even);
+    /* roundMask selects the low bits of zSig0 that are rounded away (11 bits
+     * for precision 64, 40 bits for precision 32); roundIncrement starts as
+     * half of the lowest retained bit. */
+    if (roundingPrecision == 64) {
+        roundIncrement = BX_CONST64(0x0000000000000400);
+        roundMask = BX_CONST64(0x00000000000007FF);
+    }
+    else if (roundingPrecision == 32) {
+        roundIncrement = BX_CONST64(0x0000008000000000);
+        roundMask = BX_CONST64(0x000000FFFFFFFFFF);
+    }
+    else goto precision80;
+
+    /* Fold any non-zero zSig1 into bit 0 of zSig0 as a sticky bit. */
+    zSig0 |= (zSig1 != 0);
+    if (! roundNearestEven) {
+        if (roundingMode == float_round_to_zero) roundIncrement = 0;
+        else {
+            /* Directed rounding: any discarded bit rounds away from zero,
+             * unless the mode points toward zero for this sign. */
+            roundIncrement = roundMask;
+            if (zSign) {
+                if (roundingMode == float_round_up) roundIncrement = 0;
+            }
+            else {
+                if (roundingMode == float_round_down) roundIncrement = 0;
+            }
+        }
+    }
+    roundBits = zSig0 & roundMask;
+    /* True when zExp is outside 1..0x7FFD: the unsigned compare also catches
+     * zExp <= 0 through wrap-around of (zExp - 1). */
+    if (0x7FFD <= (Bit32u) (zExp - 1)) {
+        if ((0x7FFE < zExp)
+             || ((zExp == 0x7FFE) && (zSig0 + roundIncrement < zSig0)))
+        {
+            /* roundMask is still the precision mask here, so the shared
+             * handler's ~roundMask clears the rounded-away bits. */
+            goto overflow;
+        }
+        if (zExp <= 0) {
+            /* Tiny unless zExp is 0 and adding the increment wraps around
+             * 64 bits. */
+            int isTiny = (zExp < 0) || (zSig0 <= zSig0 + roundIncrement);
+            zSig0 = shift64RightJamming(zSig0, 1 - zExp);
+            zSigExact = zSig0;
+            zExp = 0;
+            /* Recompute the discarded bits on the denormalized significand. */
+            roundBits = zSig0 & roundMask;
+            if (isTiny) {
+                if (roundBits || (zSig0 && !float_exception_masked(status, float_flag_underflow)))
+                    float_raise(status, float_flag_underflow);
+            }
+            zSig0 += roundIncrement;
+            /* Rounding set bit 63, so the packed exponent becomes 1. */
+            if ((Bit64s) zSig0 < 0) zExp = 1;
+            /* From here roundIncrement is the lowest retained bit; on an exact
+             * tie it is added to the mask so that bit is cleared as well. */
+            roundIncrement = roundMask + 1;
+            if (roundNearestEven && (roundBits<<1 == roundIncrement))
+                roundMask |= roundIncrement;
+            zSig0 &= ~roundMask;
+            if (roundBits) {
+                float_raise(status, float_flag_inexact);
+                if (zSig0 > zSigExact) set_float_rounding_up(status);
+            }
+            return packFloatx80(zSign, zExp, zSig0);
+        }
+    }
+    if (roundBits) float_raise(status, float_flag_inexact);
+    zSigExact = zSig0;
+    zSig0 += roundIncrement;
+    /* Unsigned wrap-around: the addition carried out of bit 63. */
+    if (zSig0 < roundIncrement) {
+        // Basically scale by shifting right and keep overflow
+        ++zExp;
+        zSig0 = BX_CONST64(0x8000000000000000);
+        zSigExact >>= 1; // must scale also, or else later tests will fail
+    }
+    /* Same tie handling as in the denormal branch above. */
+    roundIncrement = roundMask + 1;
+    if (roundNearestEven && (roundBits<<1 == roundIncrement))
+        roundMask |= roundIncrement;
+    zSig0 &= ~roundMask;
+    if (zSig0 > zSigExact) set_float_rounding_up(status);
+    if (zSig0 == 0) zExp = 0;
+    return packFloatx80(zSign, zExp, zSig0);
+ precision80:
+    /* Full 64-bit significand: zSig1 is the discarded part and its top bit
+     * is the one-half position. */
+    increment = ((Bit64s) zSig1 < 0);
+    if (! roundNearestEven) {
+        if (roundingMode == float_round_to_zero) increment = 0;
+        else {
+            if (zSign) {
+                increment = (roundingMode == float_round_down) && zSig1;
+            }
+            else {
+                increment = (roundingMode == float_round_up) && zSig1;
+            }
+        }
+    }
+    /* Same out-of-range exponent test as in the reduced-precision path. */
+    if (0x7FFD <= (Bit32u) (zExp - 1)) {
+        if ((0x7FFE < zExp)
+             || ((zExp == 0x7FFE)
+                  && (zSig0 == BX_CONST64(0xFFFFFFFFFFFFFFFF))
+                  && increment))
+        {
+            /* Zero mask makes ~roundMask all ones in the handler below. */
+            roundMask = 0;
+ overflow:
+            /* Shared overflow handler, also entered by `goto overflow' from
+             * the reduced-precision path. */
+            float_raise(status, float_flag_overflow | float_flag_inexact);
+            if ((roundingMode == float_round_to_zero)
+                 || (zSign && (roundingMode == float_round_up))
+                 || (! zSign && (roundingMode == float_round_down)))
+            {
+                /* Rounding toward zero for this sign: exponent 0x7FFE with
+                 * every retained significand bit set. */
+                return packFloatx80(zSign, 0x7FFE, ~roundMask);
+            }
+            set_float_rounding_up(status);
+            return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
+        }
+        if (zExp <= 0) {
+            /* Tiny unless zExp is 0, zSig0 is all ones and an increment is
+             * pending. */
+            int isTiny = (zExp < 0) || (! increment)
+                || (zSig0 < BX_CONST64(0xFFFFFFFFFFFFFFFF));
+            shift64ExtraRightJamming(zSig0, zSig1, 1 - zExp, &zSig0, &zSig1);
+            zExp = 0;
+            if (isTiny) {
+                if (zSig1 || (zSig0 && !float_exception_masked(status, float_flag_underflow)))
+                    float_raise(status, float_flag_underflow);
+            }
+            if (zSig1) float_raise(status, float_flag_inexact);
+            /* Re-derive the increment from the shifted zSig1. */
+            if (roundNearestEven) increment = ((Bit64s) zSig1 < 0);
+            else {
+                if (zSign) {
+                    increment = (roundingMode == float_round_down) && zSig1;
+                } else {
+                    increment = (roundingMode == float_round_up) && zSig1;
+                }
+            }
+            if (increment) {
+                zSigExact = zSig0++;
+                /* Exact tie in nearest-even mode: clear the low bit. */
+                zSig0 &= ~(((Bit64u) (zSig1<<1) == 0) & roundNearestEven);
+                if (zSig0 > zSigExact) set_float_rounding_up(status);
+                /* Rounding set bit 63, so the packed exponent becomes 1. */
+                if ((Bit64s) zSig0 < 0) zExp = 1;
+            }
+            return packFloatx80(zSign, zExp, zSig0);
+        }
+    }
+    if (zSig1) float_raise(status, float_flag_inexact);
+    if (increment) {
+        zSigExact = zSig0++;
+        if (zSig0 == 0) {
+            /* The increment wrapped zSig0 to zero: bump the exponent and set
+             * the significand to just its top bit. */
+            zExp++;
+            zSig0 = BX_CONST64(0x8000000000000000);
+            zSigExact >>= 1;  // must scale also, or else later tests will fail
+        }
+        else {
+            /* Exact tie in nearest-even mode: clear the low bit. */
+            zSig0 &= ~(((Bit64u) (zSig1<<1) == 0) & roundNearestEven);
+        }
+        if (zSig0 > zSigExact) set_float_rounding_up(status);
+    }
+    else {
+        if (zSig0 == 0) zExp = 0;
+    }
+    return packFloatx80(zSign, zExp, zSig0);
+}
+
+/*----------------------------------------------------------------------------
+| Rounds and packs an extended double-precision result by calling
+| `SoftFloatRoundAndPackFloatx80', then applies the x87 treatment of unmasked
+| underflow/overflow: if the first rounding attempt raised an underflow
+| (overflow) exception that is not masked in `status', the attempt is
+| discarded and the value is rounded again with the exponent increased
+| (decreased) by 0x6000.
+|     The status is restored to its pre-rounding contents (plus the
+| underflow/overflow flag) before the second attempt, so flags produced only
+| by the discarded first attempt (for example inexact caused by
+| denormalization) do not leak into the final status.
+*----------------------------------------------------------------------------*/
+
+floatx80 roundAndPackFloatx80(int roundingPrecision,
+        int zSign, Bit32s zExp, Bit64u zSig0, Bit64u zSig1, struct float_status_t *status)
+{
+    // Value copy (not an alias) of the status before any rounding took place
+    struct float_status_t round_status = *status;
+    floatx80 result = SoftFloatRoundAndPackFloatx80(roundingPrecision, zSign, zExp, zSig0, zSig1, status);
+
+    // bias unmasked underflow
+    if (status->float_exception_flags & ~status->float_exception_masks & float_flag_underflow) {
+       float_raise(&round_status, float_flag_underflow);
+       *status = round_status;  // drop the flags of the discarded attempt
+       return SoftFloatRoundAndPackFloatx80(roundingPrecision, zSign, zExp + 0x6000, zSig0, zSig1, status);
+    }
+
+    // bias unmasked overflow
+    if (status->float_exception_flags & ~status->float_exception_masks & float_flag_overflow) {
+       float_raise(&round_status, float_flag_overflow);
+       *status = round_status;  // drop the flags of the discarded attempt
+       return SoftFloatRoundAndPackFloatx80(roundingPrecision, zSign, zExp - 0x6000, zSig0, zSig1, status);
+    }
+
+    return result;
+}
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent
+| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
+| and returns the proper extended double-precision floating-point value
+| corresponding to the abstract input.  This routine is just like
+| `roundAndPackFloatx80' except that the input significand does not have to be
+| normalized.
+*----------------------------------------------------------------------------*/
+
+floatx80 normalizeRoundAndPackFloatx80(int roundingPrecision,
+        int zSign, Bit32s zExp, Bit64u zSig0, Bit64u zSig1, struct float_status_t *status)
+{
+    Bit64u sigHi = zSig0;
+    Bit64u sigLo = zSig1;
+    Bit32s adjExp = zExp;
+
+    // An empty high word means the significand starts in the low word:
+    // promote the low word and account for the 64-bit shift in the exponent.
+    if (! sigHi) {
+        sigHi = sigLo;
+        sigLo = 0;
+        adjExp -= 64;
+    }
+
+    // Shift out the leading zeros of the high word; the exponent drops by
+    // the same amount.
+    const int leadingZeros = countLeadingZeros64(sigHi);
+    shortShift128Left(sigHi, sigLo, leadingZeros, &sigHi, &sigLo);
+    adjExp -= leadingZeros;
+
+    return roundAndPackFloatx80(roundingPrecision, zSign, adjExp, sigHi, sigLo, status);
+}
+
+#endif
+
+#ifdef FLOAT128
+
+/*----------------------------------------------------------------------------
+| Normalizes the subnormal quadruple-precision floating-point value
+| represented by the denormalized significand formed by the concatenation of
+| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
+| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
+| significand are stored at the location pointed to by `zSig0Ptr', and the
+| least significant 64 bits of the normalized significand are stored at the
+| location pointed to by `zSig1Ptr'.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloat128Subnormal(
+     Bit64u aSig0, Bit64u aSig1, Bit32s *zExpPtr, Bit64u *zSig0Ptr, Bit64u *zSig1Ptr)
+{
+    // The "- 15" in both paths leaves 15 leading zero bits in the high
+    // output word, i.e. a 49-bit high part of the normalized significand.
+    if (aSig0 != 0) {
+        // Significant bits start in the high word: plain 128-bit left shift
+        int shift = countLeadingZeros64(aSig0) - 15;
+        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
+        *zExpPtr = 1 - shift;
+        return;
+    }
+
+    // High word is empty: everything comes from the low word
+    int shift = countLeadingZeros64(aSig1) - 15;
+    if (shift >= 0) {
+        // The whole low word fits in the high output word
+        *zSig0Ptr = aSig1 << shift;
+        *zSig1Ptr = 0;
+    }
+    else {
+        // The low word straddles both output words
+        *zSig0Ptr = aSig1 >> (-shift);
+        *zSig1Ptr = aSig1 << (shift & 63);
+    }
+    *zExpPtr = -shift - 63;
+}
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and extended significand formed by the concatenation of `zSig0', `zSig1',
+| and `zSig2', and returns the proper quadruple-precision floating-point value
+| corresponding to the abstract input.  Ordinarily, the abstract value is
+| simply rounded and packed into the quadruple-precision format, with the
+| inexact exception raised if the abstract input cannot be represented
+| exactly.  However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned.  If the abstract value is too small, the input value is rounded to
+| a subnormal number, and the underflow and inexact exceptions are raised if
+| the abstract input cannot be represented exactly as a subnormal quadruple-
+| precision floating-point number.
+|     The input significand must be normalized or smaller.  If the input
+| significand is not normalized, `zExp' must be 0; in that case, the result
+| returned is a subnormal number, and it must not require rounding.  In the
+| usual case that the input significand is normalized, `zExp' must be 1 less
+| than the ``true'' floating-point exponent.  The handling of underflow and
+| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float128 roundAndPackFloat128(
+     int zSign, Bit32s zExp, Bit64u zSig0, Bit64u zSig1, Bit64u zSig2, struct float_status_t *status)
+{
+    // Significand with all 113 bits set (49 bits high word + 64 bits low word)
+    const Bit64u allOnesHi = BX_CONST64(0x0001FFFFFFFFFFFF);
+    const Bit64u allOnesLo = BX_CONST64(0xFFFFFFFFFFFFFFFF);
+
+    // Round up when the top bit of the extra word is set.  Note that the
+    // rounding mode stored in `status' is not consulted by this routine.
+    int roundUp = ((Bit64s) zSig2 < 0);
+
+    if (zExp < 0) {
+        // Below the normal range: shift right into a subnormal, collecting
+        // the shifted-out bits in zSig2.
+        int isTiny = (zExp < -1)
+            || ! roundUp
+            || lt128(zSig0, zSig1, allOnesHi, allOnesLo);
+        shift128ExtraRightJamming(
+            zSig0, zSig1, zSig2, -zExp, &zSig0, &zSig1, &zSig2);
+        zExp = 0;
+        if (isTiny && zSig2) float_raise(status, float_flag_underflow);
+        // The rounding decision must be redone on the shifted value
+        roundUp = ((Bit64s) zSig2 < 0);
+    }
+    else if (0x7FFD <= zExp) {
+        // Exponent too large, or rounding up would carry out of the largest
+        // significand at the largest exponent.
+        int overflows = (0x7FFD < zExp)
+            || (eq128(allOnesHi, allOnesLo, zSig0, zSig1) && roundUp);
+        if (overflows) {
+            float_raise(status, float_flag_overflow | float_flag_inexact);
+            return packFloat128Four(zSign, 0x7FFF, 0, 0);
+        }
+    }
+
+    if (zSig2) float_raise(status, float_flag_inexact);
+
+    if (! roundUp) {
+        // A zero significand is packed with a zero exponent
+        if ((zSig0 | zSig1) == 0) zExp = 0;
+    }
+    else {
+        add128(zSig0, zSig1, 0, 1, &zSig0, &zSig1);
+        // Exact tie (only the top extra bit was set): clear the low bit so
+        // the result is even.
+        if (zSig2 + zSig2 == 0) zSig1 &= ~(Bit64u) 1;
+    }
+    return packFloat128Four(zSign, zExp, zSig0, zSig1);
+}
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand formed by the concatenation of `zSig0' and `zSig1', and
+| returns the proper quadruple-precision floating-point value corresponding
+| to the abstract input.  This routine is just like `roundAndPackFloat128'
+| except that the input significand has fewer bits and does not have to be
+| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
+| point exponent.
+*----------------------------------------------------------------------------*/
+
+float128 normalizeRoundAndPackFloat128(
+     int zSign, Bit32s zExp, Bit64u zSig0, Bit64u zSig1, struct float_status_t *status)
+{
+    Bit64u sigHi = zSig0;
+    Bit64u sigLo = zSig1;
+    Bit64u sigExtra = 0;
+
+    // An empty high word means the significand starts in the low word:
+    // promote the low word and account for the 64-bit shift in the exponent.
+    if (! sigHi) {
+        sigHi = sigLo;
+        sigLo = 0;
+        zExp -= 64;
+    }
+
+    // Position the significand so that 15 leading zero bits remain in the
+    // high word.
+    const int shift = countLeadingZeros64(sigHi) - 15;
+    if (shift < 0) {
+        // Too many significant bits: shift right, keeping the shifted-out
+        // bits in the extra word for rounding.
+        shift128ExtraRightJamming(
+            sigHi, sigLo, 0, -shift, &sigHi, &sigLo, &sigExtra);
+    }
+    else {
+        shortShift128Left(sigHi, sigLo, shift, &sigHi, &sigLo);
+    }
+
+    return roundAndPackFloat128(zSign, zExp - shift, sigHi, sigLo, sigExtra, status);
+}
+
+#endif
--- a/src/cpu/softfloat/softfloat-round-pack.h
+++ b/src/cpu/softfloat/softfloat-round-pack.h
@@ -0,0 +1,309 @@
+/*============================================================================
+This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+
+Written by John R. Hauser.  This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704.  Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.  The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Adapted for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#ifndef _SOFTFLOAT_ROUND_PACK_H_
+#define _SOFTFLOAT_ROUND_PACK_H_
+
+#include "softfloat.h"
+
+/*----------------------------------------------------------------------------
+| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
+| and 7, and returns the properly rounded 32-bit integer corresponding to the
+| input.  If `zSign' is 1, the input is negated before being converted to an
+| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
+| is simply rounded to an integer, with the inexact exception raised if the
+| input cannot be represented exactly as an integer.  However, if the fixed-
+| point input is too large, the invalid exception is raised and the integer
+| indefinite value is returned.
+*----------------------------------------------------------------------------*/
+
+Bit32s roundAndPackInt32(int zSign, Bit64u absZ, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
+| `absZ1', with binary point between bits 63 and 64 (between the input words),
+| and returns the properly rounded 64-bit integer corresponding to the input.
+| If `zSign' is 1, the input is negated before being converted to an integer.
+| Ordinarily, the fixed-point input is simply rounded to an integer, with
+| the inexact exception raised if the input cannot be represented exactly as
+| an integer.  However, if the fixed-point input is too large, the invalid
+| exception is raised and the integer indefinite value is returned.
+*----------------------------------------------------------------------------*/
+
+Bit64s roundAndPackInt64(int zSign, Bit64u absZ0, Bit64u absZ1, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
+| `absZ1', with binary point between bits 63 and 64 (between the input words),
+| and returns the properly rounded 64-bit unsigned integer corresponding to the
+| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
+| with the inexact exception raised if the input cannot be represented exactly
+| as an integer. However, if the fixed-point input is too large, the invalid
+| exception is raised and the largest unsigned integer is returned.
+*----------------------------------------------------------------------------*/
+
+Bit64u roundAndPackUint64(int zSign, Bit64u absZ0, Bit64u absZ1, struct float_status_t *status);
+
+#ifdef FLOAT16
+
+/*----------------------------------------------------------------------------
+| Normalizes the subnormal half-precision floating-point value represented
+| by the denormalized significand `aSig'.  The normalized exponent and
+| significand are stored at the locations pointed to by `zExpPtr' and
+| `zSigPtr', respectively.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloat16Subnormal(Bit16u aSig, Bit16s *zExpPtr, Bit16u *zSigPtr);
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper half-precision floating-
+| point value corresponding to the abstract input.  Ordinarily, the abstract
+| value is simply rounded and packed into the half-precision format, with
+| the inexact exception raised if the abstract input cannot be represented
+| exactly.  However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned.  If the abstract value is too small, the input value is rounded to
+| a subnormal number, and the underflow and inexact exceptions are raised if
+| the abstract input cannot be represented exactly as a subnormal half-
+| precision floating-point number.
+|     The input significand `zSig' has its binary point between bits 14
+| and 13, which is 4 bits to the left of the usual location.  This shifted
+| significand must be normalized or smaller.  If `zSig' is not normalized,
+| `zExp' must be 0; in that case, the result returned is a subnormal number,
+| and it must not require rounding.  In the usual case that `zSig' is
+| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
+| The handling of underflow and overflow follows the IEC/IEEE Standard for
+| Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float16 roundAndPackFloat16(int zSign, Bit16s zExp, Bit16u zSig, struct float_status_t *status);
+
+#endif
+
+/*----------------------------------------------------------------------------
+| Normalizes the subnormal single-precision floating-point value represented
+| by the denormalized significand `aSig'.  The normalized exponent and
+| significand are stored at the locations pointed to by `zExpPtr' and
+| `zSigPtr', respectively.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloat32Subnormal(Bit32u aSig, Bit16s *zExpPtr, Bit32u *zSigPtr);
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper single-precision floating-
+| point value corresponding to the abstract input.  Ordinarily, the abstract
+| value is simply rounded and packed into the single-precision format, with
+| the inexact exception raised if the abstract input cannot be represented
+| exactly.  However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned.  If the abstract value is too small, the input value is rounded to
+| a subnormal number, and the underflow and inexact exceptions are raised if
+| the abstract input cannot be represented exactly as a subnormal single-
+| precision floating-point number.
+|     The input significand `zSig' has its binary point between bits 30
+| and 29, which is 7 bits to the left of the usual location.  This shifted
+| significand must be normalized or smaller.  If `zSig' is not normalized,
+| `zExp' must be 0; in that case, the result returned is a subnormal number,
+| and it must not require rounding.  In the usual case that `zSig' is
+| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
+| The handling of underflow and overflow follows the IEC/IEEE Standard for
+| Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float32 roundAndPackFloat32(int zSign, Bit16s zExp, Bit32u zSig, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper single-precision floating-
+| point value corresponding to the abstract input.  This routine is just like
+| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
+| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
+| floating-point exponent.
+*----------------------------------------------------------------------------*/
+
+float32 normalizeRoundAndPackFloat32(int zSign, Bit16s zExp, Bit32u zSig, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Normalizes the subnormal double-precision floating-point value represented
+| by the denormalized significand `aSig'.  The normalized exponent and
+| significand are stored at the locations pointed to by `zExpPtr' and
+| `zSigPtr', respectively.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloat64Subnormal(Bit64u aSig, Bit16s *zExpPtr, Bit64u *zSigPtr);
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper double-precision floating-
+| point value corresponding to the abstract input.  Ordinarily, the abstract
+| value is simply rounded and packed into the double-precision format, with
+| the inexact exception raised if the abstract input cannot be represented
+| exactly.  However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned.  If the abstract value is too small, the input value is rounded
+| to a subnormal number, and the underflow and inexact exceptions are raised
+| if the abstract input cannot be represented exactly as a subnormal double-
+| precision floating-point number.
+|     The input significand `zSig' has its binary point between bits 62
+| and 61, which is 10 bits to the left of the usual location.  This shifted
+| significand must be normalized or smaller.  If `zSig' is not normalized,
+| `zExp' must be 0; in that case, the result returned is a subnormal number,
+| and it must not require rounding.  In the usual case that `zSig' is
+| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
+| The handling of underflow and overflow follows the IEC/IEEE Standard for
+| Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float64 roundAndPackFloat64(int zSign, Bit16s zExp, Bit64u zSig, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper double-precision floating-
+| point value corresponding to the abstract input.  This routine is just like
+| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
+| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
+| floating-point exponent.
+*----------------------------------------------------------------------------*/
+
+float64 normalizeRoundAndPackFloat64(int zSign, Bit16s zExp, Bit64u zSig, struct float_status_t *status);
+
+#ifdef FLOATX80
+
+/*----------------------------------------------------------------------------
+| Normalizes the subnormal extended double-precision floating-point value
+| represented by the denormalized significand `aSig'.  The normalized exponent
+| and significand are stored at the locations pointed to by `zExpPtr' and
+| `zSigPtr', respectively.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloatx80Subnormal(Bit64u aSig, Bit32s *zExpPtr, Bit64u *zSigPtr);
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and extended significand formed by the concatenation of `zSig0' and `zSig1',
+| and returns the proper extended double-precision floating-point value
+| corresponding to the abstract input.  Ordinarily, the abstract value is
+| rounded and packed into the extended double-precision format, with the
+| inexact exception raised if the abstract input cannot be represented
+| exactly.  However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned.  If the abstract value is too small, the input value is rounded to
+| a subnormal number, and the underflow and inexact exceptions are raised if
+| the abstract input cannot be represented exactly as a subnormal extended
+| double-precision floating-point number.
+|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
+| number of bits as single or double precision, respectively.  Otherwise, the
+| result is rounded to the full precision of the extended double-precision
+| format.
+|     The input significand must be normalized or smaller.  If the input
+| significand is not normalized, `zExp' must be 0; in that case, the result
+| returned is a subnormal number, and it must not require rounding.  The
+| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
+| Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+floatx80 roundAndPackFloatx80(int roundingPrecision,
+        int zSign, Bit32s zExp, Bit64u zSig0, Bit64u zSig1, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent
+| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
+| and returns the proper extended double-precision floating-point value
+| corresponding to the abstract input.  This routine is just like
+| `roundAndPackFloatx80' except that the input significand does not have to be
+| normalized.
+*----------------------------------------------------------------------------*/
+
+floatx80 normalizeRoundAndPackFloatx80(int roundingPrecision,
+        int zSign, Bit32s zExp, Bit64u zSig0, Bit64u zSig1, struct float_status_t *status);
+
+#endif // FLOATX80
+
+#ifdef FLOAT128
+
+/*----------------------------------------------------------------------------
+| Normalizes the subnormal quadruple-precision floating-point value
+| represented by the denormalized significand formed by the concatenation of
+| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
+| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
+| significand are stored at the location pointed to by `zSig0Ptr', and the
+| least significant 64 bits of the normalized significand are stored at the
+| location pointed to by `zSig1Ptr'.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloat128Subnormal(
+     Bit64u aSig0, Bit64u aSig1, Bit32s *zExpPtr, Bit64u *zSig0Ptr, Bit64u *zSig1Ptr);
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and extended significand formed by the concatenation of `zSig0', `zSig1',
+| and `zSig2', and returns the proper quadruple-precision floating-point value
+| corresponding to the abstract input.  Ordinarily, the abstract value is
+| simply rounded and packed into the quadruple-precision format, with the
+| inexact exception raised if the abstract input cannot be represented
+| exactly.  However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned.  If the abstract value is too small, the input value is rounded to
+| a subnormal number, and the underflow and inexact exceptions are raised if
+| the abstract input cannot be represented exactly as a subnormal quadruple-
+| precision floating-point number.
+|     The input significand must be normalized or smaller.  If the input
+| significand is not normalized, `zExp' must be 0; in that case, the result
+| returned is a subnormal number, and it must not require rounding.  In the
+| usual case that the input significand is normalized, `zExp' must be 1 less
+| than the ``true'' floating-point exponent.  The handling of underflow and
+| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float128 roundAndPackFloat128(
+     int zSign, Bit32s zExp, Bit64u zSig0, Bit64u zSig1, Bit64u zSig2, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand formed by the concatenation of `zSig0' and `zSig1', and
+| returns the proper quadruple-precision floating-point value corresponding
+| to the abstract input.  This routine is just like `roundAndPackFloat128'
+| except that the input significand has fewer bits and does not have to be
+| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
+| point exponent.
+*----------------------------------------------------------------------------*/
+
+float128 normalizeRoundAndPackFloat128(
+     int zSign, Bit32s zExp, Bit64u zSig0, Bit64u zSig1, struct float_status_t *status);
+
+#endif // FLOAT128
+
+#endif
--- a/src/cpu/softfloat/softfloat-specialize.cc
+++ b/src/cpu/softfloat/softfloat-specialize.cc
@@ -0,0 +1,187 @@
+/*============================================================================
+This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b.
+
+Written by John R. Hauser.  This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704.  Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.  The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+#define FLOAT128
+
+/*============================================================================
+ * Adapted for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#include "softfloat.h"
+#include "softfloat-specialize.h"
+
+/*----------------------------------------------------------------------------
+| Takes two single-precision floating-point values `a' and `b', one of which
+| is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
+| signaling NaN, the invalid exception is raised.
+*----------------------------------------------------------------------------*/
+
+float32 propagateFloat32NaN(float32 a, float32 b, struct float_status_t *status)
+{
+    /* Classify both operands before the 0x00400000 bit is forced on below. */
+    const int aNaN  = float32_is_nan(a);
+    const int aSNaN = float32_is_signaling_nan(a);
+    const int bNaN  = float32_is_nan(b);
+    const int bSNaN = float32_is_signaling_nan(b);
+    int sameClass;
+
+    a |= 0x00400000;
+    b |= 0x00400000;
+    if (aSNaN || bSNaN)
+        float_raise(status, float_flag_invalid);
+
+    if (get_float_nan_handling_mode(status) != float_larger_significand_nan)
+        return (aSNaN || aNaN) ? a : b;
+
+    if (!aSNaN && !aNaN)
+        return b;
+
+    /* True when `b' is the same kind of NaN (signaling or quiet) as `a'. */
+    sameClass = aSNaN ? bSNaN : (bNaN && !bSNaN);
+    if (!sameClass)
+        return (aSNaN && bNaN) ? b : a;
+
+    /* Same kind on both sides: the larger significand wins. */
+    if ((Bit32u) (a<<1) < (Bit32u) (b<<1)) return b;
+    if ((Bit32u) (b<<1) < (Bit32u) (a<<1)) return a;
+    return (a < b) ? a : b;
+}
+
+/*----------------------------------------------------------------------------
+| Takes two double-precision floating-point values `a' and `b', one of which
+| is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
+| signaling NaN, the invalid exception is raised.
+*----------------------------------------------------------------------------*/
+
+float64 propagateFloat64NaN(float64 a, float64 b, struct float_status_t *status)
+{
+    /* Classify both operands before the 0x0008000000000000 bit is forced on. */
+    const int aNaN  = float64_is_nan(a);
+    const int aSNaN = float64_is_signaling_nan(a);
+    const int bNaN  = float64_is_nan(b);
+    const int bSNaN = float64_is_signaling_nan(b);
+    int sameClass;
+
+    a |= BX_CONST64(0x0008000000000000);
+    b |= BX_CONST64(0x0008000000000000);
+    if (aSNaN || bSNaN) float_raise(status, float_flag_invalid);
+
+    if (get_float_nan_handling_mode(status) != float_larger_significand_nan)
+        return (aSNaN || aNaN) ? a : b;
+    if (!aSNaN && !aNaN)
+        return b;
+
+    /* True when `b' is the same kind of NaN (signaling or quiet) as `a'. */
+    sameClass = aSNaN ? bSNaN : (bNaN && !bSNaN);
+    if (!sameClass)
+        return (aSNaN && bNaN) ? b : a;
+
+    /* Same kind on both sides: the larger significand wins. */
+    if ((Bit64u) (a<<1) < (Bit64u) (b<<1)) return b;
+    if ((Bit64u) (b<<1) < (Bit64u) (a<<1)) return a;
+    return (a < b) ? a : b;
+}
+
+#ifdef FLOATX80
+
+/*----------------------------------------------------------------------------
+| Takes two extended double-precision floating-point values `a' and `b', one
+| of which is a NaN, and returns the appropriate NaN result.  If either `a' or
+| `b' is a signaling NaN, the invalid exception is raised.
+*----------------------------------------------------------------------------*/
+
+floatx80 propagateFloatx80NaN(floatx80 a, floatx80 b, struct float_status_t *status)
+{
+    /* Classify both operands before the top two fraction bits are forced on. */
+    const int aNaN  = floatx80_is_nan(a);
+    const int aSNaN = floatx80_is_signaling_nan(a);
+    const int bNaN  = floatx80_is_nan(b);
+    const int bSNaN = floatx80_is_signaling_nan(b);
+    int sameClass;
+
+    a.fraction |= BX_CONST64(0xC000000000000000);
+    b.fraction |= BX_CONST64(0xC000000000000000);
+    if (aSNaN || bSNaN) float_raise(status, float_flag_invalid);
+    if (!aSNaN && !aNaN) return b;
+
+    /* True when `b' is the same kind of NaN (signaling or quiet) as `a'. */
+    sameClass = aSNaN ? bSNaN : (bNaN && !bSNaN);
+    if (!sameClass)
+        return (aSNaN && bNaN) ? b : a;
+
+    /* Same kind on both sides: larger fraction wins, then the `exp' compare. */
+    if (a.fraction < b.fraction) return b;
+    if (b.fraction < a.fraction) return a;
+    return (a.exp < b.exp) ? a : b;
+}
+
+#endif /* FLOATX80 */
+
+#ifdef FLOAT128
+
+/*----------------------------------------------------------------------------
+| Takes two quadruple-precision floating-point values `a' and `b', one of
+| which is a NaN, and returns the appropriate NaN result.  If either `a' or
+| `b' is a signaling NaN, the invalid exception is raised.
+*----------------------------------------------------------------------------*/
+
+float128 propagateFloat128NaN(float128 a, float128 b, struct float_status_t *status)
+{
+    /* Classify both operands before bit 47 of `hi' is forced on below. */
+    const int aNaN  = float128_is_nan(a);
+    const int aSNaN = float128_is_signaling_nan(a);
+    const int bNaN  = float128_is_nan(b);
+    const int bSNaN = float128_is_signaling_nan(b);
+    int sameClass;
+
+    a.hi |= BX_CONST64(0x0000800000000000);
+    b.hi |= BX_CONST64(0x0000800000000000);
+    if (aSNaN || bSNaN)
+        float_raise(status, float_flag_invalid);
+    if (!aSNaN && !aNaN) return b;
+
+    /* True when `b' is the same kind of NaN (signaling or quiet) as `a'. */
+    sameClass = aSNaN ? bSNaN : (bNaN && !bSNaN);
+    if (!sameClass)
+        return (aSNaN && bNaN) ? b : a;
+
+    /* Same kind on both sides: compare significands, then the `hi' words. */
+    if (lt128(a.hi<<1, a.lo, b.hi<<1, b.lo)) return b;
+    if (lt128(b.hi<<1, b.lo, a.hi<<1, a.lo)) return a;
+    return (a.hi < b.hi) ? a : b;
+}
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated quadruple-precision NaN.
+*----------------------------------------------------------------------------*/
+const float128 float128_default_nan = packFloat128(
+    float128_default_nan_hi, float128_default_nan_lo);
+
+#endif /* FLOAT128 */
--- a/src/cpu/softfloat/softfloat-specialize.h
+++ b/src/cpu/softfloat/softfloat-specialize.h
@@ -0,0 +1,788 @@
+/*============================================================================
+This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b.
+
+Written by John R. Hauser.  This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704.  Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.  The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+#ifndef _SOFTFLOAT_SPECIALIZE_H_
+#define _SOFTFLOAT_SPECIALIZE_H_
+
+#include "softfloat.h"
+
+/*============================================================================
+ * Adapted for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#define int16_indefinite ((Bit16s)0x8000)                   /* only bit 15 set */
+#define int32_indefinite ((Bit32s)0x80000000)               /* only bit 31 set */
+#define int64_indefinite BX_CONST64(0x8000000000000000)     /* only bit 63 set */
+
+#define uint16_indefinite (0xffff)                          /* all 16 bits set */
+#define uint32_indefinite (0xffffffff)                      /* all 32 bits set */
+#define uint64_indefinite BX_CONST64(0xffffffffffffffff)    /* all 64 bits set */
+
+/*----------------------------------------------------------------------------
+| Internal canonical NaN format.
+*----------------------------------------------------------------------------*/
+
+typedef struct {
+    int sign;        /* sign of the source NaN, as stored by the *ToCommonNaN helpers */
+    Bit64u hi, lo;   /* payload; the half/single/double/extended converters fill `hi' and zero `lo' */
+} commonNaNT;
+
+#ifdef FLOAT16
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated half-precision NaN.
+*----------------------------------------------------------------------------*/
+extern const float16 float16_default_nan;
+
+#define float16_fraction extractFloat16Frac
+#define float16_exp extractFloat16Exp
+#define float16_sign extractFloat16Sign
+
+/*----------------------------------------------------------------------------
+| Returns the fraction bits of the half-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit16u extractFloat16Frac(float16 a)
+{
+    return (Bit16u) (a & 0x03FF);   /* low 10 bits */
+}
+
+/*----------------------------------------------------------------------------
+| Returns the exponent bits of the half-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit16s extractFloat16Exp(float16 a)
+{
+    return (Bit16s) ((a >> 10) & 0x1F);   /* bits 10..14 */
+}
+
+/*----------------------------------------------------------------------------
+| Returns the sign bit of the half-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int extractFloat16Sign(float16 a)
+{
+    return (int) (a >> 15);   /* bit 15 */
+}
+
+/*----------------------------------------------------------------------------
+| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
+| half-precision floating-point value, returning the result.  After being
+| shifted into the proper positions, the three fields are simply added
+| together to form the result.  This means that any integer portion of `zSig'
+| will be added into the exponent.  Since a properly normalized significand
+| will have an integer portion equal to 1, the `zExp' input should be 1 less
+| than the desired result exponent whenever `zSig' is a complete, normalized
+| significand.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float16 packFloat16(int zSign, int zExp, Bit16u zSig)
+{
+    return zSig + (((Bit16u) zExp)<<10) + (((Bit16u) zSign)<<15);
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the half-precision floating-point value `a' is a NaN;
+| otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float16_is_nan(float16 a)
+{
+    return ((Bit16u) (a<<1)) > 0xF800;   /* sign shifted out before the compare */
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the half-precision floating-point value `a' is a signaling
+| NaN; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float16_is_signaling_nan(float16 a)
+{
+    return ((a & 0x7E00) == 0x7C00) && ((a & 0x01FF) != 0);
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the half-precision floating-point value `a' is denormal;
+| otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float16_is_denormal(float16 a)
+{
+    return !extractFloat16Exp(a) && extractFloat16Frac(a);
+}
+
+/*----------------------------------------------------------------------------
+| Convert float16 denormals to zero.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float16 float16_denormal_to_zero(float16 a)
+{
+    /* A denormal keeps only its sign bit; anything else passes through. */
+    return float16_is_denormal(a) ? (float16) (a & 0x8000) : a;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the half-precision floating-point NaN
+| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
+| exception is raised.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE commonNaNT float16ToCommonNaN(float16 a, struct float_status_t *status)
+{
+    commonNaNT nan;
+    if (float16_is_signaling_nan(a)) float_raise(status, float_flag_invalid);
+    nan.sign = a>>15;
+    nan.hi = ((Bit64u) a)<<54;   /* the shift drops the sign and exponent bits */
+    nan.lo = 0;
+    return nan;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the canonical NaN `a' to the half-
+| precision floating-point format.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float16 commonNaNToFloat16(commonNaNT a)
+{
+    return 0x7E00 | (Bit16u) (a.hi>>54) | (((Bit16u) a.sign)<<15);
+}
+
+#endif
+
+/*----------------------------------------------------------------------------
+| Commonly used single-precision floating point constants
+*----------------------------------------------------------------------------*/
+extern const float32 float32_negative_inf;
+extern const float32 float32_positive_inf;
+extern const float32 float32_negative_zero;
+extern const float32 float32_positive_zero;
+extern const float32 float32_negative_one;
+extern const float32 float32_positive_one;
+extern const float32 float32_max_float;
+extern const float32 float32_min_float;
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated single-precision NaN.
+*----------------------------------------------------------------------------*/
+extern const float32 float32_default_nan;
+
+#define float32_fraction extractFloat32Frac
+#define float32_exp extractFloat32Exp
+#define float32_sign extractFloat32Sign
+
+#define FLOAT32_EXP_BIAS 0x7F
+
+/*----------------------------------------------------------------------------
+| Returns the fraction bits of the single-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit32u extractFloat32Frac(float32 a)
+{
+    return (Bit32u) (a & 0x007FFFFF);   /* low 23 bits */
+}
+
+/*----------------------------------------------------------------------------
+| Returns the exponent bits of the single-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit16s extractFloat32Exp(float32 a)
+{
+    return (Bit16s) ((a >> 23) & 0xFF);   /* bits 23..30 */
+}
+
+/*----------------------------------------------------------------------------
+| Returns the sign bit of the single-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int extractFloat32Sign(float32 a)
+{
+    return (int) (a >> 31);   /* bit 31 */
+}
+
+/*----------------------------------------------------------------------------
+| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
+| single-precision floating-point value, returning the result.  After being
+| shifted into the proper positions, the three fields are simply added
+| together to form the result.  This means that any integer portion of `zSig'
+| will be added into the exponent.  Since a properly normalized significand
+| will have an integer portion equal to 1, the `zExp' input should be 1 less
+| than the desired result exponent whenever `zSig' is a complete, normalized
+| significand.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float32 packFloat32(int zSign, Bit16s zExp, Bit32u zSig)
+{
+    return zSig + (((Bit32u) zExp)<<23) + (((Bit32u) zSign)<<31);
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the single-precision floating-point value `a' is a NaN;
+| otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float32_is_nan(float32 a)
+{
+    return ((Bit32u) (a<<1)) > 0xFF000000;   /* sign shifted out before the compare */
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the single-precision floating-point value `a' is a signaling
+| NaN; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float32_is_signaling_nan(float32 a)
+{
+    return ((a & 0x7FC00000) == 0x7F800000) && ((a & 0x003FFFFF) != 0);
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the single-precision floating-point value `a' is denormal;
+| otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float32_is_denormal(float32 a)
+{
+    return !extractFloat32Exp(a) && extractFloat32Frac(a);
+}
+
+/*----------------------------------------------------------------------------
+| Convert float32 denormals to zero.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float32 float32_denormal_to_zero(float32 a)
+{
+    /* A denormal keeps only its sign bit; anything else passes through. */
+    return float32_is_denormal(a) ? (a & 0x80000000) : a;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the single-precision floating-point NaN
+| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
+| exception is raised.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE commonNaNT float32ToCommonNaN(float32 a, struct float_status_t *status)
+{
+    commonNaNT nan;
+    if (float32_is_signaling_nan(a)) float_raise(status, float_flag_invalid);
+    nan.sign = a>>31;
+    nan.hi = ((Bit64u) a)<<41;   /* the shift drops the sign and exponent bits */
+    nan.lo = 0;
+    return nan;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the canonical NaN `a' to the single-
+| precision floating-point format.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float32 commonNaNToFloat32(commonNaNT a)
+{
+    return 0x7FC00000 | (Bit32u) (a.hi>>41) | (((Bit32u) a.sign)<<31);
+}
+
+/*----------------------------------------------------------------------------
+| Takes two single-precision floating-point values `a' and `b', one of which
+| is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
+| signaling NaN, the invalid exception is raised.
+*----------------------------------------------------------------------------*/
+
+float32 propagateFloat32NaN(float32 a, float32 b, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Takes single-precision floating-point NaN `a' and returns the appropriate
+| NaN result.  If `a' is a signaling NaN, the invalid exception is raised.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float32 propagateFloat32NaNOne(float32 a, struct float_status_t *status)
+{
+    if (float32_is_signaling_nan(a)) {
+        float_raise(status, float_flag_invalid);
+    }
+    return a | 0x00400000;
+}
+
+/*----------------------------------------------------------------------------
+| Commonly used double-precision floating point constants
+*----------------------------------------------------------------------------*/
+extern const float64 float64_negative_inf;
+extern const float64 float64_positive_inf;
+extern const float64 float64_negative_zero;
+extern const float64 float64_positive_zero;
+extern const float64 float64_negative_one;
+extern const float64 float64_positive_one;
+extern const float64 float64_max_float;
+extern const float64 float64_min_float;
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated double-precision NaN.
+*----------------------------------------------------------------------------*/
+extern const float64 float64_default_nan;
+
+#define float64_fraction extractFloat64Frac
+#define float64_exp extractFloat64Exp
+#define float64_sign extractFloat64Sign
+
+#define FLOAT64_EXP_BIAS 0x3FF
+
+/*----------------------------------------------------------------------------
+| Returns the fraction bits of the double-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit64u extractFloat64Frac(float64 a)
+{
+    return (Bit64u) (a & BX_CONST64(0x000FFFFFFFFFFFFF));   /* low 52 bits */
+}
+
+/*----------------------------------------------------------------------------
+| Returns the exponent bits of the double-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit16s extractFloat64Exp(float64 a)
+{
+    return (Bit16s) ((a >> 52) & 0x7FF);   /* bits 52..62 */
+}
+
+/*----------------------------------------------------------------------------
+| Returns the sign bit of the double-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int extractFloat64Sign(float64 a)
+{
+    return (a >> 63) != 0;   /* bit 63 */
+}
+
+/*----------------------------------------------------------------------------
+| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
+| double-precision floating-point value, returning the result.  After being
+| shifted into the proper positions, the three fields are simply added
+| together to form the result.  This means that any integer portion of `zSig'
+| will be added into the exponent.  Since a properly normalized significand
+| will have an integer portion equal to 1, the `zExp' input should be 1 less
+| than the desired result exponent whenever `zSig' is a complete, normalized
+| significand.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float64 packFloat64(int zSign, Bit16s zExp, Bit64u zSig)
+{
+    return zSig + (((Bit64u) zExp)<<52) + (((Bit64u) zSign)<<63);
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the double-precision floating-point value `a' is a NaN;
+| otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float64_is_nan(float64 a)
+{
+    return ((Bit64u) (a<<1)) > BX_CONST64(0xFFE0000000000000);
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the double-precision floating-point value `a' is a signaling
+| NaN; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float64_is_signaling_nan(float64 a)
+{
+    return ((a & BX_CONST64(0x7FF8000000000000)) == BX_CONST64(0x7FF0000000000000)) && ((a & BX_CONST64(0x0007FFFFFFFFFFFF)) != 0);
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the double-precision floating-point value `a' is denormal;
+| otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float64_is_denormal(float64 a)
+{
+    return !extractFloat64Exp(a) && extractFloat64Frac(a);
+}
+
+/*----------------------------------------------------------------------------
+| Convert float64 denormals to zero.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float64 float64_denormal_to_zero(float64 a)
+{
+    /* A denormal keeps only its sign bit; anything else passes through. */
+    return float64_is_denormal(a) ? (a & ((Bit64u)(1) << 63)) : a;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the double-precision floating-point NaN
+| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
+| exception is raised.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE commonNaNT float64ToCommonNaN(float64 a, struct float_status_t *status)
+{
+    commonNaNT nan;
+    if (float64_is_signaling_nan(a)) float_raise(status, float_flag_invalid);
+    nan.sign = (int)(a>>63);
+    nan.hi = a<<12;   /* the shift drops the sign and exponent bits */
+    nan.lo = 0;
+    return nan;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the canonical NaN `a' to the double-
+| precision floating-point format.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float64 commonNaNToFloat64(commonNaNT a)
+{
+    return BX_CONST64(0x7FF8000000000000) | (a.hi>>12) | (((Bit64u) a.sign)<<63);
+}
+
+/*----------------------------------------------------------------------------
+| Takes two double-precision floating-point values `a' and `b', one of which
+| is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
+| signaling NaN, the invalid exception is raised.
+*----------------------------------------------------------------------------*/
+
+float64 propagateFloat64NaN(float64 a, float64 b, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Takes double-precision floating-point NaN `a' and returns the appropriate
+| NaN result.  If `a' is a signaling NaN, the invalid exception is raised.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float64 propagateFloat64NaNOne(float64 a, struct float_status_t *status)
+{
+    if (float64_is_signaling_nan(a)) {
+        float_raise(status, float_flag_invalid);
+    }
+    return a | BX_CONST64(0x0008000000000000);
+}
+
+#ifdef FLOATX80
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated extended double-precision NaN.  The
+| `high' and `low' values hold the most- and least-significant bits,
+| respectively.
+*----------------------------------------------------------------------------*/
+#define floatx80_default_nan_exp 0xFFFF
+#define floatx80_default_nan_fraction BX_CONST64(0xC000000000000000)
+
+#define floatx80_fraction extractFloatx80Frac
+#define floatx80_exp extractFloatx80Exp
+#define floatx80_sign extractFloatx80Sign
+
+#define FLOATX80_EXP_BIAS 0x3FFF
+
+/*----------------------------------------------------------------------------
+| Returns the fraction bits of the extended double-precision floating-point
+| value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit64u extractFloatx80Frac(floatx80 a)
+{
+    return a.fraction;   /* the `fraction' member is returned unmodified */
+}
+
+/*----------------------------------------------------------------------------
+| Returns the exponent bits of the extended double-precision floating-point
+| value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit32s extractFloatx80Exp(floatx80 a)
+{
+    return (Bit32s) (a.exp & 0x7FFF);   /* low 15 bits of `exp' */
+}
+
+/*----------------------------------------------------------------------------
+| Returns the sign bit of the extended double-precision floating-point value
+| `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int extractFloatx80Sign(floatx80 a)
+{
+    return (int) (a.exp >> 15);   /* bit 15 of `exp' */
+}
+
+/*----------------------------------------------------------------------------
+| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
+| extended double-precision floating-point value, returning the result.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE floatx80 packFloatx80(int zSign, Bit32s zExp, Bit64u zSig)
+{
+    floatx80 packed;
+    packed.exp = zExp + (zSign << 15);
+    packed.fraction = zSig;
+    return packed;
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the extended double-precision floating-point value `a' is a
+| NaN; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int floatx80_is_nan(floatx80 a)
+{
+    return ((a.exp & 0x7FFF) == 0x7FFF) && ((Bit64u) (a.fraction<<1) != 0);
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the extended double-precision floating-point value `a' is a
+| signaling NaN; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int floatx80_is_signaling_nan(floatx80 a)
+{
+    if ((a.exp & 0x7FFF) != 0x7FFF) return 0;
+    if (a.fraction & BX_CONST64(0x4000000000000000)) return 0;   /* bit 62 set */
+    return (a.fraction & BX_CONST64(0x3FFFFFFFFFFFFFFF)) != 0;
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the extended double-precision floating-point value `a' uses an
+| unsupported encoding; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int floatx80_is_unsupported(floatx80 a)
+{
+    return ((a.exp & 0x7FFF) != 0) && ((a.fraction & BX_CONST64(0x8000000000000000)) == 0);
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the extended double-precision floating-
+| point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
+| invalid exception is raised.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE commonNaNT floatx80ToCommonNaN(floatx80 a, struct float_status_t *status)
+{
+    commonNaNT canonical;
+    canonical.hi = a.fraction << 1;   /* bit 63 of the fraction is shifted out */
+    canonical.lo = 0;
+    canonical.sign = a.exp >> 15;     /* top bit of the 16-bit exp word */
+    if (floatx80_is_signaling_nan(a)) float_raise(status, float_flag_invalid);
+    return canonical;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the canonical NaN `a' to the extended
+| double-precision floating-point format.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE floatx80 commonNaNToFloatx80(commonNaNT a)
+{
+    floatx80 result;
+    result.exp = 0x7FFF | (((Bit16u) a.sign)<<15);                  /* all-ones exponent with the sign in bit 15 */
+    result.fraction = (a.hi>>1) | BX_CONST64(0xC000000000000000);   /* bits 63 and 62 are always set */
+    return result;
+}
+
+/*----------------------------------------------------------------------------
+| Takes two extended double-precision floating-point values `a' and `b', one
+| of which is a NaN, and returns the appropriate NaN result.  If either `a' or
+| `b' is a signaling NaN, the invalid exception is raised.
+*----------------------------------------------------------------------------*/
+
+floatx80 propagateFloatx80NaN(floatx80 a, floatx80 b, struct float_status_t *status); /* prototype only; no inline body is given at this point */
+
+/*----------------------------------------------------------------------------
+| Takes extended double-precision floating-point  NaN  `a' and returns the
+| appropriate NaN result. If `a' is a signaling NaN, the invalid exception
+| is raised.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE floatx80 propagateFloatx80NaNOne(floatx80 a, struct float_status_t *status)
+{
+    const int wasSignaling = floatx80_is_signaling_nan(a);  /* sampled before bit 62 is forced on below */
+    a.fraction |= BX_CONST64(0xC000000000000000);           /* set bits 63 and 62 of the fraction */
+    if (wasSignaling) {
+        float_raise(status, float_flag_invalid);
+    }
+    return a;
+}
+
+#endif /* FLOATX80 */
+
+#ifdef FLOAT128
+
+#include "softfloat-macros.h"
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated quadruple-precision NaN. The `high' and
+| `low' values hold the most- and least-significant bits, respectively.
+*----------------------------------------------------------------------------*/
+#define float128_default_nan_hi BX_CONST64(0xFFFF800000000000)
+#define float128_default_nan_lo BX_CONST64(0x0000000000000000)
+/* float128_exp is an alternate spelling of extractFloat128Exp. */
+#define float128_exp extractFloat128Exp
+
+/*----------------------------------------------------------------------------
+| Returns the least-significant 64 fraction bits of the quadruple-precision
+| floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit64u extractFloat128Frac1(float128 a)
+{
+    return a.lo; /* the whole low 64-bit word, no masking applied */
+}
+
+/*----------------------------------------------------------------------------
+| Returns the most-significant 48 fraction bits of the quadruple-precision
+| floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit64u extractFloat128Frac0(float128 a)
+{
+    return BX_CONST64(0x0000FFFFFFFFFFFF) & a.hi; /* keep bits 47..0 of the high word */
+}
+
+/*----------------------------------------------------------------------------
+| Returns the exponent bits of the quadruple-precision floating-point value
+| `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit32s extractFloat128Exp(float128 a)
+{
+    return 0x7FFF & (Bit32s) (a.hi>>48); /* bits 62..48 of the high word; the sign bit is masked off */
+}
+
+/*----------------------------------------------------------------------------
+| Returns the sign bit of the quadruple-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int extractFloat128Sign(float128 a)
+{
+    return (int)(a.hi >> 63); /* bit 63 of the high word, yielding 0 or 1 */
+}
+
+/*----------------------------------------------------------------------------
+| Packs the sign `zSign', the exponent `zExp', and the significand formed
+| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
+| floating-point value, returning the result.  After being shifted into the
+| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
+| added together to form the most significant 32 bits of the result.  This
+| means that any integer portion of `zSig0' will be added into the exponent.
+| Since a properly normalized significand will have an integer portion equal
+| to 1, the `zExp' input should be 1 less than the desired result exponent
+| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
+| significand.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float128 packFloat128Four(int zSign, Bit32s zExp, Bit64u zSig0, Bit64u zSig1)
+{
+    float128 packed;
+    packed.hi = zSig0 + (((Bit64u) zExp)<<48) + (((Bit64u) zSign)<<63); /* plain addition: overflow of zSig0 carries into the exponent */
+    packed.lo = zSig1;
+    return packed;
+}
+
+/*----------------------------------------------------------------------------
+| Packs two 64-bit integers, `zHi' and `zLo', into a quadruple-precision
+| floating-point value, returning the result.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float128 packFloat128(Bit64u zHi, Bit64u zLo)
+{
+    float128 packed;
+    packed.hi = zHi;  /* both words are stored verbatim */
+    packed.lo = zLo;
+    return packed;
+}
+
+#ifdef _MSC_VER /* brace-initializer form: values listed as lo, hi; arguments are used exactly as written */
+#define PACK_FLOAT_128(hi,lo) { lo, hi }
+#else /* function form: each argument is wrapped in BX_CONST64 and handed to packFloat128() */
+#define PACK_FLOAT_128(hi,lo) packFloat128(BX_CONST64(hi),BX_CONST64(lo))
+#endif /* _MSC_VER */
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
+| otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float128_is_nan(float128 a)
+{
+    return ((Bit64u) (a.hi<<1) >= BX_CONST64(0xFFFE000000000000))    /* sign shifted out; exponent field is all ones */
+        && ((0 != (a.hi & BX_CONST64(0x0000FFFFFFFFFFFF))) || (0 != a.lo)); /* some fraction bit is set */
+}
+
+/*----------------------------------------------------------------------------
+| Returns 1 if the quadruple-precision floating-point value `a' is a
+| signaling NaN; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float128_is_signaling_nan(float128 a)
+{
+    return (0xFFFE == ((a.hi>>47) & 0xFFFF))    /* exponent all ones and bit 47 of the high word clear */
+        && ((0 != (a.hi & BX_CONST64(0x00007FFFFFFFFFFF))) || (0 != a.lo)); /* some bit below bit 47 is set */
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the quadruple-precision floating-point NaN
+| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
+| exception is raised.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE commonNaNT float128ToCommonNaN(float128 a, struct float_status_t *status)
+{
+    commonNaNT canonical;
+    shortShift128Left(a.hi, a.lo, 16, &canonical.hi, &canonical.lo); /* result arrives through the two out-pointers */
+    canonical.sign = (int)(a.hi>>63);
+    if (float128_is_signaling_nan(a)) float_raise(status, float_flag_invalid);
+    return canonical;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the canonical NaN `a' to the quadruple-
+| precision floating-point format.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float128 commonNaNToFloat128(commonNaNT a)
+{
+    float128 result;
+    shift128Right(a.hi, a.lo, 16, &result.hi, &result.lo);   /* fills both words of result before the OR below */
+    result.hi |= BX_CONST64(0x7FFF800000000000) | (((Bit64u) a.sign)<<63); /* exponent bits, bit 47, and the sign */
+    return result;
+}
+
+/*----------------------------------------------------------------------------
+| Takes two quadruple-precision floating-point values `a' and `b', one of
+| which is a NaN, and returns the appropriate NaN result.  If either `a' or
+| `b' is a signaling NaN, the invalid exception is raised.
+*----------------------------------------------------------------------------*/
+
+float128 propagateFloat128NaN(float128 a, float128 b, struct float_status_t *status); /* prototype only; no inline body is given at this point */
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated quadruple-precision NaN (declared here without an initializer).
+*----------------------------------------------------------------------------*/
+extern const float128 float128_default_nan;
+
+#endif /* FLOAT128 */
+
+#endif
--- a/src/cpu/softfloat/softfloat.cc
+++ b/src/cpu/softfloat/softfloat.cc
--- a/src/cpu/softfloat/softfloat.h
+++ b/src/cpu/softfloat/softfloat.h
@@ -0,0 +1,488 @@
+/*============================================================================
+This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+
+Written by John R. Hauser.  This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704.  Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.  The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Adapted for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#include "config.h"      /* generated by configure script from config.h.in */
+
+#ifndef _SOFTFLOAT_H_
+#define _SOFTFLOAT_H_
+
+#define FLOAT16 /* enables the float16 typedef and the float16 prototypes in this header */
+#define FLOATX80 /* enables the floatx80 sections; FLOAT128 is not defined by this header itself */
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point types.
+*----------------------------------------------------------------------------*/
+#ifdef FLOAT16
+typedef Bit16u float16; /* 16-bit format value, carried as a Bit16u integer */
+#endif
+typedef Bit32u float32; /* single-precision value, carried as a Bit32u integer */
+typedef Bit64u float64; /* double-precision value, carried as a Bit64u integer */
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point class.
+*----------------------------------------------------------------------------*/
+typedef enum {
+    float_zero         = 0,
+    float_SNaN         = 1,
+    float_QNaN         = 2,
+    float_negative_inf = 3,
+    float_positive_inf = 4,
+    float_denormal     = 5,
+    float_normalized   = 6
+} float_class_t;
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point NaN operands handling mode.
+*----------------------------------------------------------------------------*/
+enum float_nan_handling_mode_t {
+    float_larger_significand_nan = 0,   // this mode is used by the x87 FPU
+    float_first_operand_nan = 1	        // this mode is used by SSE
+};
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point rounding mode.
+*----------------------------------------------------------------------------*/
+enum float_round_t {
+    float_round_nearest_even = 0, // NOTE(review): values 0..3 look like the x86 RC field encoding; confirm callers rely on it
+    float_round_down         = 1,
+    float_round_up           = 2,
+    float_round_to_zero      = 3
+};
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point exception flags.
+*----------------------------------------------------------------------------*/
+enum float_exception_flag_t {
+    float_flag_invalid   = 0x01, // bit 0; each flag is a distinct bit so they can be OR-ed together
+    float_flag_denormal  = 0x02, // bit 1
+    float_flag_divbyzero = 0x04, // bit 2
+    float_flag_overflow  = 0x08, // bit 3
+    float_flag_underflow = 0x10, // bit 4
+    float_flag_inexact   = 0x20  // bit 5; NOTE(review): layout looks like the x87 status-word exception bits; confirm
+};
+
+extern const unsigned float_all_exceptions_mask; /* declaration only; the value is not given in this header */
+
+#ifdef FLOATX80
+#define RAISE_SW_C1 0x0200 /* bit 9; OR-ed into float_exception_flags by set_float_rounding_up() */
+#endif
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point ordering relations
+*----------------------------------------------------------------------------*/
+enum {
+    float_relation_less      = -1, // presumably the int results of float32_compare()/float64_compare(); confirm
+    float_relation_equal     =  0,
+    float_relation_greater   =  1,
+    float_relation_unordered =  2
+};
+
+/*----------------------------------------------------------------------------
+| Options to indicate which negations to perform in float*_muladd()
+| Using these differs from negating an input or output before calling
+| the muladd function in that this means that a NaN doesn't have its
+| sign bit inverted before it is propagated.
+*----------------------------------------------------------------------------*/
+enum {
+    float_muladd_negate_c       = 1, // passed as `flags' by float32_fmsub()/float64_fmsub()
+    float_muladd_negate_product = 2, // passed as `flags' by float32_fnmadd()/float64_fnmadd()
+    float_muladd_negate_result  = float_muladd_negate_c | float_muladd_negate_product // == 3; passed by the *_fnmsub() wrappers
+};
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point status structure.
+*----------------------------------------------------------------------------*/
+struct float_status_t
+{
+#ifdef FLOATX80
+    int float_rounding_precision;	/* floatx80 only; read by get_float_rounding_precision() */
+#endif
+    int float_rounding_mode; /* read by get_float_rounding_mode() */
+    int float_exception_flags; /* accumulated with |= by float_raise() and set_float_rounding_up() */
+    int float_exception_masks; /* AND-ed with the queried flag in float_exception_masked() */
+    int float_suppress_exception; /* bits set here are removed from the get_exception_flags() result */
+    int float_nan_handling_mode;	/* flag register; read by get_float_nan_handling_mode() */
+    int flush_underflow_to_zero;	/* flag register; read by get_flush_underflow_to_zero() */
+    int denormals_are_zeros;            /* flag register; read by get_denormals_are_zeros() */
+};
+
+/*----------------------------------------------------------------------------
+| Routine to raise any or all of the software IEC/IEEE floating-point
+| exception flags.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE void float_raise(struct float_status_t *status, int flags)
+{
+    status->float_exception_flags = status->float_exception_flags | flags; /* existing bits are kept, new ones are added */
+}
+
+/*----------------------------------------------------------------------------
+| Returns raised IEC/IEEE floating-point exception flags.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int get_exception_flags(const struct float_status_t *status)
+{
+    return ~status->float_suppress_exception & status->float_exception_flags; /* suppressed bits are filtered out of the result */
+}
+
+/*----------------------------------------------------------------------------
+| Routine to check if any or all of the software IEC/IEEE floating-point
+| exceptions are masked.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float_exception_masked(const struct float_status_t *status, int flag)
+{
+    return flag & status->float_exception_masks; /* non-zero when any queried bit is masked; not normalized to 1 */
+}
+
+/*----------------------------------------------------------------------------
+| Returns current floating point rounding mode specified by status word.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int get_float_rounding_mode(const struct float_status_t *status)
+{
+    return status->float_rounding_mode; /* plain field read; status is not modified */
+}
+
+/*----------------------------------------------------------------------------
+| Returns current floating point precision (floatx80 only).
+*----------------------------------------------------------------------------*/
+
+#ifdef FLOATX80
+BX_CPP_INLINE int get_float_rounding_precision(const struct float_status_t *status)
+{
+    return status->float_rounding_precision; /* this field exists only when FLOATX80 is defined */
+}
+#endif /* FLOATX80 */
+
+/*----------------------------------------------------------------------------
+| Returns current floating point NaN operands handling mode specified
+| by status word.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int get_float_nan_handling_mode(const struct float_status_t *status)
+{
+    return status->float_nan_handling_mode; /* presumably one of the float_nan_handling_mode_t values; confirm */
+}
+
+/*----------------------------------------------------------------------------
+| Raise floating point precision lost up flag (floatx80 only).
+*----------------------------------------------------------------------------*/
+
+#ifdef FLOATX80
+BX_CPP_INLINE void set_float_rounding_up(struct float_status_t *status)
+{
+    float_raise(status, RAISE_SW_C1); /* ORs the C1 marker into float_exception_flags */
+}
+#endif
+
+/*----------------------------------------------------------------------------
+| Returns the `denormals_are_zeros' field of `status' unchanged; a non-zero
+| value presumably means the <denormals-are-zeros> feature is switched on.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int get_denormals_are_zeros(const struct float_status_t *status)
+{
+    return status->denormals_are_zeros;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the `flush_underflow_to_zero' field of `status' unchanged; a non-zero
+| value presumably means the <flush-underflow-to-zero> feature is switched on.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int get_flush_underflow_to_zero(const struct float_status_t *status)
+{
+    return status->flush_underflow_to_zero;
+}
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE integer-to-floating-point conversion routines.
+*----------------------------------------------------------------------------*/
+float32 int32_to_float32(Bit32s, struct float_status_t *status);
+float64 int32_to_float64(Bit32s); /* takes no status argument; presumably because the conversion is always exact */
+float32 int64_to_float32(Bit64s, struct float_status_t *status);
+float64 int64_to_float64(Bit64s, struct float_status_t *status);
+
+float32 uint32_to_float32(Bit32u, struct float_status_t *status);
+float64 uint32_to_float64(Bit32u); /* takes no status argument, like int32_to_float64 */
+float32 uint64_to_float32(Bit64u, struct float_status_t *status);
+float64 uint64_to_float64(Bit64u, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE single-precision conversion routines.
+*----------------------------------------------------------------------------*/
+Bit32s float32_to_int32(float32, struct float_status_t *status);
+Bit32s float32_to_int32_round_to_zero(float32, struct float_status_t *status); /* NOTE(review): the *_round_to_zero names suggest truncation regardless of the status rounding mode; confirm */
+Bit64s float32_to_int64(float32, struct float_status_t *status);
+Bit64s float32_to_int64_round_to_zero(float32, struct float_status_t *status);
+Bit32u float32_to_uint32(float32, struct float_status_t *status);
+Bit32u float32_to_uint32_round_to_zero(float32, struct float_status_t *status);
+Bit64u float32_to_uint64(float32, struct float_status_t *status);
+Bit64u float32_to_uint64_round_to_zero(float32, struct float_status_t *status);
+float64 float32_to_float64(float32, struct float_status_t *status); /* the only float-to-float conversion in this group */
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE single-precision operations.
+*----------------------------------------------------------------------------*/
+float32 float32_round_to_int(float32, Bit8u scale, struct float_status_t *status); /* float32_round_to_int_one() calls this with scale = 0 */
+float32 float32_add(float32, float32, struct float_status_t *status);
+float32 float32_sub(float32, float32, struct float_status_t *status);
+float32 float32_mul(float32, float32, struct float_status_t *status);
+float32 float32_div(float32, float32, struct float_status_t *status);
+float32 float32_sqrt(float32, struct float_status_t *status);
+float32 float32_frc(float32, struct float_status_t *status);
+float32 float32_muladd(float32, float32, float32, int flags, struct float_status_t *status); /* flags: 0 or float_muladd_negate_* values, as passed by the fmadd-family wrappers */
+float32 float32_scalef(float32, float32, struct float_status_t *status);
+int float32_compare(float32, float32, int quiet, struct float_status_t *status); /* quiet: 0 from float32_compare_two(), 1 from float32_compare_quiet() */
+
+BX_CPP_INLINE float32 float32_round_to_int_one(float32 a, struct float_status_t *status)
+{
+  return float32_round_to_int(a, 0, status); /* scale argument fixed at 0 */
+}
+
+BX_CPP_INLINE float32 float32_fmadd(float32 a, float32 b, float32 c, struct float_status_t *status)
+{
+  return float32_muladd(a, b, c, 0, status); /* flags = 0: no negation option requested */
+}
+
+BX_CPP_INLINE float32 float32_fmsub(float32 a, float32 b, float32 c, struct float_status_t *status)
+{
+  return float32_muladd(a, b, c, float_muladd_negate_c, status); /* flags = negate_c only */
+}
+
+BX_CPP_INLINE float32 float32_fnmadd(float32 a, float32 b, float32 c, struct float_status_t *status)
+{
+  return float32_muladd(a, b, c, float_muladd_negate_product, status); /* flags = negate_product only */
+}
+
+BX_CPP_INLINE float32 float32_fnmsub(float32 a, float32 b, float32 c, struct float_status_t *status)
+{
+  return float32_muladd(a, b, c, float_muladd_negate_result, status); /* flags = negate_c | negate_product */
+}
+
+BX_CPP_INLINE int float32_compare_two(float32 a, float32 b, struct float_status_t *status)
+{
+  return float32_compare(a, b, 0, status); /* quiet = 0 */
+}
+
+BX_CPP_INLINE int float32_compare_quiet(float32 a, float32 b, struct float_status_t *status)
+{
+  return float32_compare(a, b, 1, status); /* quiet = 1 */
+}
+
+float_class_t float32_class(float32); /* takes no status argument */
+
+float32 float32_min(float32 a, float32 b, struct float_status_t *status);
+float32 float32_max(float32 a, float32 b, struct float_status_t *status);
+
+float32 float32_minmax(float32 a, float32 b, int is_max, int is_abs, struct float_status_t *status);
+float32 float32_getexp(float32 a, struct float_status_t *status);
+float32 float32_getmant(float32 a, struct float_status_t *status, int sign_ctrl, int interv); /* note: status is the second parameter here, not the last */
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE double-precision conversion routines.
+*----------------------------------------------------------------------------*/
+Bit32s float64_to_int32(float64, struct float_status_t *status);
+Bit32s float64_to_int32_round_to_zero(float64, struct float_status_t *status); /* NOTE(review): the *_round_to_zero names suggest truncation regardless of the status rounding mode; confirm */
+Bit64s float64_to_int64(float64, struct float_status_t *status);
+Bit64s float64_to_int64_round_to_zero(float64, struct float_status_t *status);
+Bit32u float64_to_uint32(float64, struct float_status_t *status);
+Bit32u float64_to_uint32_round_to_zero(float64, struct float_status_t *status);
+Bit64u float64_to_uint64(float64, struct float_status_t *status);
+Bit64u float64_to_uint64_round_to_zero(float64, struct float_status_t *status);
+float32 float64_to_float32(float64, struct float_status_t *status); /* the only float-to-float conversion in this group */
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE double-precision operations.
+*----------------------------------------------------------------------------*/
+float64 float64_round_to_int(float64, Bit8u scale, struct float_status_t *status); /* float64_round_to_int_one() calls this with scale = 0 */
+float64 float64_add(float64, float64, struct float_status_t *status);
+float64 float64_sub(float64, float64, struct float_status_t *status);
+float64 float64_mul(float64, float64, struct float_status_t *status);
+float64 float64_div(float64, float64, struct float_status_t *status);
+float64 float64_sqrt(float64, struct float_status_t *status);
+float64 float64_frc(float64, struct float_status_t *status);
+float64 float64_muladd(float64, float64, float64, int flags, struct float_status_t *status); /* flags: 0 or float_muladd_negate_* values, as passed by the fmadd-family wrappers */
+float64 float64_scalef(float64, float64, struct float_status_t *status);
+int float64_compare(float64, float64, int quiet, struct float_status_t *status); /* quiet: 0 from float64_compare_two(), 1 from float64_compare_quiet() */
+
+BX_CPP_INLINE float64 float64_round_to_int_one(float64 a, struct float_status_t *status)
+{
+  return float64_round_to_int(a, 0, status); /* scale argument fixed at 0 */
+}
+
+BX_CPP_INLINE float64 float64_fmadd(float64 a, float64 b, float64 c, struct float_status_t *status)
+{
+  return float64_muladd(a, b, c, 0, status); /* flags = 0: no negation option requested */
+}
+
+BX_CPP_INLINE float64 float64_fmsub(float64 a, float64 b, float64 c, struct float_status_t *status)
+{
+  return float64_muladd(a, b, c, float_muladd_negate_c, status); /* flags = negate_c only */
+}
+
+BX_CPP_INLINE float64 float64_fnmadd(float64 a, float64 b, float64 c, struct float_status_t *status)
+{
+  return float64_muladd(a, b, c, float_muladd_negate_product, status); /* flags = negate_product only */
+}
+
+BX_CPP_INLINE float64 float64_fnmsub(float64 a, float64 b, float64 c, struct float_status_t *status)
+{
+  return float64_muladd(a, b, c, float_muladd_negate_result, status); /* flags = negate_c | negate_product */
+}
+
+BX_CPP_INLINE int float64_compare_two(float64 a, float64 b, struct float_status_t *status)
+{
+  return float64_compare(a, b, 0, status); /* quiet = 0 */
+}
+
+BX_CPP_INLINE int float64_compare_quiet(float64 a, float64 b, struct float_status_t *status)
+{
+  return float64_compare(a, b, 1, status); /* quiet = 1 */
+}
+
+float_class_t float64_class(float64); /* takes no status argument */
+
+float64 float64_min(float64 a, float64 b, struct float_status_t *status);
+float64 float64_max(float64 a, float64 b, struct float_status_t *status);
+
+float64 float64_minmax(float64 a, float64 b, int is_max, int is_abs, struct float_status_t *status);
+float64 float64_getexp(float64 a, struct float_status_t *status);
+float64 float64_getmant(float64 a, struct float_status_t *status, int sign_ctrl, int interv); /* note: status is the second parameter here, not the last */
+
+#ifdef FLOAT16
+float32 float16_to_float32(float16, struct float_status_t *status);
+float16 float32_to_float16(float32, struct float_status_t *status);
+
+float_class_t float16_class(float16); /* takes no status argument */
+#endif /* FLOAT16 */
+
+#ifdef FLOATX80
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point types.
+*----------------------------------------------------------------------------*/
+
+#ifdef BX_BIG_ENDIAN
+typedef struct floatx80 {	/* big-endian member order: exp first; alignment is left to the compiler */
+    Bit16u exp;       /* 16-bit word; packFloatx80() places the sign at bit 15 */
+    Bit64u fraction;  /* 64-bit significand word */
+} floatx80;
+#else
+typedef struct floatx80 {  /* default member order: fraction first */
+    Bit64u fraction;  /* 64-bit significand word */
+    Bit16u exp;       /* 16-bit word; packFloatx80() places the sign at bit 15 */
+} floatx80;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE integer-to-floating-point conversion routines.
+*----------------------------------------------------------------------------*/
+floatx80 int32_to_floatx80(Bit32s); /* takes no status argument */
+floatx80 int64_to_floatx80(Bit64s); /* takes no status argument */
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE extended double-precision conversion routines.
+*----------------------------------------------------------------------------*/
+floatx80 float32_to_floatx80(float32, struct float_status_t *status);
+floatx80 float64_to_floatx80(float64, struct float_status_t *status);
+
+Bit32s floatx80_to_int32(floatx80, struct float_status_t *status);
+Bit32s floatx80_to_int32_round_to_zero(floatx80, struct float_status_t *status);
+Bit64s floatx80_to_int64(floatx80, struct float_status_t *status);
+Bit64s floatx80_to_int64_round_to_zero(floatx80, struct float_status_t *status);
+
+float32 floatx80_to_float32(floatx80, struct float_status_t *status);
+float64 floatx80_to_float64(floatx80, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE extended double-precision operations.
+*----------------------------------------------------------------------------*/
+floatx80 floatx80_round_to_int(floatx80, struct float_status_t *status); /* unlike the float32/float64 versions, there is no scale parameter */
+floatx80 floatx80_add(floatx80, floatx80, struct float_status_t *status);
+floatx80 floatx80_sub(floatx80, floatx80, struct float_status_t *status);
+floatx80 floatx80_mul(floatx80, floatx80, struct float_status_t *status);
+floatx80 floatx80_div(floatx80, floatx80, struct float_status_t *status);
+floatx80 floatx80_sqrt(floatx80, struct float_status_t *status);
+
+float_class_t floatx80_class(floatx80);
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif  /* FLOATX80 */
+
+#ifdef FLOAT128
+
+#ifdef BX_BIG_ENDIAN
+typedef struct float128 {
+    Bit64u hi, lo; /* big-endian member order: hi word first */
+} float128;
+#else
+typedef struct float128 {
+    Bit64u lo, hi; /* default member order: lo word first */
+} float128;
+#endif /* BX_BIG_ENDIAN */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE quadruple-precision conversion routines.
+*----------------------------------------------------------------------------*/
+float128 floatx80_to_float128(floatx80 a, struct float_status_t *status);
+floatx80 float128_to_floatx80(float128 a, struct float_status_t *status);
+
+float128 int64_to_float128(Bit64s a); /* takes no status argument */
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE extended double-precision operations.
+*----------------------------------------------------------------------------*/
+floatx80 floatx80_128_mul(floatx80 a, float128 b, struct float_status_t *status); /* mixed operands: floatx80 times float128, floatx80 result */
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE quadruple-precision operations.
+*----------------------------------------------------------------------------*/
+float128 float128_add(float128 a, float128 b, struct float_status_t *status);
+float128 float128_sub(float128 a, float128 b, struct float_status_t *status);
+float128 float128_mul(float128 a, float128 b, struct float_status_t *status);
+float128 float128_div(float128 a, float128 b, struct float_status_t *status);
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif  /* FLOAT128 */
+
+#endif
--- a/src/cpu/softfloat/softfloat16.cc
+++ b/src/cpu/softfloat/softfloat16.cc
@@ -0,0 +1,129 @@
+/*============================================================================
+This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+
+Written by John R. Hauser.  This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704.  Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.  The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Adapted for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#include "softfloat.h"
+
+#ifdef FLOAT16
+
+#include "softfloat-round-pack.h"
+#include "softfloat-specialize.h"
+#include "softfloat-macros.h"
+
+/*----------------------------------------------------------------------------
+| Determine half-precision floating-point number class
+*----------------------------------------------------------------------------*/
+
+float_class_t float16_class(float16 a)
+{
+   /* Split the half-precision encoding into its three fields. */
+   const Bit16s exponent = extractFloat16Exp(a);
+   const Bit16u fraction = extractFloat16Frac(a);
+   const int    negative = extractFloat16Sign(a);
+
+   switch (exponent) {
+       case 0x1F: /* all-ones exponent: infinity or NaN */
+           if (fraction != 0)
+               return (fraction & 0x200) ? float_QNaN : float_SNaN;
+           if (negative)
+               return float_negative_inf;
+           return float_positive_inf;
+       case 0: /* zero exponent: zero or denormal */
+           return (fraction == 0) ? float_zero : float_denormal;
+       default:
+           return float_normalized;
+   }
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the half-precision floating-point value
+| `a' to the single-precision floating-point format.  The conversion is
+| performed according to the IEC/IEEE Standard for Binary Floating-Point
+| Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float32 float16_to_float32(float16 a, struct float_status_t *status)
+{
+    int    sign = extractFloat16Sign(a);
+    Bit16s expo = extractFloat16Exp(a);
+    Bit16u frac = extractFloat16Frac(a);
+
+    if (expo == 0x1F) { /* NaN goes through the common-NaN path, infinity keeps its sign */
+        return frac ? commonNaNToFloat32(float16ToCommonNaN(a, status))
+                    : packFloat32(sign, 0xFF, 0);
+    }
+    if (expo == 0) {
+        /* the denormals_are_zeros flag is deliberately not consulted here */
+        if (!frac) return packFloat32(sign, 0, 0);
+        float_raise(status, float_flag_denormal);
+        normalizeFloat16Subnormal(frac, &expo, &frac);
+        expo -= 1;
+    }
+
+    /* re-bias the exponent by 0x70 and shift the fraction left by 13 bits */
+    return packFloat32(sign, expo + 0x70, ((Bit32u) frac) << 13);
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the single-precision floating-point value
+| `a' to the half-precision floating-point format.  The conversion is
+| performed according to the IEC/IEEE Standard for Binary Floating-Point
+| Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float16 float32_to_float16(float32 a, struct float_status_t *status)
+{
+    int    sign = extractFloat32Sign(a);
+    Bit16s expo = extractFloat32Exp(a);
+    Bit32u frac = extractFloat32Frac(a);
+
+    if (expo == 0xFF) { /* NaN or infinity */
+        return frac ? commonNaNToFloat16(float32ToCommonNaN(a, status))
+                    : packFloat16(sign, 0x1F, 0);
+    }
+    if (expo == 0) { /* zero or denormal input */
+        if (get_denormals_are_zeros(status) || frac == 0)
+            return packFloat16(sign, 0, 0);
+        float_raise(status, float_flag_denormal);
+    }
+
+    /* right-shift the fraction by 9 (with jamming) and keep the low 16 bits */
+    Bit16u zSig = (Bit16u) shift32RightJamming(frac, 9);
+    if (expo != 0 || zSig != 0) {
+        zSig |= 0x4000; /* set bit 14, just above the shifted fraction */
+        expo -= 0x71;
+    }
+
+    return roundAndPackFloat16(sign, expo, zSig, status);
+}
+
+#endif
--- a/src/cpu/softfloat/softfloatx80.cc
+++ b/src/cpu/softfloat/softfloatx80.cc
@@ -0,0 +1,367 @@
+/*============================================================================
+This source file is an extension to the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b, written for Bochs (x86 architecture simulator)
+floating point emulation.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Written for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#include "softfloatx80.h"
+#include "softfloat-round-pack.h"
+#include "softfloat-macros.h"
+
+const floatx80 Const_QNaN = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction); /* default NaN; NOTE(review): no extern declaration in softfloatx80.h -- confirm it is only needed here */
+const floatx80 Const_Z    = packFloatx80(0, 0x0000, 0);                                 /* +0.0 */
+const floatx80 Const_1    = packFloatx80(0, 0x3fff, BX_CONST64(0x8000000000000000));    /* +1.0 */
+const floatx80 Const_L2T  = packFloatx80(0, 0x4000, BX_CONST64(0xd49a784bcd1b8afe));    /* log2(10) ~= 3.32193 */
+const floatx80 Const_L2E  = packFloatx80(0, 0x3fff, BX_CONST64(0xb8aa3b295c17f0bc));    /* log2(e)  ~= 1.44270 */
+const floatx80 Const_PI   = packFloatx80(0, 0x4000, BX_CONST64(0xc90fdaa22168c235));    /* pi       ~= 3.14159 */
+const floatx80 Const_LG2  = packFloatx80(0, 0x3ffd, BX_CONST64(0x9a209a84fbcff799));    /* log10(2) ~= 0.30103 */
+const floatx80 Const_LN2  = packFloatx80(0, 0x3ffe, BX_CONST64(0xb17217f7d1cf79ac));    /* ln(2)    ~= 0.69315 */
+const floatx80 Const_INF  = packFloatx80(0, 0x7fff, BX_CONST64(0x8000000000000000));    /* +infinity */
+
+/*----------------------------------------------------------------------------
+| Commonly used single-precision floating point constants
+*----------------------------------------------------------------------------*/
+const float32 float32_negative_inf  = 0xff800000; /* sign set, exponent all ones, fraction zero */
+const float32 float32_positive_inf  = 0x7f800000; /* same pattern with the sign clear */
+const float32 float32_negative_zero = 0x80000000; /* only the sign bit set */
+const float32 float32_positive_zero = 0x00000000;
+const float32 float32_negative_one  = 0xbf800000;
+const float32 float32_positive_one  = 0x3f800000;
+const float32 float32_max_float     = 0x7f7fffff; /* largest finite magnitude, positive */
+const float32 float32_min_float     = 0xff7fffff; /* max_float with the sign bit set: the most negative finite value, not the smallest positive one */
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated single-precision NaN.
+*----------------------------------------------------------------------------*/
+const float32 float32_default_nan   = 0xffc00000; /* sign and top fraction bit set, remaining fraction bits zero */
+
+/*----------------------------------------------------------------------------
+| Commonly used double-precision floating point constants
+*----------------------------------------------------------------------------*/
+const float64 float64_negative_inf  = BX_CONST64(0xfff0000000000000); /* sign set, exponent all ones, fraction zero */
+const float64 float64_positive_inf  = BX_CONST64(0x7ff0000000000000); /* same pattern with the sign clear */
+const float64 float64_negative_zero = BX_CONST64(0x8000000000000000); /* only the sign bit set */
+const float64 float64_positive_zero = BX_CONST64(0x0000000000000000);
+const float64 float64_negative_one  = BX_CONST64(0xbff0000000000000);
+const float64 float64_positive_one  = BX_CONST64(0x3ff0000000000000);
+const float64 float64_max_float     = BX_CONST64(0x7fefffffffffffff); /* largest finite magnitude, positive */
+const float64 float64_min_float     = BX_CONST64(0xffefffffffffffff); /* max_float with the sign bit set: the most negative finite value, not the smallest positive one */
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated double-precision NaN.
+*----------------------------------------------------------------------------*/
+const float64 float64_default_nan = BX_CONST64(0xFFF8000000000000); /* sign and top fraction bit set, remaining fraction bits zero */
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the extended double-precision floating-
+| point value `a' to the 16-bit two's complement integer format.  The
+| conversion is performed according to the IEC/IEEE Standard for Binary
+| Floating-Point Arithmetic - which means in particular that the conversion
+| is rounded according to the current rounding mode. If `a' is a NaN or the
+| conversion overflows, the integer indefinite value is returned.
+*----------------------------------------------------------------------------*/
+
+Bit16s floatx80_to_int16(floatx80 a, struct float_status_t *status)
+{
+   /* Unsupported encodings are rejected before any conversion is attempted. */
+   if (floatx80_is_unsupported(a)) {
+        float_raise(status, float_flag_invalid);
+        return int16_indefinite;
+   }
+
+   /* Convert through the 32-bit routine, then range-check for 16 bits. */
+   const Bit32s wide = floatx80_to_int32(a, status);
+   if (wide >= -32768 && wide <= 32767)
+        return (Bit16s) wide;
+
+   status->float_exception_flags = float_flag_invalid; /* overwrites every flag set so far */
+   return int16_indefinite;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the extended double-precision floating-
+| point value `a' to the 16-bit two's complement integer format.  The
+| conversion is performed according to the IEC/IEEE Standard for Binary
+| Floating-Point Arithmetic, except that the conversion is always rounded
+| toward zero.  If `a' is a NaN or the conversion overflows, the integer
+| indefinite value is returned.
+*----------------------------------------------------------------------------*/
+
+Bit16s floatx80_to_int16_round_to_zero(floatx80 a, struct float_status_t *status)
+{
+   /* Unsupported encodings are rejected before any conversion is attempted. */
+   if (floatx80_is_unsupported(a)) {
+        float_raise(status, float_flag_invalid);
+        return int16_indefinite;
+   }
+
+   /* Truncate through the 32-bit routine, then range-check for 16 bits. */
+   const Bit32s wide = floatx80_to_int32_round_to_zero(a, status);
+   if (wide >= -32768 && wide <= 32767)
+        return (Bit16s) wide;
+
+   status->float_exception_flags = float_flag_invalid; /* overwrites every flag set so far */
+   return int16_indefinite;
+}
+
+/*----------------------------------------------------------------------------
+| Separate the source extended double-precision floating point value `a'
+| into its exponent and significand, store the significand back to the
+| 'a' and return the exponent. The operation performed is a superset of
+| the IEC/IEEE recommended logb(x) function.
+*----------------------------------------------------------------------------*/
+
+floatx80 floatx80_extract(floatx80 *a, struct float_status_t *status)
+{
+/*----------------------------------------------------------------------------
+| The pattern for a default generated extended double-precision NaN.
+*----------------------------------------------------------------------------*/
+    const floatx80 floatx80_default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+    /* Decode the operand up front; several paths below overwrite *a. */
+    Bit64u aSig = extractFloatx80Frac(*a);
+    Bit32s aExp = extractFloatx80Exp(*a);
+    int   aSign = extractFloatx80Sign(*a);
+    /* Unsupported encoding: raise invalid; *a and the result are both the default NaN. */
+    if (floatx80_is_unsupported(*a))
+    {
+        float_raise(status, float_flag_invalid);
+        *a = floatx80_default_nan;
+        return *a;
+    }
+    /* Exponent 0x7FFF: a NaN is propagated into *a and returned; infinity returns +infinity and leaves *a as is. */
+    if (aExp == 0x7FFF) {
+        if ((Bit64u) (aSig<<1))
+        {
+            *a = propagateFloatx80NaNOne(*a, status);
+            return *a;
+        }
+        return packFloatx80(0, 0x7FFF, BX_CONST64(0x8000000000000000));
+    }
+    if (aExp == 0)
+    {
+        if (aSig == 0) { /* zero operand */
+            float_raise(status, float_flag_divbyzero);
+            *a = packFloatx80(aSign, 0, 0); /* significand part: zero with the operand's sign */
+            return packFloatx80(1, 0x7FFF, BX_CONST64(0x8000000000000000)); /* exponent part: -infinity */
+        }
+        float_raise(status, float_flag_denormal);
+        normalizeFloatx80Subnormal(aSig, &aExp, &aSig); /* updates aExp and aSig in place */
+    }
+    /* Remaining case: *a keeps its sign and significand with the exponent field set to 0x3FFF; aExp - 0x3FFF is returned as a floatx80. */
+    a->exp = (aSign << 15) + 0x3FFF;
+    a->fraction = aSig;
+    return int32_to_floatx80(aExp - 0x3FFF);
+}
+
+/*----------------------------------------------------------------------------
+| Scales extended double-precision floating-point value in operand `a' by
+| value `b'. The function truncates the value in the second operand 'b' to
+| an integral value and adds that value to the exponent of the operand 'a'.
+| The operation performed according to the IEC/IEEE Standard for Binary
+| Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+floatx80 floatx80_scale(floatx80 a, floatx80 b, struct float_status_t *status)
+{
+/*----------------------------------------------------------------------------
+| The pattern for a default generated extended double-precision NaN.
+*----------------------------------------------------------------------------*/
+    const floatx80 floatx80_default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+    /* Operand fields; they are only filled in after the unsupported-encoding check. */
+    Bit32s aExp, bExp;
+    Bit64u aSig, bSig;
+
+    // handle unsupported extended double-precision floating encodings
+    if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
+    {
+        float_raise(status, float_flag_invalid);
+        return floatx80_default_nan;
+    }
+    /* Decode sign, exponent and significand of both operands. */
+    aSig = extractFloatx80Frac(a);
+    aExp = extractFloatx80Exp(a);
+    int aSign = extractFloatx80Sign(a);
+    bSig = extractFloatx80Frac(b);
+    bExp = extractFloatx80Exp(b);
+    int bSign = extractFloatx80Sign(b);
+    /* Case 1: a has the maximum exponent (infinity or NaN). */
+    if (aExp == 0x7FFF) {
+        if ((Bit64u) (aSig<<1) || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
+        {
+            return propagateFloatx80NaN(a, b, status); /* at least one operand is a NaN */
+        }
+        if ((bExp == 0x7FFF) && bSign) { /* infinity scaled by -infinity: invalid */
+            float_raise(status, float_flag_invalid);
+            return floatx80_default_nan;
+        }
+        if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
+        return a; /* infinity stays infinity for every other b */
+    }
+    if (bExp == 0x7FFF) { /* Case 2: a is finite, b is infinity or NaN */
+        if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
+        if ((aExp | aSig) == 0) { /* a is zero */
+            if (! bSign) { /* zero scaled by +infinity: invalid */
+                float_raise(status, float_flag_invalid);
+                return floatx80_default_nan;
+            }
+            return a; /* zero scaled by -infinity stays zero */
+        }
+        if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
+        if (bSign) return packFloatx80(aSign, 0, 0); /* non-zero a scaled by -infinity: zero with a's sign */
+        return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000)); /* scaled by +infinity: infinity with a's sign */
+    }
+    if (aExp == 0) { /* Case 3: a is zero or denormal; b is finite */
+        if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
+        if (aSig == 0) return a;
+        float_raise(status, float_flag_denormal);
+        normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
+        if (bExp < 0x3FFF) /* b's exponent is below 0x3FFF, so no scaling is applied: a is returned normalized and rounded */
+            return normalizeRoundAndPackFloatx80(80, aSign, aExp, aSig, 0, status);
+    }
+    if (bExp == 0) { /* Case 4: b is zero or denormal */
+        if (bSig == 0) return a;
+        float_raise(status, float_flag_denormal);
+        normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
+    }
+    /* bExp above 0x400E: pass an exponent far out of range so the rounding helper handles overflow or underflow. */
+    if (bExp > 0x400E) {
+        /* generate appropriate overflow/underflow */
+        return roundAndPackFloatx80(80, aSign,
+                          bSign ? -0x3FFF : 0x7FFF, aSig, 0, status);
+    }
+    /* bExp below 0x3FFF: b has no integer part, so a is returned unchanged. */
+    if (bExp < 0x3FFF) return a;
+    /* 0x403E is 0x3FFF + 63: the shift drops the fractional bits of b's significand (truncation). */
+    int shiftCount = 0x403E - bExp;
+    bSig >>= shiftCount;
+    Bit32s scale = (Bit32s) bSig;
+    if (bSign) scale = -scale; /* magnitude <= 0xFFFF: shiftCount >= 48 because bExp <= 0x400E here */
+    return
+        roundAndPackFloatx80(80, aSign, aExp+scale, aSig, 0, status);
+}
+
+/*----------------------------------------------------------------------------
+| Determine extended-precision floating-point number class.
+*----------------------------------------------------------------------------*/
+
+float_class_t floatx80_class(floatx80 a)
+{
+   const Bit64u significand = extractFloatx80Frac(a);
+   const Bit32s exponent    = extractFloatx80Exp(a);
+
+   /* Zero exponent: true zero, otherwise denormal or pseudo-denormal. */
+   if (exponent == 0)
+       return significand ? float_denormal : float_zero;
+
+   /* Any other exponent needs the most significant bit (bit 63) set; */
+   /* encodings without it are unsupported and reported as SNaN.      */
+   if ((significand >> 63) == 0)
+       return float_SNaN;
+
+   /* Below the maximum exponent the value is an ordinary normal number. */
+   if (exponent != 0x7fff)
+       return float_normalized;
+
+   /* Maximum exponent: any bit below bit 63 makes it a NaN, and bit 62 */
+   /* tells quiet from signaling; otherwise it is a signed infinity.    */
+   if ((Bit64u) (significand << 1) != 0) {
+       if (significand & BX_CONST64(0x4000000000000000))
+           return float_QNaN;
+       return float_SNaN;
+   }
+
+   return extractFloatx80Sign(a) ? float_negative_inf : float_positive_inf;
+}
+
+/*----------------------------------------------------------------------------
+| Compare  between  two extended precision  floating  point  numbers. Returns
+| 'float_relation_equal'  if the operands are equal, 'float_relation_less' if
+| the    value    'a'   is   less   than   the   corresponding   value   `b',
+| 'float_relation_greater' if the value 'a' is greater than the corresponding
+| value `b', or 'float_relation_unordered' otherwise.
+*----------------------------------------------------------------------------*/
+
+int floatx80_compare(floatx80 a, floatx80 b, int quiet, struct float_status_t *status)
+{
+    const float_class_t classA = floatx80_class(a);
+    const float_class_t classB = floatx80_class(b);
+
+    /* Signaling NaNs (which also cover unsupported encodings) are always invalid. */
+    if (classA == float_SNaN || classB == float_SNaN) {
+        float_raise(status, float_flag_invalid);
+        return float_relation_unordered;
+    }
+
+    /* Quiet NaNs are unordered too, but only the non-quiet compare signals. */
+    if (classA == float_QNaN || classB == float_QNaN) {
+        if (quiet == 0)
+            float_raise(status, float_flag_invalid);
+        return float_relation_unordered;
+    }
+
+    if (classA == float_denormal || classB == float_denormal)
+        float_raise(status, float_flag_denormal);
+
+    const int negA = extractFloatx80Sign(a);
+    const int negB = extractFloatx80Sign(b);
+
+    /* A zero ignores its own sign; the other operand's sign decides. */
+    if (classA == float_zero) {
+        if (classB == float_zero)
+            return float_relation_equal;
+        return negB ? float_relation_greater : float_relation_less;
+    }
+    if (classB == float_zero || negA != negB)
+        return negA ? float_relation_less : float_relation_greater;
+
+    /* Same sign, both non-zero: compare magnitudes after normalizing denormals. */
+    Bit64u sigA = extractFloatx80Frac(a);
+    Bit32s expA = extractFloatx80Exp(a);
+    Bit64u sigB = extractFloatx80Frac(b);
+    Bit32s expB = extractFloatx80Exp(b);
+
+    if (classA == float_denormal)
+        normalizeFloatx80Subnormal(sigA, &expA, &sigA);
+    if (classB == float_denormal)
+        normalizeFloatx80Subnormal(sigB, &expB, &sigB);
+
+    if (expA == expB && sigA == sigB)
+        return float_relation_equal;
+
+    /* a has the smaller magnitude when its (exponent, significand) pair is lower. */
+    const int aSmaller = (expA < expB) || ((expA == expB) && (sigA < sigB));
+    /* For negative operands the smaller magnitude is the greater value. */
+    if (aSmaller != (negA != 0))
+        return float_relation_less;
+    return float_relation_greater;
+}
+
+
+int floatx80_compare_two(floatx80 a, floatx80 b, struct float_status_t *status)
+{   /* quiet = 0: floatx80_compare also raises the invalid flag for quiet-NaN operands */
+    return floatx80_compare(a, b, 0, status);
+}
+
+int floatx80_compare_quiet(floatx80 a, floatx80 b, struct float_status_t *status)
+{   /* quiet = 1: quiet-NaN operands return unordered without raising the invalid flag */
+    return floatx80_compare(a, b, 1, status);
+}
--- a/src/cpu/softfloat/softfloatx80.h
+++ b/src/cpu/softfloat/softfloatx80.h
@@ -0,0 +1,113 @@
+/*============================================================================
+This source file is an extension to the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b, written for Bochs (x86 architecture simulator)
+floating point emulation.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+=============================================================================*/
+
+/*============================================================================
+ * Written for Bochs (x86 architecture simulator) by
+ *            Stanislav Shwartsman [sshwarts at sourceforge net]
+ * ==========================================================================*/
+
+#ifndef _SOFTFLOATX80_EXTENSIONS_H_
+#define _SOFTFLOATX80_EXTENSIONS_H_
+
+#include "softfloat.h"
+#include "softfloat-specialize.h"
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point-to-integer conversion routines.
+*----------------------------------------------------------------------------*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+Bit16s floatx80_to_int16(floatx80, struct float_status_t *status);
+Bit16s floatx80_to_int16_round_to_zero(floatx80, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE extended double-precision operations.
+*----------------------------------------------------------------------------*/
+
+floatx80 floatx80_extract(floatx80 *a, struct float_status_t *status);
+floatx80 floatx80_scale(floatx80 a, floatx80 b, struct float_status_t *status);
+int floatx80_remainder(floatx80 a, floatx80 b, floatx80 *r, Bit64u *q, struct float_status_t *status);
+int floatx80_ieee754_remainder(floatx80 a, floatx80 b, floatx80 *r, Bit64u *q, struct float_status_t *status);
+floatx80 f2xm1(floatx80 a, struct float_status_t *status);
+floatx80 fyl2x(floatx80 a, floatx80 b, struct float_status_t *status);
+floatx80 fyl2xp1(floatx80 a, floatx80 b, struct float_status_t *status);
+floatx80 fpatan(floatx80 a, floatx80 b, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE extended double-precision trigonometric functions.
+*----------------------------------------------------------------------------*/
+
+int fsincos(floatx80 a, floatx80 *sin_a, floatx80 *cos_a, struct float_status_t *status);
+int fsin(floatx80 *a, struct float_status_t *status);
+int fcos(floatx80 *a, struct float_status_t *status);
+int ftan(floatx80 *a, struct float_status_t *status);
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE extended double-precision compare.
+*----------------------------------------------------------------------------*/
+
+int floatx80_compare(floatx80, floatx80, int quiet, struct float_status_t *status);
+int floatx80_compare_two(floatx80 a, floatx80 b, struct float_status_t *status);
+int floatx80_compare_quiet(floatx80 a, floatx80 b, struct float_status_t *status);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*-----------------------------------------------------------------------------
+| Calculates the absolute value of the extended double-precision floating-point
+| value `a'.  The operation is performed according to the IEC/IEEE Standard
+| for Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE floatx80 floatx80_abs(floatx80 reg)
+{
+    reg.exp = reg.exp & 0x7FFF; /* clear bit 15 (the sign); the exponent bits are kept */
+    return reg;
+}
+
+/*-----------------------------------------------------------------------------
+| Changes the sign of the extended double-precision floating-point value 'a'.
+| The operation is performed according to the IEC/IEEE Standard for Binary
+| Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE floatx80 floatx80_chs(floatx80 reg)
+{
+    reg.exp = reg.exp ^ 0x8000; /* toggle bit 15 (the sign); the exponent bits are kept */
+    return reg;
+}
+
+/*-----------------------------------------------------------------------------
+| Commonly used extended double-precision floating-point constants.
+*----------------------------------------------------------------------------*/
+
+extern const floatx80 Const_Z;
+extern const floatx80 Const_1;
+extern const floatx80 Const_L2T;
+extern const floatx80 Const_L2E;
+extern const floatx80 Const_PI;
+extern const floatx80 Const_LG2;
+extern const floatx80 Const_LN2;
+extern const floatx80 Const_INF;
+#endif
--- a/src/cpu/softfloat/x87_ops_arith.h
+++ b/src/cpu/softfloat/x87_ops_arith.h
@@ -0,0 +1,750 @@
+/*
+ * sf_FPU(name, optype, a_size, load_var, rw, use_var, is_nan, cycle_postfix)
+ *
+ * Expands to six softfloat handlers for the memory-operand forms of the basic
+ * arithmetic instructions.  Every handler is named sf_<INSN><name>_a<a_size>
+ * and combines register 0 (ST(0)) with an operand read from memory, writing
+ * the result back to register 0:
+ *
+ *   sf_FADD   ST(0) + mem        sf_FSUB   ST(0) - mem
+ *   sf_FMUL   ST(0) * mem        sf_FSUBR  mem   - ST(0)
+ *   sf_FDIV   ST(0) / mem        sf_FDIVR  mem   / ST(0)
+ *
+ * Parameters:
+ *   name          - suffix pasted into the generated function names
+ *   optype        - C type of the local `temp' declared in every handler
+ *   a_size        - pasted onto fetch_ea_ to pick the address decoder
+ *   load_var      - lvalue that receives the memory operand (`rw')
+ *   rw            - expression that reads the operand from memory
+ *   use_var       - expression yielding the operand as a floatx80
+ *   is_nan        - expression; when it is non-zero the arithmetic call is
+ *                   skipped and `result' must already have been produced by
+ *                   that expression (the NaN helpers are handed &result)
+ *   cycle_postfix - suffix of the x87_timings / x87_concurrency members used
+ *
+ * Common flow of every handler: FP_ENTER(), pending-exception check, operand
+ * fetch (returning 1 if cpu_state.abrt is set), clear C1, stack-underflow
+ * handling when register 0 is tagged empty, then the operation with a status
+ * word derived from the current control word.  The result is stored only if
+ * FPU_exception() returns zero.  The handler then accounts cycles and
+ * returns 0.
+ *
+ * FADD, FSUB and FSUBR are all charged with the fadd timings; FDIV and FDIVR
+ * use fdiv for both the clock and the concurrency count; FMUL uses fmul.
+ */
+#define sf_FPU(name, optype, a_size, load_var, rw, use_var, is_nan, cycle_postfix) \
+    static int sf_FADD##name##_a##a_size(uint32_t fetchdat) \
+    { \
+        floatx80 a, result; \
+        struct float_status_t status; \
+        optype temp; \
+        FP_ENTER(); \
+        FPU_check_pending_exceptions(); \
+        fetch_ea_##a_size(fetchdat); \
+        SEG_CHECK_READ(cpu_state.ea_seg); \
+        load_var = rw; \
+        if (cpu_state.abrt) \
+            return 1; \
+        clear_C1(); \
+        if (IS_TAG_EMPTY(0)) { \
+            FPU_stack_underflow(fetchdat, 0, 0); \
+            goto next_ins; \
+        } \
+        status = i387cw_to_softfloat_status_word(i387_get_control_word()); \
+        a = FPU_read_regi(0); \
+        if (!(is_nan)) { \
+            result = floatx80_add(a, use_var, &status); \
+        } \
+        if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) \
+            FPU_save_regi(result, 0); \
+        \
+next_ins: \
+        CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fadd##cycle_postfix) : ((x87_timings.fadd##cycle_postfix) * cpu_multi)); \
+        CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fadd##cycle_postfix) : ((x87_concurrency.fadd##cycle_postfix) * cpu_multi)); \
+        return 0; \
+    } \
+    static int sf_FDIV##name##_a##a_size(uint32_t fetchdat) \
+    { \
+        floatx80 a, result; \
+        struct float_status_t status; \
+        optype temp; \
+        FP_ENTER(); \
+        FPU_check_pending_exceptions(); \
+        fetch_ea_##a_size(fetchdat); \
+        SEG_CHECK_READ(cpu_state.ea_seg); \
+        load_var = rw; \
+        if (cpu_state.abrt) \
+            return 1; \
+        clear_C1(); \
+        if (IS_TAG_EMPTY(0)) { \
+            FPU_stack_underflow(fetchdat, 0, 0); \
+            goto next_ins; \
+        } \
+        status = i387cw_to_softfloat_status_word(i387_get_control_word()); \
+        a = FPU_read_regi(0); \
+        if (!(is_nan)) { \
+            result = floatx80_div(a, use_var, &status); \
+        } \
+        if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) \
+            FPU_save_regi(result, 0); \
+        \
+next_ins: \
+        CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fdiv##cycle_postfix) : ((x87_timings.fdiv##cycle_postfix) * cpu_multi)); \
+        /* Division is charged with the fdiv concurrency, as sf_FDIVR is. */ \
+        CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fdiv##cycle_postfix) : ((x87_concurrency.fdiv##cycle_postfix) * cpu_multi)); \
+        return 0; \
+    } \
+    static int sf_FDIVR##name##_a##a_size(uint32_t fetchdat) \
+    { \
+        floatx80 a, result; \
+        struct float_status_t status; \
+        optype temp; \
+        FP_ENTER(); \
+        FPU_check_pending_exceptions(); \
+        fetch_ea_##a_size(fetchdat); \
+        SEG_CHECK_READ(cpu_state.ea_seg); \
+        load_var = rw; \
+        if (cpu_state.abrt) \
+            return 1; \
+        clear_C1(); \
+        if (IS_TAG_EMPTY(0)) { \
+            FPU_stack_underflow(fetchdat, 0, 0); \
+            goto next_ins; \
+        } \
+        status = i387cw_to_softfloat_status_word(i387_get_control_word()); \
+        a = FPU_read_regi(0); \
+        if (!(is_nan)) { \
+            result = floatx80_div(use_var, a, &status); \
+        } \
+        if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) \
+            FPU_save_regi(result, 0); \
+        \
+next_ins: \
+        CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fdiv##cycle_postfix) : ((x87_timings.fdiv##cycle_postfix) * cpu_multi)); \
+        CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fdiv##cycle_postfix) : ((x87_concurrency.fdiv##cycle_postfix) * cpu_multi)); \
+        return 0; \
+    } \
+    static int sf_FMUL##name##_a##a_size(uint32_t fetchdat) \
+    { \
+        floatx80 a, result; \
+        struct float_status_t status; \
+        optype temp; \
+        FP_ENTER(); \
+        FPU_check_pending_exceptions(); \
+        fetch_ea_##a_size(fetchdat); \
+        SEG_CHECK_READ(cpu_state.ea_seg); \
+        load_var = rw; \
+        if (cpu_state.abrt) \
+            return 1; \
+        clear_C1(); \
+        if (IS_TAG_EMPTY(0)) { \
+            FPU_stack_underflow(fetchdat, 0, 0); \
+            goto next_ins; \
+        } \
+        status = i387cw_to_softfloat_status_word(i387_get_control_word()); \
+        a = FPU_read_regi(0); \
+        if (!(is_nan)) { \
+            result = floatx80_mul(a, use_var, &status); \
+        } \
+        if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) \
+            FPU_save_regi(result, 0); \
+        \
+next_ins: \
+        CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fmul##cycle_postfix) : ((x87_timings.fmul##cycle_postfix) * cpu_multi)); \
+        CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fmul##cycle_postfix) : ((x87_concurrency.fmul##cycle_postfix) * cpu_multi)); \
+        return 0; \
+    } \
+    static int sf_FSUB##name##_a##a_size(uint32_t fetchdat) \
+    { \
+        floatx80 a, result; \
+        struct float_status_t status; \
+        optype temp; \
+        FP_ENTER(); \
+        FPU_check_pending_exceptions(); \
+        fetch_ea_##a_size(fetchdat); \
+        SEG_CHECK_READ(cpu_state.ea_seg); \
+        load_var = rw; \
+        if (cpu_state.abrt) \
+            return 1; \
+        clear_C1(); \
+        if (IS_TAG_EMPTY(0)) { \
+            FPU_stack_underflow(fetchdat, 0, 0); \
+            goto next_ins; \
+        } \
+        status = i387cw_to_softfloat_status_word(i387_get_control_word()); \
+        a = FPU_read_regi(0); \
+        if (!(is_nan)) { \
+            result = floatx80_sub(a, use_var, &status); \
+        } \
+        if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) \
+            FPU_save_regi(result, 0); \
+        \
+next_ins: \
+        CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fadd##cycle_postfix) : ((x87_timings.fadd##cycle_postfix) * cpu_multi)); \
+        CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fadd##cycle_postfix) : ((x87_concurrency.fadd##cycle_postfix) * cpu_multi)); \
+        return 0; \
+    } \
+    static int sf_FSUBR##name##_a##a_size(uint32_t fetchdat) \
+    { \
+        floatx80 a, result; \
+        struct float_status_t status; \
+        optype temp; \
+        FP_ENTER(); \
+        FPU_check_pending_exceptions(); \
+        fetch_ea_##a_size(fetchdat); \
+        SEG_CHECK_READ(cpu_state.ea_seg); \
+        load_var = rw; \
+        if (cpu_state.abrt) \
+            return 1; \
+        clear_C1(); \
+        if (IS_TAG_EMPTY(0)) { \
+            FPU_stack_underflow(fetchdat, 0, 0); \
+            goto next_ins; \
+        } \
+        status = i387cw_to_softfloat_status_word(i387_get_control_word()); \
+        a = FPU_read_regi(0); \
+        if (!(is_nan)) { \
+            result = floatx80_sub(use_var, a, &status); \
+        } \
+        if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) \
+            FPU_save_regi(result, 0); \
+        \
+next_ins: \
+        CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fadd##cycle_postfix) : ((x87_timings.fadd##cycle_postfix) * cpu_multi)); \
+        CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fadd##cycle_postfix) : ((x87_concurrency.fadd##cycle_postfix) * cpu_multi)); \
+        return 0; \
+    }
+
+// clang-format off
+sf_FPU(s, float32, 16, temp, geteal(), float32_to_floatx80(temp, &status), FPU_handle_NaN32(a, temp, &result, &status), _32) /* float32 operand, fetch_ea_16 address decode */
+#ifndef FPU_8087 /* the _a32 handlers are generated only when FPU_8087 is not defined */
+sf_FPU(s, float32, 32, temp, geteal(), float32_to_floatx80(temp, &status), FPU_handle_NaN32(a, temp, &result, &status), _32) /* float32 operand, fetch_ea_32 address decode */
+#endif
+sf_FPU(d, float64, 16, temp, geteaq(), float64_to_floatx80(temp, &status), FPU_handle_NaN64(a, temp, &result, &status), _64) /* float64 operand, fetch_ea_16 address decode */
+#ifndef FPU_8087 /* the _a32 handlers are generated only when FPU_8087 is not defined */
+sf_FPU(d, float64, 32, temp, geteaq(), float64_to_floatx80(temp, &status), FPU_handle_NaN64(a, temp, &result, &status), _64) /* float64 operand, fetch_ea_32 address decode */
+#endif
+/* Integer operands: is_nan is the constant 0, so the arithmetic call always runs; temp is cast to int16_t / int32_t before int32_to_floatx80(). */
+sf_FPU(iw, uint16_t, 16, temp, geteaw(), int32_to_floatx80((int16_t)temp), 0, _i16) /* 16-bit integer operand, fetch_ea_16 address decode */
+#ifndef FPU_8087 /* the _a32 handlers are generated only when FPU_8087 is not defined */
+sf_FPU(iw, uint16_t, 32, temp, geteaw(), int32_to_floatx80((int16_t)temp), 0, _i16) /* 16-bit integer operand, fetch_ea_32 address decode */
+#endif
+sf_FPU(il, uint32_t, 16, temp, geteal(), int32_to_floatx80((int32_t)temp), 0, _i32) /* 32-bit integer operand, fetch_ea_16 address decode */
+#ifndef FPU_8087 /* the _a32 handlers are generated only when FPU_8087 is not defined */
+sf_FPU(il, uint32_t, 32, temp, geteal(), int32_to_floatx80((int32_t)temp), 0, _i32) /* 32-bit integer operand, fetch_ea_32 address decode */
+#endif
+// clang-format on
+
+/*
+ * FADD ST(0), ST(j): ST(0) <- ST(0) + ST(j), with j taken from the low three
+ * bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register 0 and nothing is added.  Otherwise the sum is stored only if
+ * FPU_exception() returns zero for the flags raised by softfloat.  Cycles are
+ * charged with the fadd timings, multiplied by cpu_multi for FPU types below
+ * FPU_487SX.  Every path through this body returns 0.
+ */
+static int
+sf_FADD_st0_stj(uint32_t fetchdat)
+{
+    const uint32_t stj = fetchdat & 7;
+
+    FP_ENTER();
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(stj)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              augend    = FPU_read_regi(0);
+        floatx80              addend    = FPU_read_regi(stj);
+        floatx80              sum       = floatx80_add(augend, addend, &sf_status);
+
+        if (!FPU_exception(fetchdat, sf_status.float_exception_flags, 0))
+            FPU_save_regi(sum, 0);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? x87_timings.fadd : (x87_timings.fadd * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? x87_concurrency.fadd : (x87_concurrency.fadd * cpu_multi));
+    return 0;
+}
+/*
+ * FADD ST(i), ST(0): ST(i) <- ST(i) + ST(0), with i taken from the low three
+ * bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register i and nothing is added.  Otherwise the sum is stored only if
+ * FPU_exception() returns zero for the flags raised by softfloat.  Cycles are
+ * charged with the fadd timings, multiplied by cpu_multi for FPU types below
+ * FPU_487SX.  Every path through this body returns 0.
+ */
+static int
+sf_FADD_sti_st0(uint32_t fetchdat)
+{
+    const uint32_t sti = fetchdat & 7;
+
+    FP_ENTER();
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(sti)) {
+        FPU_stack_underflow(fetchdat, sti, 0);
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              augend    = FPU_read_regi(sti);
+        floatx80              addend    = FPU_read_regi(0);
+        floatx80              sum       = floatx80_add(augend, addend, &sf_status);
+
+        if (!FPU_exception(fetchdat, sf_status.float_exception_flags, 0))
+            FPU_save_regi(sum, sti);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? x87_timings.fadd : (x87_timings.fadd * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? x87_concurrency.fadd : (x87_concurrency.fadd * cpu_multi));
+    return 0;
+}
+
+/*
+ * FADDP ST(i), ST(0): ST(i) <- ST(i) + ST(0), then pop the register stack.
+ * i is taken from the low three bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register i with its pop argument set and nothing is added.  Otherwise the
+ * store and the pop both happen only if FPU_exception() returns zero for the
+ * flags raised by softfloat.  Cycles are charged with the fadd timings,
+ * multiplied by cpu_multi for FPU types below FPU_487SX.  Every path through
+ * this body returns 0.
+ */
+static int
+sf_FADDP_sti_st0(uint32_t fetchdat)
+{
+    const uint32_t sti = fetchdat & 7;
+
+    FP_ENTER();
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(sti)) {
+        FPU_stack_underflow(fetchdat, sti, 1);
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              augend    = FPU_read_regi(sti);
+        floatx80              addend    = FPU_read_regi(0);
+        floatx80              sum       = floatx80_add(augend, addend, &sf_status);
+
+        if (!FPU_exception(fetchdat, sf_status.float_exception_flags, 0)) {
+            FPU_save_regi(sum, sti);
+            FPU_pop();
+        }
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? x87_timings.fadd : (x87_timings.fadd * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? x87_concurrency.fadd : (x87_concurrency.fadd * cpu_multi));
+    return 0;
+}
+
+/*
+ * FDIV ST(0), ST(j): ST(0) <- ST(0) / ST(j), with j taken from the low three
+ * bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register 0 and no division is done.  The quotient is stored only if
+ * FPU_exception() returns zero.  Charged with the fdiv timings; every path
+ * through this body returns 0.
+ */
+static int
+sf_FDIV_st0_stj(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(0);            /* dividend: ST(0) */
+    b = FPU_read_regi(fetchdat & 7); /* divisor:  ST(j) */
+    result = floatx80_div(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0))
+        FPU_save_regi(result, 0);
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fdiv) : (x87_timings.fdiv * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fdiv) : (x87_concurrency.fdiv * cpu_multi));
+    return 0;
+}
+
+/*
+ * FDIV ST(i), ST(0): ST(i) <- ST(i) / ST(0), with i taken from the low three
+ * bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register i and no division is done.  The quotient is stored only if
+ * FPU_exception() returns zero.  Charged with the fdiv timings; every path
+ * through this body returns 0.
+ */
+static int
+sf_FDIV_sti_st0(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, fetchdat & 7, 0);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(fetchdat & 7); /* dividend: ST(i) */
+    b = FPU_read_regi(0);            /* divisor:  ST(0) */
+    result = floatx80_div(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0))
+        FPU_save_regi(result, fetchdat & 7);
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fdiv) : (x87_timings.fdiv * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fdiv) : (x87_concurrency.fdiv * cpu_multi));
+    return 0;
+}
+/*
+ * FDIVP ST(i), ST(0): ST(i) <- ST(i) / ST(0), then pop the register stack.
+ * i is taken from the low three bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register i with its pop argument set and no division is done.  The store
+ * and the pop both happen only if FPU_exception() returns zero.  Charged with
+ * the fdiv timings; every path through this body returns 0.
+ */
+static int
+sf_FDIVP_sti_st0(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, fetchdat & 7, 1);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(fetchdat & 7); /* dividend: ST(i) */
+    b = FPU_read_regi(0);            /* divisor:  ST(0) */
+    result = floatx80_div(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, fetchdat & 7);
+        FPU_pop();
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fdiv) : (x87_timings.fdiv * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fdiv) : (x87_concurrency.fdiv * cpu_multi));
+    return 0;
+}
+
+/*
+ * FDIVR ST(0), ST(j): ST(0) <- ST(j) / ST(0), with j taken from the low three
+ * bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register 0 and no division is done.  The quotient is stored only if
+ * FPU_exception() returns zero.  Charged with the fdiv timings; every path
+ * through this body returns 0.
+ */
+static int
+sf_FDIVR_st0_stj(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(fetchdat & 7); /* dividend: ST(j) (reversed form) */
+    b = FPU_read_regi(0);            /* divisor:  ST(0) */
+    result = floatx80_div(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0))
+        FPU_save_regi(result, 0);
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fdiv) : (x87_timings.fdiv * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fdiv) : (x87_concurrency.fdiv * cpu_multi));
+    return 0;
+}
+/*
+ * FDIVR ST(i), ST(0): ST(i) <- ST(0) / ST(i), with i taken from the low three
+ * bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register i and no division is done.  The quotient is stored only if
+ * FPU_exception() returns zero.  Charged with the fdiv timings; every path
+ * through this body returns 0.
+ */
+static int
+sf_FDIVR_sti_st0(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, fetchdat & 7, 0);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(0);            /* dividend: ST(0) (reversed form) */
+    b = FPU_read_regi(fetchdat & 7); /* divisor:  ST(i) */
+    result = floatx80_div(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0))
+        FPU_save_regi(result, fetchdat & 7);
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fdiv) : (x87_timings.fdiv * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fdiv) : (x87_concurrency.fdiv * cpu_multi));
+    return 0;
+}
+/*
+ * FDIVRP ST(i), ST(0): ST(i) <- ST(0) / ST(i), then pop the register stack.
+ * i is taken from the low three bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register i with its pop argument set and no division is done.  The store
+ * and the pop both happen only if FPU_exception() returns zero.  Charged with
+ * the fdiv timings; every path through this body returns 0.
+ */
+static int
+sf_FDIVRP_sti_st0(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, fetchdat & 7, 1);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(0);            /* dividend: ST(0) (reversed form) */
+    b = FPU_read_regi(fetchdat & 7); /* divisor:  ST(i) */
+    result = floatx80_div(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, fetchdat & 7);
+        FPU_pop();
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fdiv) : (x87_timings.fdiv * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fdiv) : (x87_concurrency.fdiv * cpu_multi));
+    return 0;
+}
+
+/*
+ * FMUL ST(0), ST(j): ST(0) <- ST(0) * ST(j), with j taken from the low three
+ * bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register 0 and no multiplication is done.  The product is stored only if
+ * FPU_exception() returns zero.  Charged with the fmul timings; every path
+ * through this body returns 0.
+ */
+static int
+sf_FMUL_st0_stj(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(0);
+    b = FPU_read_regi(fetchdat & 7);
+    result = floatx80_mul(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, 0);
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fmul) : (x87_timings.fmul * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fmul) : (x87_concurrency.fmul * cpu_multi));
+    return 0;
+}
+/*
+ * FMUL ST(i), ST(0): ST(i) <- ST(0) * ST(i), with i taken from the low three
+ * bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register i and no multiplication is done.  The product is stored only if
+ * FPU_exception() returns zero.  Charged with the fmul timings; every path
+ * through this body returns 0.
+ */
+static int
+sf_FMUL_sti_st0(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, fetchdat & 7, 0);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(0);
+    b = FPU_read_regi(fetchdat & 7);
+    result = floatx80_mul(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, fetchdat & 7);
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fmul) : (x87_timings.fmul * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fmul) : (x87_concurrency.fmul * cpu_multi));
+    return 0;
+}
+/*
+ * FMULP ST(i), ST(0): ST(i) <- ST(i) * ST(0), then pop the register stack.
+ * i is taken from the low three bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register i with its pop argument set and no multiplication is done.  The
+ * store and the pop both happen only if FPU_exception() returns zero.
+ * Charged with the fmul timings; every path through this body returns 0.
+ */
+static int
+sf_FMULP_sti_st0(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, fetchdat & 7, 1);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(fetchdat & 7);
+    b = FPU_read_regi(0);
+    result = floatx80_mul(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, fetchdat & 7);
+        FPU_pop();
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fmul) : (x87_timings.fmul * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fmul) : (x87_concurrency.fmul * cpu_multi));
+    return 0;
+}
+
+/*
+ * FSUB ST(0), ST(j): ST(0) <- ST(0) - ST(j), with j taken from the low three
+ * bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register 0 and no subtraction is done.  The difference is stored only if
+ * FPU_exception() returns zero.  Charged with the fadd timings; every path
+ * through this body returns 0.
+ */
+static int
+sf_FSUB_st0_stj(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(0);            /* minuend:    ST(0) */
+    b = FPU_read_regi(fetchdat & 7); /* subtrahend: ST(j) */
+    result = floatx80_sub(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, 0);
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fadd) : (x87_timings.fadd * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fadd) : (x87_concurrency.fadd * cpu_multi));
+    return 0;
+}
+/*
+ * FSUB ST(i), ST(0): ST(i) <- ST(i) - ST(0), with i taken from the low three
+ * bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register i and no subtraction is done.  The difference is stored only if
+ * FPU_exception() returns zero.  Charged with the fadd timings; every path
+ * through this body returns 0.
+ */
+static int
+sf_FSUB_sti_st0(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, fetchdat & 7, 0);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(fetchdat & 7); /* minuend:    ST(i) */
+    b = FPU_read_regi(0);            /* subtrahend: ST(0) */
+    result = floatx80_sub(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, fetchdat & 7);
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fadd) : (x87_timings.fadd * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fadd) : (x87_concurrency.fadd * cpu_multi));
+    return 0;
+}
+/*
+ * FSUBP ST(i), ST(0): ST(i) <- ST(i) - ST(0), then pop the register stack.
+ * i is taken from the low three bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register i with its pop argument set and no subtraction is done.  The store
+ * and the pop both happen only if FPU_exception() returns zero.  Charged with
+ * the fadd timings; every path through this body returns 0.
+ */
+static int
+sf_FSUBP_sti_st0(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, fetchdat & 7, 1);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(fetchdat & 7); /* minuend:    ST(i) */
+    b = FPU_read_regi(0);            /* subtrahend: ST(0) */
+    result = floatx80_sub(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, fetchdat & 7);
+        FPU_pop();
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fadd) : (x87_timings.fadd * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fadd) : (x87_concurrency.fadd * cpu_multi));
+    return 0;
+}
+
+/*
+ * FSUBR ST(0), ST(j): ST(0) <- ST(j) - ST(0), with j taken from the low three
+ * bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register 0 and no subtraction is done.  The difference is stored only if
+ * FPU_exception() returns zero.  Charged with the fadd timings; every path
+ * through this body returns 0.
+ */
+static int
+sf_FSUBR_st0_stj(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(fetchdat & 7); /* minuend:    ST(j) (reversed form) */
+    b = FPU_read_regi(0);            /* subtrahend: ST(0) */
+    result = floatx80_sub(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, 0);
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fadd) : (x87_timings.fadd * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fadd) : (x87_concurrency.fadd * cpu_multi));
+    return 0;
+}
+/*
+ * FSUBR ST(i), ST(0): ST(i) <- ST(0) - ST(i), with i taken from the low three
+ * bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register i and no subtraction is done.  The difference is stored only if
+ * FPU_exception() returns zero.  Charged with the fadd timings; every path
+ * through this body returns 0.
+ */
+static int
+sf_FSUBR_sti_st0(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, fetchdat & 7, 0);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(0);            /* minuend:    ST(0) (reversed form) */
+    b = FPU_read_regi(fetchdat & 7); /* subtrahend: ST(i) */
+    result = floatx80_sub(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, fetchdat & 7);
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fadd) : (x87_timings.fadd * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fadd) : (x87_concurrency.fadd * cpu_multi));
+    return 0;
+}
+/*
+ * FSUBRP ST(i), ST(0): ST(i) <- ST(0) - ST(i), then pop the register stack.
+ * i is taken from the low three bits of fetchdat.
+ *
+ * When either register is tagged empty, FPU_stack_underflow() is called for
+ * register i with its pop argument set and no subtraction is done.  The store
+ * and the pop both happen only if FPU_exception() returns zero.  Charged with
+ * the fadd timings; every path through this body returns 0.
+ */
+static int
+sf_FSUBRP_sti_st0(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(fetchdat & 7)) {
+        FPU_stack_underflow(fetchdat, fetchdat & 7, 1);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(0);            /* minuend:    ST(0) (reversed form) */
+    b = FPU_read_regi(fetchdat & 7); /* subtrahend: ST(i) */
+    result = floatx80_sub(a, b, &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, fetchdat & 7);
+        FPU_pop();
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fadd) : (x87_timings.fadd * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fadd) : (x87_concurrency.fadd * cpu_multi));
+    return 0;
+}
+
+/*
+ * FSQRT: ST(0) <- floatx80_sqrt(ST(0)).
+ *
+ * When register 0 is tagged empty, FPU_stack_underflow() is called for it and
+ * nothing is computed.  The root is stored only if FPU_exception() returns
+ * zero.  Charged with the fsqrt timings; every path through this body
+ * returns 0.
+ */
+static int
+sf_FSQRT(uint32_t fetchdat)
+{
+    floatx80 result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    result = floatx80_sqrt(FPU_read_regi(0), &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, 0);
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fsqrt) : (x87_timings.fsqrt * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fsqrt) : (x87_concurrency.fsqrt * cpu_multi));
+    return 0;
+}
+
+/*
+ * FRNDINT: ST(0) <- floatx80_round_to_int(ST(0)), using the status word
+ * derived from the current control word.
+ *
+ * When register 0 is tagged empty, FPU_stack_underflow() is called for it and
+ * nothing is computed.  The rounded value is stored only if FPU_exception()
+ * returns zero.  Charged with the frndint timings; every path through this
+ * body returns 0.
+ */
+static int
+sf_FRNDINT(uint32_t fetchdat)
+{
+    floatx80 result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    /* Same pending-exception check the sf_FADD_* handlers make before executing. */
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+        goto next_ins;
+    }
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    result = floatx80_round_to_int(FPU_read_regi(0), &status);
+
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(result, 0);
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.frndint) : (x87_timings.frndint * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.frndint) : (x87_concurrency.frndint * cpu_multi));
+    return 0;
+}
--- a/src/cpu/softfloat/x87_ops_compare.h
+++ b/src/cpu/softfloat/x87_ops_compare.h
@@ -0,0 +1,489 @@
+/*
+ * cmp_FPU() generates a pair of handlers that compare ST(0) with a memory
+ * operand: sf_FCOM<name>_a<a_size>, and sf_FCOMP<name>_a<a_size>, which
+ * additionally pops the stack.
+ *
+ *   name          - suffix pasted into the generated function names
+ *   optype        - C type of the local `temp` that receives the operand
+ *   a_size        - selects fetch_ea_<a_size>() and is pasted into the names
+ *   load_var      - lvalue the operand is stored into (`temp` in every
+ *                   instantiation below)
+ *   rw            - expression that reads the operand from memory
+ *   use_var       - expression yielding the operand as a floatx80; it may
+ *                   refer to the generated locals `temp` and `status`
+ *   is_nan        - expression which, when non-zero, forces the result to
+ *                   float_relation_unordered and raises float_flag_invalid;
+ *                   it may refer to the generated locals `a` and `temp`
+ *   cycle_postfix - pasted onto x87_timings.fcom / x87_concurrency.fcom
+ *
+ * Both handlers read the operand first and return 1 if cpu_state.abrt is
+ * set afterwards. With ST(0) tagged empty they raise FPU_EX_Stack_Underflow
+ * and set C0|C2|C3; the popping handler then pops only if is_IA_masked().
+ * Otherwise the comparison result is stored in the condition codes and the
+ * popping handler pops only if FPU_exception() returns zero.
+ *
+ * The local names `a`, `temp` and `status` are part of the macro's contract
+ * with its arguments and must not be renamed.
+ */
+#define cmp_FPU(name, optype, a_size, load_var, rw, use_var, is_nan, cycle_postfix) \
+    static int sf_FCOM##name##_a##a_size(uint32_t fetchdat) \
+    { \
+        struct float_status_t status; \
+        floatx80 a; \
+        optype temp; \
+        int relation; \
+        \
+        FP_ENTER(); \
+        fetch_ea_##a_size(fetchdat); \
+        SEG_CHECK_READ(cpu_state.ea_seg); \
+        load_var = rw; \
+        if (cpu_state.abrt) \
+            return 1; \
+        clear_C1(); \
+        if (IS_TAG_EMPTY(0)) { \
+            FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0); \
+            setcc(C0 | C2 | C3); \
+        } else { \
+            status = i387cw_to_softfloat_status_word(i387_get_control_word()); \
+            a = FPU_read_regi(0); \
+            if (is_nan) { \
+                relation = float_relation_unordered; \
+                float_raise(&status, float_flag_invalid); \
+            } else { \
+                relation = floatx80_compare_two(a, use_var, &status); \
+            } \
+            setcc(FPU_status_word_flags_fpu_compare(relation)); \
+            FPU_exception(fetchdat, status.float_exception_flags, 0); \
+        } \
+        \
+        CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? ((x87_timings.fcom##cycle_postfix) * cpu_multi) : (x87_timings.fcom##cycle_postfix)); \
+        CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? ((x87_concurrency.fcom##cycle_postfix) * cpu_multi) : (x87_concurrency.fcom##cycle_postfix)); \
+        return 0; \
+    } \
+    static int sf_FCOMP##name##_a##a_size(uint32_t fetchdat) \
+    { \
+        struct float_status_t status; \
+        floatx80 a; \
+        optype temp; \
+        int relation; \
+        \
+        FP_ENTER(); \
+        fetch_ea_##a_size(fetchdat); \
+        SEG_CHECK_READ(cpu_state.ea_seg); \
+        load_var = rw; \
+        if (cpu_state.abrt) \
+            return 1; \
+        clear_C1(); \
+        if (IS_TAG_EMPTY(0)) { \
+            FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0); \
+            setcc(C0 | C2 | C3); \
+            if (is_IA_masked()) \
+                FPU_pop(); \
+        } else { \
+            status = i387cw_to_softfloat_status_word(i387_get_control_word()); \
+            a = FPU_read_regi(0); \
+            if (is_nan) { \
+                relation = float_relation_unordered; \
+                float_raise(&status, float_flag_invalid); \
+            } else { \
+                relation = floatx80_compare_two(a, use_var, &status); \
+            } \
+            setcc(FPU_status_word_flags_fpu_compare(relation)); \
+            if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) \
+                FPU_pop(); \
+        } \
+        \
+        CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? ((x87_timings.fcom##cycle_postfix) * cpu_multi) : (x87_timings.fcom##cycle_postfix)); \
+        CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? ((x87_concurrency.fcom##cycle_postfix) * cpu_multi) : (x87_concurrency.fcom##cycle_postfix)); \
+        return 0; \
+    }
+
+/*
+ * Instantiations: float32, float64, int16 and int32 memory operands, each in
+ * an _a16 and an _a32 flavour (fetch_ea_16 / fetch_ea_32). The _a32 flavours
+ * are left out when FPU_8087 is defined. The integer forms pass 0 for is_nan.
+ */
+// clang-format off
+cmp_FPU(s, float32, 16, temp, geteal(), float32_to_floatx80(temp, &status), floatx80_is_nan(a) || floatx80_is_unsupported(a) || float32_is_nan(temp), _32)
+#ifndef FPU_8087
+cmp_FPU(s, float32, 32, temp, geteal(), float32_to_floatx80(temp, &status), floatx80_is_nan(a) || floatx80_is_unsupported(a) || float32_is_nan(temp), _32)
+#endif
+cmp_FPU(d, float64, 16, temp, geteaq(), float64_to_floatx80(temp, &status), floatx80_is_nan(a) || floatx80_is_unsupported(a) || float64_is_nan(temp), _64)
+#ifndef FPU_8087
+cmp_FPU(d, float64, 32, temp, geteaq(), float64_to_floatx80(temp, &status), floatx80_is_nan(a) || floatx80_is_unsupported(a) || float64_is_nan(temp), _64)
+#endif
+
+cmp_FPU(iw, int16_t, 16, temp, (int16_t)geteaw(), int32_to_floatx80((int32_t)temp), 0, _i16)
+#ifndef FPU_8087
+cmp_FPU(iw, int16_t, 32, temp, (int16_t)geteaw(), int32_to_floatx80((int32_t)temp), 0, _i16)
+#endif
+cmp_FPU(il, int32_t, 16, temp, (int32_t)geteal(), int32_to_floatx80(temp), 0, _i32)
+#ifndef FPU_8087
+cmp_FPU(il, int32_t, 32, temp, (int32_t)geteal(), int32_to_floatx80(temp), 0, _i32)
+#endif
+// clang-format on
+
+/*
+ * FCOM ST(i) handler (softfloat): compares ST(0) with ST(i), where
+ * i = fetchdat & 7, using floatx80_compare_two(), and stores the relation
+ * in the condition codes.
+ *
+ * C1 is cleared first. If either register is tagged empty,
+ * FPU_EX_Stack_Underflow is raised and C0|C2|C3 are set instead. Neither
+ * register is modified and nothing is popped.
+ */
+static int
+sf_FCOM_sti(uint32_t fetchdat)
+{
+    const uint32_t sti_idx = fetchdat & 7;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(sti_idx)) {
+        FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        setcc(C0 | C2 | C3);
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              lhs       = FPU_read_regi(0);
+        floatx80              rhs       = FPU_read_regi(sti_idx);
+        int                   relation  = floatx80_compare_two(lhs, rhs, &sf_status);
+
+        setcc(FPU_status_word_flags_fpu_compare(relation));
+        FPU_exception(fetchdat, sf_status.float_exception_flags, 0);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fcom * cpu_multi) : (x87_timings.fcom));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fcom * cpu_multi) : (x87_concurrency.fcom));
+    return 0;
+}
+
+/*
+ * FCOMP ST(i) handler (softfloat): compares ST(0) with ST(i), where
+ * i = fetchdat & 7, using floatx80_compare_two(), stores the relation in
+ * the condition codes and pops once.
+ *
+ * The pop is conditional: on the stack-underflow path (either register
+ * tagged empty, C0|C2|C3 set) it happens only if is_IA_masked(); on the
+ * normal path only if FPU_exception() returns zero for the raised flags.
+ */
+static int
+sf_FCOMP_sti(uint32_t fetchdat)
+{
+    const uint32_t sti_idx = fetchdat & 7;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(sti_idx)) {
+        FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        setcc(C0 | C2 | C3);
+        if (is_IA_masked()) {
+            FPU_pop();
+        }
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              lhs       = FPU_read_regi(0);
+        floatx80              rhs       = FPU_read_regi(sti_idx);
+        int                   relation  = floatx80_compare_two(lhs, rhs, &sf_status);
+
+        setcc(FPU_status_word_flags_fpu_compare(relation));
+        if (!FPU_exception(fetchdat, sf_status.float_exception_flags, 0)) {
+            FPU_pop();
+        }
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fcom * cpu_multi) : (x87_timings.fcom));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fcom * cpu_multi) : (x87_concurrency.fcom));
+    return 0;
+}
+
+/*
+ * FCOMPP handler (softfloat): compares ST(0) with ST(1) using
+ * floatx80_compare_two(), stores the relation in the condition codes and
+ * pops twice.
+ *
+ * The double pop is conditional: on the stack-underflow path (either
+ * register tagged empty, C0|C2|C3 set) it happens only if is_IA_masked();
+ * on the normal path only if FPU_exception() returns zero.
+ */
+static int
+sf_FCOMPP(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(1)) {
+        FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        setcc(C0 | C2 | C3);
+        if (is_IA_masked()) {
+            FPU_pop();
+            FPU_pop();
+        }
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              lhs       = FPU_read_regi(0);
+        floatx80              rhs       = FPU_read_regi(1);
+        int                   relation  = floatx80_compare_two(lhs, rhs, &sf_status);
+
+        setcc(FPU_status_word_flags_fpu_compare(relation));
+        if (!FPU_exception(fetchdat, sf_status.float_exception_flags, 0)) {
+            FPU_pop();
+            FPU_pop();
+        }
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fcom * cpu_multi) : (x87_timings.fcom));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fcom * cpu_multi) : (x87_concurrency.fcom));
+    return 0;
+}
+
+#ifndef FPU_8087
+/*
+ * FUCOMPP handler (softfloat): compares ST(0) with ST(1) using
+ * floatx80_compare_quiet(), stores the relation in the condition codes and
+ * pops twice.
+ *
+ * The double pop is conditional: on the stack-underflow path (either
+ * register tagged empty, C0|C2|C3 set) it happens only if is_IA_masked();
+ * on the normal path only if FPU_exception() returns zero.
+ * Charged with the fucom timing entries.
+ */
+static int
+sf_FUCOMPP(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(1)) {
+        FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        setcc(C0 | C2 | C3);
+        if (is_IA_masked()) {
+            FPU_pop();
+            FPU_pop();
+        }
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              lhs       = FPU_read_regi(0);
+        floatx80              rhs       = FPU_read_regi(1);
+        int                   relation  = floatx80_compare_quiet(lhs, rhs, &sf_status);
+
+        setcc(FPU_status_word_flags_fpu_compare(relation));
+        if (!FPU_exception(fetchdat, sf_status.float_exception_flags, 0)) {
+            FPU_pop();
+            FPU_pop();
+        }
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fucom * cpu_multi) : (x87_timings.fucom));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fucom * cpu_multi) : (x87_concurrency.fucom));
+    return 0;
+}
+
+/*
+ * FCOMI ST(0), ST(j) handler (softfloat): compares ST(0) with ST(j), where
+ * j = fetchdat & 7, using floatx80_compare_two(), and reports the relation
+ * through FPU_write_eflags_fpu_compare() rather than the condition codes.
+ *
+ * flags_rebuild() runs before anything touches cpu_state.flags. If either
+ * register is tagged empty, FPU_EX_Stack_Underflow is raised and Z_FLAG,
+ * P_FLAG and C_FLAG are OR-ed into cpu_state.flags. Nothing is popped.
+ */
+static int
+sf_FCOMI_st0_stj(uint32_t fetchdat)
+{
+    const uint32_t stj_idx = fetchdat & 7;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    flags_rebuild();
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(stj_idx)) {
+        FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        cpu_state.flags |= (Z_FLAG | P_FLAG | C_FLAG);
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              lhs       = FPU_read_regi(0);
+        floatx80              rhs       = FPU_read_regi(stj_idx);
+        int                   relation  = floatx80_compare_two(lhs, rhs, &sf_status);
+
+        FPU_write_eflags_fpu_compare(relation);
+        FPU_exception(fetchdat, sf_status.float_exception_flags, 0);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fcom * cpu_multi) : (x87_timings.fcom));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fcom * cpu_multi) : (x87_concurrency.fcom));
+    return 0;
+}
+/*
+ * FCOMIP ST(0), ST(j) handler (softfloat): same comparison as
+ * sf_FCOMI_st0_stj (floatx80_compare_two(), result written through
+ * FPU_write_eflags_fpu_compare()), followed by one pop.
+ *
+ * The pop is conditional: on the stack-underflow path (Z_FLAG, P_FLAG and
+ * C_FLAG OR-ed into cpu_state.flags) it happens only if is_IA_masked(); on
+ * the normal path only if FPU_exception() returns zero.
+ */
+static int
+sf_FCOMIP_st0_stj(uint32_t fetchdat)
+{
+    const uint32_t stj_idx = fetchdat & 7;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    flags_rebuild();
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(stj_idx)) {
+        FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        cpu_state.flags |= (Z_FLAG | P_FLAG | C_FLAG);
+        if (is_IA_masked()) {
+            FPU_pop();
+        }
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              lhs       = FPU_read_regi(0);
+        floatx80              rhs       = FPU_read_regi(stj_idx);
+        int                   relation  = floatx80_compare_two(lhs, rhs, &sf_status);
+
+        FPU_write_eflags_fpu_compare(relation);
+        if (!FPU_exception(fetchdat, sf_status.float_exception_flags, 0)) {
+            FPU_pop();
+        }
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fcom * cpu_multi) : (x87_timings.fcom));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fcom * cpu_multi) : (x87_concurrency.fcom));
+    return 0;
+}
+
+/*
+ * FUCOM ST(i) handler (softfloat): compares ST(0) with ST(i), where
+ * i = fetchdat & 7, using floatx80_compare_quiet(), and stores the relation
+ * in the condition codes.
+ *
+ * C1 is cleared first. If either register is tagged empty,
+ * FPU_EX_Stack_Underflow is raised and C0|C2|C3 are set instead. Nothing is
+ * popped. Charged with the fucom timing entries.
+ */
+static int
+sf_FUCOM_sti(uint32_t fetchdat)
+{
+    const uint32_t sti_idx = fetchdat & 7;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(sti_idx)) {
+        FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        setcc(C0 | C2 | C3);
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              lhs       = FPU_read_regi(0);
+        floatx80              rhs       = FPU_read_regi(sti_idx);
+        int                   relation  = floatx80_compare_quiet(lhs, rhs, &sf_status);
+
+        setcc(FPU_status_word_flags_fpu_compare(relation));
+        FPU_exception(fetchdat, sf_status.float_exception_flags, 0);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fucom * cpu_multi) : (x87_timings.fucom));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fucom * cpu_multi) : (x87_concurrency.fucom));
+    return 0;
+}
+
+/*
+ * FUCOMP ST(i) handler (softfloat): compares ST(0) with ST(i), where
+ * i = fetchdat & 7, using floatx80_compare_quiet(), stores the relation in
+ * the condition codes and pops once.
+ *
+ * The pop is conditional: on the stack-underflow path (either register
+ * tagged empty, C0|C2|C3 set) it happens only if is_IA_masked(); on the
+ * normal path only if FPU_exception() returns zero.
+ */
+static int
+sf_FUCOMP_sti(uint32_t fetchdat)
+{
+    const uint32_t sti_idx = fetchdat & 7;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(sti_idx)) {
+        FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        setcc(C0 | C2 | C3);
+        if (is_IA_masked())
+            FPU_pop();
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              lhs       = FPU_read_regi(0);
+        floatx80              rhs       = FPU_read_regi(sti_idx);
+        int                   relation  = floatx80_compare_quiet(lhs, rhs, &sf_status);
+
+        setcc(FPU_status_word_flags_fpu_compare(relation));
+        if (!FPU_exception(fetchdat, sf_status.float_exception_flags, 0))
+            FPU_pop();
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fucom * cpu_multi) : (x87_timings.fucom));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fucom * cpu_multi) : (x87_concurrency.fucom));
+    return 0;
+}
+
+/*
+ * FUCOMI ST(0), ST(j) handler (softfloat): compares ST(0) with ST(j), where
+ * j = fetchdat & 7, using floatx80_compare_quiet(), and reports the
+ * relation through FPU_write_eflags_fpu_compare().
+ *
+ * flags_rebuild() runs before anything touches cpu_state.flags. If either
+ * register is tagged empty, FPU_EX_Stack_Underflow is raised and Z_FLAG,
+ * P_FLAG and C_FLAG are OR-ed into cpu_state.flags. Nothing is popped.
+ */
+static int
+sf_FUCOMI_st0_stj(uint32_t fetchdat)
+{
+    const uint32_t stj_idx = fetchdat & 7;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    flags_rebuild();
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(stj_idx)) {
+        FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        cpu_state.flags |= (Z_FLAG | P_FLAG | C_FLAG);
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              lhs       = FPU_read_regi(0);
+        floatx80              rhs       = FPU_read_regi(stj_idx);
+        int                   relation  = floatx80_compare_quiet(lhs, rhs, &sf_status);
+
+        FPU_write_eflags_fpu_compare(relation);
+        FPU_exception(fetchdat, sf_status.float_exception_flags, 0);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fucom * cpu_multi) : (x87_timings.fucom));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fucom * cpu_multi) : (x87_concurrency.fucom));
+    return 0;
+}
+/*
+ * FUCOMIP ST(0), ST(j) handler (softfloat): same comparison as
+ * sf_FUCOMI_st0_stj (floatx80_compare_quiet(), result written through
+ * FPU_write_eflags_fpu_compare()), followed by one pop.
+ *
+ * The pop is conditional: on the stack-underflow path (Z_FLAG, P_FLAG and
+ * C_FLAG OR-ed into cpu_state.flags) it happens only if is_IA_masked(); on
+ * the normal path only if FPU_exception() returns zero.
+ */
+static int
+sf_FUCOMIP_st0_stj(uint32_t fetchdat)
+{
+    const uint32_t stj_idx = fetchdat & 7;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    flags_rebuild();
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(stj_idx)) {
+        FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        cpu_state.flags |= (Z_FLAG | P_FLAG | C_FLAG);
+        if (is_IA_masked())
+            FPU_pop();
+    } else {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        floatx80              lhs       = FPU_read_regi(0);
+        floatx80              rhs       = FPU_read_regi(stj_idx);
+        int                   relation  = floatx80_compare_quiet(lhs, rhs, &sf_status);
+
+        FPU_write_eflags_fpu_compare(relation);
+        if (!FPU_exception(fetchdat, sf_status.float_exception_flags, 0))
+            FPU_pop();
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fucom * cpu_multi) : (x87_timings.fucom));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fucom * cpu_multi) : (x87_concurrency.fucom));
+    return 0;
+}
+#endif
+
+/*
+ * FTST handler (softfloat): compares ST(0) with Const_Z (the constant that
+ * sf_FLDZ pushes) using floatx80_compare_two() and stores the relation in
+ * the condition codes.
+ *
+ * C1 is cleared first. If ST(0) is tagged empty, FPU_EX_Stack_Underflow is
+ * raised and C0|C2|C3 are set instead. The stack is not modified.
+ */
+static int
+sf_FTST(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (!IS_TAG_EMPTY(0)) {
+        struct float_status_t sf_status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        int                   relation  = floatx80_compare_two(FPU_read_regi(0), Const_Z, &sf_status);
+
+        setcc(FPU_status_word_flags_fpu_compare(relation));
+        FPU_exception(fetchdat, sf_status.float_exception_flags, 0);
+    } else {
+        FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        setcc(C0 | C2 | C3);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.ftst * cpu_multi) : (x87_timings.ftst));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.ftst * cpu_multi) : (x87_concurrency.ftst));
+    return 0;
+}
+
+/*
+ * FXAM handler (softfloat): classifies ST(0) into the condition codes.
+ *
+ *   register tagged empty   C3 | C1 | C0
+ *   zero                    C3 | C1
+ *   NaN                     C1 | C0   (C1 only for unsupported encodings)
+ *   infinity                C2 | C1 | C0
+ *   denormal                C3 | C2 | C1
+ *   normalized              C2 | C1
+ *
+ * Every pattern above carries C1; it is cleared again at the end unless
+ * floatx80_sign() reported a set sign, so C1 reflects the sign of ST(0)
+ * whether or not the register is empty. The register contents are read
+ * before the empty check for that reason.
+ */
+static int
+sf_FXAM(uint32_t fetchdat)
+{
+    floatx80 st0_val;
+    int      negative;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    st0_val  = FPU_read_regi(0);
+    negative = floatx80_sign(st0_val);
+
+    if (IS_TAG_EMPTY(0)) {
+        setcc(C3 | C1 | C0);
+    } else {
+        switch (floatx80_class(st0_val)) {
+            case float_normalized:
+                setcc(C2 | C1);
+                break;
+            case float_denormal:
+                setcc(C3 | C2 | C1);
+                break;
+            case float_zero:
+                setcc(C3 | C1);
+                break;
+            case float_positive_inf:
+            case float_negative_inf:
+                setcc(C2 | C1 | C0);
+                break;
+            case float_QNaN:
+            case float_SNaN:
+                /* Unsupported encodings are classified as NaNs; tell them apart here. */
+                if (floatx80_is_unsupported(st0_val)) {
+                    setcc(C1);
+                } else {
+                    setcc(C1 | C0);
+                }
+                break;
+        }
+    }
+
+    /* C1 was set provisionally above; drop it for a clear sign. */
+    if (!negative)
+        clear_C1();
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fxam * cpu_multi) : (x87_timings.fxam));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fxam * cpu_multi) : (x87_concurrency.fxam));
+    return 0;
+}
--- a/src/cpu/softfloat/x87_ops_const.h
+++ b/src/cpu/softfloat/x87_ops_const.h
@@ -0,0 +1,131 @@
+/*
+ * Non-zero when the rounding-control field of the control word selects
+ * RC_DOWN or RC_CHOP, zero for RC_RND or RC_UP. A single mask with
+ * FPU_RC_DOWN is enough to tell the two groups apart.
+ */
+#define DOWN_OR_CHOP()  ((fpu_state.cwd & FPU_CW_RC) & FPU_RC_DOWN)
+
+/*
+ * Returns a copy of `a` whose fraction field has `adj` added to it.
+ * The constant loaders in this file pass -1, 0 or 1 to nudge a stored
+ * constant by one unit in the last place. The argument is not modified.
+ */
+static __inline floatx80
+FPU_round_const(const floatx80 a, int adj)
+{
+    floatx80 adjusted = a;
+
+    adjusted.fraction = adjusted.fraction + adj;
+    return adjusted;
+}
+
+/*
+ * FLDL2T handler (softfloat): pushes Const_L2T.
+ *
+ * C1 is cleared first. If the slot at index -1 is not tagged empty,
+ * FPU_stack_overflow() is called and nothing is pushed. Otherwise the
+ * constant is pushed with its fraction raised by one when the masked
+ * rounding-control field equals X87_ROUNDING_UP.
+ *
+ * NOTE(review): the field is masked with FPU_CW_RC but compared with
+ * X87_ROUNDING_UP, whereas DOWN_OR_CHOP() tests the same field against
+ * FPU_RC_DOWN. Confirm X87_ROUNDING_UP is expressed in control-word bit
+ * positions; if it is a 0..3 index this adjustment can never apply.
+ */
+static int
+sf_FLDL2T(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(-1)) {
+        int adj;
+
+        FPU_push();
+        adj = ((fpu_state.cwd & FPU_CW_RC) == X87_ROUNDING_UP);
+        FPU_save_regi(FPU_round_const(Const_L2T, adj), 0);
+    } else {
+        FPU_stack_overflow(fetchdat);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fld_const * cpu_multi) : (x87_timings.fld_const));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fld_const * cpu_multi) : (x87_concurrency.fld_const));
+    return 0;
+}
+
+/*
+ * FLDL2E handler (softfloat): pushes Const_L2E.
+ *
+ * C1 is cleared first. If the slot at index -1 is not tagged empty,
+ * FPU_stack_overflow() is called and nothing is pushed. Otherwise the
+ * constant is pushed, with its fraction lowered by one when DOWN_OR_CHOP()
+ * is non-zero.
+ */
+static int
+sf_FLDL2E(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(-1)) {
+        int adj;
+
+        FPU_push();
+        adj = DOWN_OR_CHOP() ? -1 : 0;
+        FPU_save_regi(FPU_round_const(Const_L2E, adj), 0);
+    } else {
+        FPU_stack_overflow(fetchdat);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fld_const * cpu_multi) : (x87_timings.fld_const));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fld_const * cpu_multi) : (x87_concurrency.fld_const));
+    return 0;
+}
+
+/*
+ * FLDPI handler (softfloat): pushes Const_PI.
+ *
+ * C1 is cleared first. If the slot at index -1 is not tagged empty,
+ * FPU_stack_overflow() is called and nothing is pushed. Otherwise the
+ * constant is pushed, with its fraction lowered by one when DOWN_OR_CHOP()
+ * is non-zero.
+ */
+static int
+sf_FLDPI(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(-1)) {
+        int adj;
+
+        FPU_push();
+        adj = DOWN_OR_CHOP() ? -1 : 0;
+        FPU_save_regi(FPU_round_const(Const_PI, adj), 0);
+    } else {
+        FPU_stack_overflow(fetchdat);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fld_const * cpu_multi) : (x87_timings.fld_const));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fld_const * cpu_multi) : (x87_concurrency.fld_const));
+    return 0;
+}
+
+/*
+ * Constant-load handler (softfloat) that pushes Const_LG2.
+ *
+ * C1 is cleared first. If the slot at index -1 is not tagged empty,
+ * FPU_stack_overflow() is called and nothing is pushed. Otherwise the
+ * constant is pushed, with its fraction lowered by one when DOWN_OR_CHOP()
+ * is non-zero.
+ */
+static int
+sf_FLDEG2(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(-1)) {
+        int adj;
+
+        FPU_push();
+        adj = DOWN_OR_CHOP() ? -1 : 0;
+        FPU_save_regi(FPU_round_const(Const_LG2, adj), 0);
+    } else {
+        FPU_stack_overflow(fetchdat);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fld_const * cpu_multi) : (x87_timings.fld_const));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fld_const * cpu_multi) : (x87_concurrency.fld_const));
+    return 0;
+}
+
+/*
+ * FLDLN2 handler (softfloat): pushes Const_LN2.
+ *
+ * C1 is cleared first. If the slot at index -1 is not tagged empty,
+ * FPU_stack_overflow() is called and nothing is pushed. Otherwise the
+ * constant is pushed, with its fraction lowered by one when DOWN_OR_CHOP()
+ * is non-zero.
+ */
+static int
+sf_FLDLN2(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(-1)) {
+        int adj;
+
+        FPU_push();
+        adj = DOWN_OR_CHOP() ? -1 : 0;
+        FPU_save_regi(FPU_round_const(Const_LN2, adj), 0);
+    } else {
+        FPU_stack_overflow(fetchdat);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fld_const * cpu_multi) : (x87_timings.fld_const));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fld_const * cpu_multi) : (x87_concurrency.fld_const));
+    return 0;
+}
+
+/*
+ * FLD1 handler (softfloat): pushes Const_1 unmodified.
+ *
+ * C1 is cleared first. If the slot at index -1 is not tagged empty,
+ * FPU_stack_overflow() is called and nothing is pushed.
+ * Charged with the fld_z1 timing entries.
+ */
+static int
+sf_FLD1(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(-1)) {
+        FPU_push();
+        FPU_save_regi(Const_1, 0);
+    } else {
+        FPU_stack_overflow(fetchdat);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fld_z1 * cpu_multi) : (x87_timings.fld_z1));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fld_z1 * cpu_multi) : (x87_concurrency.fld_z1));
+    return 0;
+}
+
+/*
+ * FLDZ handler (softfloat): pushes Const_Z unmodified.
+ *
+ * C1 is cleared first. If the slot at index -1 is not tagged empty,
+ * FPU_stack_overflow() is called and nothing is pushed.
+ * Charged with the fld_z1 timing entries.
+ */
+static int
+sf_FLDZ(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(-1)) {
+        FPU_push();
+        FPU_save_regi(Const_Z, 0);
+    } else {
+        FPU_stack_overflow(fetchdat);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fld_z1 * cpu_multi) : (x87_timings.fld_z1));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fld_z1 * cpu_multi) : (x87_concurrency.fld_z1));
+    return 0;
+}
--- a/src/cpu/softfloat/x87_ops_load_store.h
+++ b/src/cpu/softfloat/x87_ops_load_store.h
--- a/src/cpu/softfloat/x87_ops_misc.h
+++ b/src/cpu/softfloat/x87_ops_misc.h
@@ -0,0 +1,134 @@
+/*
+ * FXCH ST(i) handler (softfloat): exchanges ST(0) with ST(i), where
+ * i = fetchdat & 7.
+ *
+ * Tags and contents of both registers are sampled up front, then C1 is
+ * cleared. If either register is tagged empty, FPU_EX_Stack_Underflow is
+ * raised; when is_IA_masked() the empty operand(s) are replaced by the
+ * default NaN and the exchange still takes place, otherwise both registers
+ * are left untouched.
+ */
+static int
+sf_FXCH_sti(uint32_t fetchdat)
+{
+    const floatx80 default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+    const uint32_t sti_idx     = fetchdat & 7;
+    floatx80       top_val, other_val;
+    int            top_tag, other_tag;
+    int            do_exchange = 1;
+
+    FP_ENTER();
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+    top_tag   = FPU_gettagi(0);
+    other_tag = FPU_gettagi(sti_idx);
+    top_val   = FPU_read_regi(0);
+    other_val = FPU_read_regi(sti_idx);
+
+    clear_C1();
+    if ((top_tag == X87_TAG_EMPTY) || (other_tag == X87_TAG_EMPTY)) {
+        FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        if (!is_IA_masked()) {
+            /* Unmasked response: leave the registers as they are. */
+            do_exchange = 0;
+        } else {
+            /* Masked response: an empty operand reads as the default NaN. */
+            if (top_tag == X87_TAG_EMPTY)
+                top_val = default_nan;
+            if (other_tag == X87_TAG_EMPTY)
+                other_val = default_nan;
+        }
+    }
+
+    if (do_exchange) {
+        FPU_save_regi(top_val, sti_idx);
+        FPU_save_regi(other_val, 0);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fxch * cpu_multi) : (x87_timings.fxch));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fxch * cpu_multi) : (x87_concurrency.fxch));
+    return 0;
+}
+
+/*
+ * FCHS handler (softfloat): replaces ST(0) with floatx80_chs() of its
+ * current value.
+ *
+ * If ST(0) is tagged empty the condition is handed to
+ * FPU_stack_underflow() and C1 is not cleared by this function; otherwise
+ * C1 is cleared before the result is stored.
+ */
+static int
+sf_FCHS(uint32_t fetchdat)
+{
+    FP_ENTER();
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+
+    if (!IS_TAG_EMPTY(0)) {
+        floatx80 operand;
+        floatx80 negated;
+
+        clear_C1();
+        operand = FPU_read_regi(0);
+        negated = floatx80_chs(operand);
+        FPU_save_regi(negated, 0);
+    } else {
+        FPU_stack_underflow(fetchdat, 0, 0);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fchs * cpu_multi) : (x87_timings.fchs));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fchs * cpu_multi) : (x87_concurrency.fchs));
+    return 0;
+}
+
+/*
+ * FABS handler (softfloat): replaces ST(0) with floatx80_abs() of its
+ * current value.
+ *
+ * If ST(0) is tagged empty the condition is handed to
+ * FPU_stack_underflow() and C1 is not cleared by this function; otherwise
+ * C1 is cleared before the result is stored.
+ */
+static int
+sf_FABS(uint32_t fetchdat)
+{
+    FP_ENTER();
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+
+    if (!IS_TAG_EMPTY(0)) {
+        floatx80 operand;
+        floatx80 magnitude;
+
+        clear_C1();
+        operand   = FPU_read_regi(0);
+        magnitude = floatx80_abs(operand);
+        FPU_save_regi(magnitude, 0);
+    } else {
+        FPU_stack_underflow(fetchdat, 0, 0);
+    }
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fabs * cpu_multi) : (x87_timings.fabs));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fabs * cpu_multi) : (x87_concurrency.fabs));
+    return 0;
+}
+
+/*
+ * FDECSTP handler (softfloat): clears C1 and decrements fpu_state.tos,
+ * wrapping within 0..7. Tags and register contents are left as they are.
+ */
+static int
+sf_FDECSTP(uint32_t fetchdat)
+{
+    FP_ENTER();
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+
+    clear_C1();
+    fpu_state.tos = (fpu_state.tos - 1) & 7;
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fincdecstp * cpu_multi) : (x87_timings.fincdecstp));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fincdecstp * cpu_multi) : (x87_concurrency.fincdecstp));
+    return 0;
+}
+
+/*
+ * FINCSTP handler (softfloat): clears C1 and increments fpu_state.tos,
+ * wrapping within 0..7. Tags and register contents are left as they are.
+ */
+static int
+sf_FINCSTP(uint32_t fetchdat)
+{
+    FP_ENTER();
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+
+    clear_C1();
+    fpu_state.tos = (fpu_state.tos + 1) & 7;
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.fincdecstp * cpu_multi) : (x87_timings.fincdecstp));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.fincdecstp * cpu_multi) : (x87_concurrency.fincdecstp));
+    return 0;
+}
+
+/*
+ * FFREE ST(i) handler (softfloat): clears C1 and tags ST(i), where
+ * i = fetchdat & 7, as X87_TAG_EMPTY. The stack top is not moved.
+ */
+static int
+sf_FFREE_sti(uint32_t fetchdat)
+{
+    FP_ENTER();
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+
+    clear_C1();
+    FPU_settagi(X87_TAG_EMPTY, fetchdat & 7);
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.ffree * cpu_multi) : (x87_timings.ffree));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.ffree * cpu_multi) : (x87_concurrency.ffree));
+    return 0;
+}
+
+/*
+ * FFREEP ST(i) handler (softfloat): clears C1, tags ST(i), where
+ * i = fetchdat & 7, as X87_TAG_EMPTY, then pops the stack.
+ *
+ * Returns 1 without popping or charging cycles if cpu_state.abrt is set
+ * after the tag update, 0 otherwise.
+ *
+ * NOTE(review): it is not evident from this function what could set
+ * cpu_state.abrt at that point - confirm the check is required.
+ */
+static int
+sf_FFREEP_sti(uint32_t fetchdat)
+{
+    FP_ENTER();
+    FPU_check_pending_exceptions();
+    cpu_state.pc++;
+
+    clear_C1();
+    FPU_settagi(X87_TAG_EMPTY, fetchdat & 7);
+    if (cpu_state.abrt)
+        return 1;
+
+    FPU_pop();
+
+    CLOCK_CYCLES_FPU((fpu_type < FPU_487SX) ? (x87_timings.ffree * cpu_multi) : (x87_timings.ffree));
+    CONCURRENCY_CYCLES((fpu_type < FPU_487SX) ? (x87_concurrency.ffree * cpu_multi) : (x87_concurrency.ffree));
+    return 0;
+}
--- a/src/cpu/softfloat/x87_ops_other.h
+++ b/src/cpu/softfloat/x87_ops_other.h
@@ -0,0 +1,593 @@
+/*
+ * Writes the x87 environment (control word, status word, tag word,
+ * instruction pointer, opcode and operand pointer) to memory starting at
+ * easeg:cpu_state.eaaddr.
+ *
+ * Before anything is written, the tag of every non-empty register is
+ * recomputed from the register contents and the current top-of-stack index
+ * is folded into bits 11-13 of fpu_state.swd.
+ *
+ * The layout is selected by bit 0 of cr0 together with bit 0x100 of
+ * cpu_state.op32. The 16-bit layouts occupy 0x0e bytes and the 32-bit
+ * layouts 0x1c bytes. In the two real-mode layouts the instruction and
+ * operand pointers are stored as linear addresses, i.e. segment * 16 plus
+ * offset; this must be an addition, because the shifted segment and the
+ * offset overlap in bits 4-15.
+ *
+ * Returns the address of the first byte after the environment image.
+ */
+static uint32_t
+fpu_save_environment(void)
+{
+    int tag;
+    unsigned offset = 0;
+
+    /* read all registers in stack order and update x87 tag word */
+    for (int n = 0; n < 8; n++) {
+        // update tag only if it is not empty
+        if (!IS_TAG_EMPTY(n)) {
+            tag = FPU_tagof(FPU_read_regi(n));
+            FPU_settagi(tag, n);
+        }
+    }
+
+    /* keep the TOP field of the status word in sync with fpu_state.tos */
+    fpu_state.swd = (fpu_state.swd & ~(7 << 11)) | ((fpu_state.tos & 7) << 11);
+
+    switch ((cr0 & 1) | (cpu_state.op32 & 0x100)) {
+        case 0x000: { /*16-bit real mode*/
+            uint16_t tmp;
+            uint32_t fp_ip, fp_dp;
+
+            /* linear address = segment * 16 + offset */
+            fp_ip = ((uint32_t) fpu_state.fcs << 4) + (uint32_t) fpu_state.fip;
+            fp_dp = ((uint32_t) fpu_state.fds << 4) + (uint32_t) fpu_state.fdp;
+
+            tmp = i387_get_control_word();
+            writememw(easeg, cpu_state.eaaddr + 0x00, tmp);
+            tmp = i387_get_status_word();
+            writememw(easeg, cpu_state.eaaddr + 0x02, tmp);
+            tmp = fpu_state.tag;
+            writememw(easeg, cpu_state.eaaddr + 0x04, tmp);
+            tmp = fp_ip & 0xffff;
+            writememw(easeg, cpu_state.eaaddr + 0x06, tmp);
+            /* bits 16-19 of the linear IP share this word with the opcode */
+            tmp = (uint16_t)((fp_ip & 0xf0000) >> 4) | fpu_state.foo;
+            writememw(easeg, cpu_state.eaaddr + 0x08, tmp);
+            tmp = fp_dp & 0xffff;
+            writememw(easeg, cpu_state.eaaddr + 0x0a, tmp);
+            tmp = (uint16_t)((fp_dp & 0xf0000) >> 4);
+            writememw(easeg, cpu_state.eaaddr + 0x0c, tmp);
+            offset = 0x0e;
+        }
+        break;
+        case 0x001: {/*16-bit protected mode*/
+            uint16_t tmp;
+            tmp = i387_get_control_word();
+            writememw(easeg, cpu_state.eaaddr + 0x00, tmp);
+            tmp = i387_get_status_word();
+            writememw(easeg, cpu_state.eaaddr + 0x02, tmp);
+            tmp = fpu_state.tag;
+            writememw(easeg, cpu_state.eaaddr + 0x04, tmp);
+            tmp = (uint16_t)(fpu_state.fip) & 0xffff;
+            writememw(easeg, cpu_state.eaaddr + 0x06, tmp);
+            tmp = fpu_state.fcs;
+            writememw(easeg, cpu_state.eaaddr + 0x08, tmp);
+            tmp = (uint16_t)(fpu_state.fdp) & 0xffff;
+            writememw(easeg, cpu_state.eaaddr + 0x0a, tmp);
+            tmp = fpu_state.fds;
+            writememw(easeg, cpu_state.eaaddr + 0x0c, tmp);
+            offset = 0x0e;
+        }
+        break;
+        case 0x100: { /*32-bit real mode*/
+            uint32_t tmp, fp_ip, fp_dp;
+
+            /* linear address = segment * 16 + offset */
+            fp_ip = ((uint32_t) fpu_state.fcs << 4) + (uint32_t) fpu_state.fip;
+            fp_dp = ((uint32_t) fpu_state.fds << 4) + (uint32_t) fpu_state.fdp;
+
+            /* the upper halves of the 16-bit fields are written as all ones */
+            tmp = 0xffff0000 | i387_get_control_word();
+            writememl(easeg, cpu_state.eaaddr + 0x00, tmp);
+            tmp = 0xffff0000 | i387_get_status_word();
+            writememl(easeg, cpu_state.eaaddr + 0x04, tmp);
+            tmp = 0xffff0000 | fpu_state.tag;
+            writememl(easeg, cpu_state.eaaddr + 0x08, tmp);
+            tmp = 0xffff0000 | (fp_ip & 0xffff);
+            writememl(easeg, cpu_state.eaaddr + 0x0c, tmp);
+            tmp = ((fp_ip & 0xffff0000) >> 4) | fpu_state.foo;
+            writememl(easeg, cpu_state.eaaddr + 0x10, tmp);
+            tmp = 0xffff0000 | (fp_dp & 0xffff);
+            writememl(easeg, cpu_state.eaaddr + 0x14, tmp);
+            tmp = (fp_dp & 0xffff0000) >> 4;
+            writememl(easeg, cpu_state.eaaddr + 0x18, tmp);
+            offset = 0x1c;
+        }
+        break;
+        case 0x101: { /*32-bit protected mode*/
+            uint32_t tmp;
+            tmp = 0xffff0000 | i387_get_control_word();
+            writememl(easeg, cpu_state.eaaddr + 0x00, tmp);
+            tmp = 0xffff0000 | i387_get_status_word();
+            writememl(easeg, cpu_state.eaaddr + 0x04, tmp);
+            tmp = 0xffff0000 | fpu_state.tag;
+            writememl(easeg, cpu_state.eaaddr + 0x08, tmp);
+            tmp = (uint32_t)(fpu_state.fip);
+            writememl(easeg, cpu_state.eaaddr + 0x0c, tmp);
+            /* code selector in the low half, opcode in the high half */
+            tmp = fpu_state.fcs | (((uint32_t)(fpu_state.foo)) << 16);
+            writememl(easeg, cpu_state.eaaddr + 0x10, tmp);
+            tmp = (uint32_t)(fpu_state.fdp);
+            writememl(easeg, cpu_state.eaaddr + 0x14, tmp);
+            tmp = 0xffff0000 | fpu_state.fds;
+            writememl(easeg, cpu_state.eaaddr + 0x18, tmp);
+            offset = 0x1c;
+        }
+        break;
+    }
+
+    return (cpu_state.eaaddr + offset);
+}
+
+/*
+ * Reads an x87 environment image from easeg:cpu_state.eaaddr into fpu_state
+ * (control word, status word, top-of-stack, tag word, instruction/operand
+ * pointers and, where the layout carries it, the opcode).
+ *
+ * The layout is selected by bit 0 of cr0 together with bit 0x100 of
+ * cpu_state.op32, mirroring fpu_save_environment(). In the real-mode layouts
+ * the stored pointers are linear addresses, so they are placed in fip/fdp
+ * and the selectors fcs/fds are zeroed. Both real-mode layouts and the
+ * 32-bit protected-mode layout restore the opcode into fpu_state.foo; the
+ * 16-bit protected-mode layout is not used to update foo.
+ *
+ * After loading, bit 6 of the control word is forced to 1, the reserved
+ * control-word bits are cleared, and the B/ES status bits are recomputed
+ * from the pending-versus-masked exception bits.
+ *
+ * Returns the address of the first byte after the environment image.
+ */
+static uint32_t
+fpu_load_environment(void)
+{
+    unsigned offset = 0;
+
+    switch ((cr0 & 1) | (cpu_state.op32 & 0x100)) {
+        case 0x000: { /*16-bit real mode*/
+            uint16_t tmp;
+            uint32_t fp_ip, fp_dp;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x0c);
+            fp_dp = (tmp & 0xf000) << 4;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x0a);
+            fpu_state.fdp = fp_dp | tmp;
+            fpu_state.fds = 0;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x08);
+            /* this word carries both the opcode and bits 16-19 of the IP */
+            fpu_state.foo = tmp & 0x07ff;
+            fp_ip = (tmp & 0xf000) << 4;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x06);
+            fpu_state.fip = fp_ip | tmp;
+            fpu_state.fcs = 0;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x04);
+            fpu_state.tag = tmp;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x02);
+            fpu_state.swd = tmp;
+            fpu_state.tos = (tmp >> 11) & 7;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x00);
+            fpu_state.cwd = tmp;
+            offset = 0x0e;
+        }
+        break;
+        case 0x001: {/*16-bit protected mode*/
+            uint16_t tmp;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x0c);
+            fpu_state.fds = tmp;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x0a);
+            fpu_state.fdp = tmp;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x08);
+            fpu_state.fcs = tmp;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x06);
+            fpu_state.fip = tmp;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x04);
+            fpu_state.tag = tmp;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x02);
+            fpu_state.swd = tmp;
+            fpu_state.tos = (tmp >> 11) & 7;
+            tmp = readmemw(easeg, cpu_state.eaaddr + 0x00);
+            fpu_state.cwd = tmp;
+            offset = 0x0e;
+        }
+        break;
+        case 0x100: { /*32-bit real mode*/
+            uint32_t tmp, fp_ip, fp_dp;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x18);
+            fp_dp = (tmp & 0x0ffff000) << 4;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x14);
+            fp_dp |= (tmp & 0xffff);
+            fpu_state.fdp = fp_dp;
+            fpu_state.fds = 0;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x10);
+            fpu_state.foo = tmp & 0x07ff;
+            fp_ip = (tmp & 0x0ffff000) << 4;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x0c);
+            fp_ip |= (tmp & 0xffff);
+            fpu_state.fip = fp_ip;
+            fpu_state.fcs = 0;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x08);
+            fpu_state.tag = tmp & 0xffff;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x04);
+            fpu_state.swd = tmp & 0xffff;
+            fpu_state.tos = (tmp >> 11) & 7;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x00);
+            fpu_state.cwd = tmp & 0xffff;
+            offset = 0x1c;
+        }
+        break;
+        case 0x101: { /*32-bit protected mode*/
+            uint32_t tmp;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x18);
+            fpu_state.fds = tmp & 0xffff;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x14);
+            fpu_state.fdp = tmp;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x10);
+            fpu_state.fcs = tmp & 0xffff;
+            fpu_state.foo = (tmp >> 16) & 0x07ff;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x0c);
+            fpu_state.fip = tmp;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x08);
+            fpu_state.tag = tmp & 0xffff;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x04);
+            fpu_state.swd = tmp & 0xffff;
+            fpu_state.tos = (tmp >> 11) & 7;
+            tmp = readmeml(easeg, cpu_state.eaaddr + 0x00);
+            fpu_state.cwd = tmp & 0xffff;
+            offset = 0x1c;
+        }
+        break;
+    }
+
+    /* always set bit 6 as '1 */
+    fpu_state.cwd = (fpu_state.cwd & ~FPU_CW_Reserved_Bits) | 0x0040;
+
+    /* check for unmasked exceptions */
+    if (fpu_state.swd & ~fpu_state.cwd & FPU_CW_Exceptions_Mask) {
+        /* set the B and ES bits in the status-word */
+        fpu_state.swd |= (FPU_SW_Summary | FPU_SW_Backward);
+    } else {
+        /* clear the B and ES bits in the status-word */
+        fpu_state.swd &= ~(FPU_SW_Summary | FPU_SW_Backward);
+    }
+
+    return (cpu_state.eaaddr + offset);
+}
+
+/*
+ * FLDCW handler (16-bit addressing): loads the x87 control word from the
+ * memory operand. Reserved bits are cleared and bit 6 is forced to 1. The
+ * B and ES status bits are then set if any exception flagged in the status
+ * word is unmasked by the new control word, and cleared otherwise.
+ *
+ * Returns 1 if the operand read aborted, otherwise 0.
+ */
+static int
+sf_FLDCW_a16(uint32_t fetchdat)
+{
+    uint16_t loaded;
+
+    FP_ENTER();
+    fetch_ea_16(fetchdat);
+    SEG_CHECK_READ(cpu_state.ea_seg);
+    loaded = geteaw();
+    if (cpu_state.abrt) {
+        return 1;
+    }
+
+    /* bit 6 is reserved and always reads back as 1 */
+    fpu_state.cwd = (loaded & ~FPU_CW_Reserved_Bits) | 0x0040;
+
+    if (!(fpu_state.swd & (~fpu_state.cwd & FPU_CW_Exceptions_Mask))) {
+        /* nothing pending is unmasked: drop B and ES */
+        fpu_state.swd &= ~(FPU_SW_Summary | FPU_SW_Backward);
+    } else {
+        /* a pending exception just became unmasked: raise B and ES */
+        fpu_state.swd |= (FPU_SW_Summary | FPU_SW_Backward);
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fldcw);
+        CONCURRENCY_CYCLES(x87_concurrency.fldcw);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fldcw * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fldcw * cpu_multi);
+    }
+    return 0;
+}
+#ifndef FPU_8087
+/*
+ * FLDCW handler (32-bit addressing): loads the x87 control word from the
+ * memory operand. Reserved bits are cleared and bit 6 is forced to 1. The
+ * B and ES status bits are then set if any exception flagged in the status
+ * word is unmasked by the new control word, and cleared otherwise.
+ *
+ * Returns 1 if the operand read aborted, otherwise 0.
+ */
+static int
+sf_FLDCW_a32(uint32_t fetchdat)
+{
+    uint16_t loaded;
+
+    FP_ENTER();
+    fetch_ea_32(fetchdat);
+    SEG_CHECK_READ(cpu_state.ea_seg);
+    loaded = geteaw();
+    if (cpu_state.abrt) {
+        return 1;
+    }
+
+    /* bit 6 is reserved and always reads back as 1 */
+    fpu_state.cwd = (loaded & ~FPU_CW_Reserved_Bits) | 0x0040;
+
+    if (!(fpu_state.swd & (~fpu_state.cwd & FPU_CW_Exceptions_Mask))) {
+        /* nothing pending is unmasked: drop B and ES */
+        fpu_state.swd &= ~(FPU_SW_Summary | FPU_SW_Backward);
+    } else {
+        /* a pending exception just became unmasked: raise B and ES */
+        fpu_state.swd |= (FPU_SW_Summary | FPU_SW_Backward);
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fldcw);
+        CONCURRENCY_CYCLES(x87_concurrency.fldcw);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fldcw * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fldcw * cpu_multi);
+    }
+    return 0;
+}
+#endif
+
+/*
+ * FNSTCW handler (16-bit addressing): stores the current x87 control word
+ * to the memory operand.
+ *
+ * Both the timing and the concurrency cost use the fstcw_sw entry, matching
+ * the 32-bit addressing variant of this instruction.
+ *
+ * Returns cpu_state.abrt, i.e. non-zero if the store aborted.
+ */
+static int
+sf_FNSTCW_a16(uint32_t fetchdat)
+{
+    uint16_t cwd = i387_get_control_word();
+
+    FP_ENTER();
+    fetch_ea_16(fetchdat);
+    SEG_CHECK_WRITE(cpu_state.ea_seg);
+    seteaw(cwd);
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fstcw_sw) : (x87_timings.fstcw_sw * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fstcw_sw) : (x87_concurrency.fstcw_sw * cpu_multi));
+    return cpu_state.abrt;
+}
+#ifndef FPU_8087
+/*
+ * FNSTCW handler (32-bit addressing): stores the current x87 control word
+ * to the memory operand.
+ *
+ * Returns cpu_state.abrt, i.e. non-zero if the store aborted.
+ */
+static int
+sf_FNSTCW_a32(uint32_t fetchdat)
+{
+    uint16_t control = i387_get_control_word();
+
+    FP_ENTER();
+    fetch_ea_32(fetchdat);
+    SEG_CHECK_WRITE(cpu_state.ea_seg);
+    seteaw(control);
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fstcw_sw);
+        CONCURRENCY_CYCLES(x87_concurrency.fstcw_sw);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fstcw_sw * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fstcw_sw * cpu_multi);
+    }
+    return cpu_state.abrt;
+}
+#endif
+
+/*
+ * FNSTSW m16 handler (16-bit addressing): stores the current x87 status
+ * word to the memory operand.
+ *
+ * Returns cpu_state.abrt, i.e. non-zero if the store aborted.
+ */
+static int
+sf_FNSTSW_a16(uint32_t fetchdat)
+{
+    uint16_t status = i387_get_status_word();
+
+    FP_ENTER();
+    fetch_ea_16(fetchdat);
+    SEG_CHECK_WRITE(cpu_state.ea_seg);
+    seteaw(status);
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fstcw_sw);
+        CONCURRENCY_CYCLES(x87_concurrency.fstcw_sw);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fstcw_sw * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fstcw_sw * cpu_multi);
+    }
+    return cpu_state.abrt;
+}
+#ifndef FPU_8087
+/*
+ * FNSTSW m16 handler (32-bit addressing): stores the current x87 status
+ * word to the memory operand.
+ *
+ * Returns cpu_state.abrt, i.e. non-zero if the store aborted.
+ */
+static int
+sf_FNSTSW_a32(uint32_t fetchdat)
+{
+    uint16_t status = i387_get_status_word();
+
+    FP_ENTER();
+    fetch_ea_32(fetchdat);
+    SEG_CHECK_WRITE(cpu_state.ea_seg);
+    seteaw(status);
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fstcw_sw);
+        CONCURRENCY_CYCLES(x87_concurrency.fstcw_sw);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fstcw_sw * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fstcw_sw * cpu_multi);
+    }
+    return cpu_state.abrt;
+}
+#endif
+
+#ifdef FPU_8087
+/*
+ * 8087-only handler that toggles one bit of the control word depending on
+ * the second opcode byte: the bit is set when rmdat is 0xe1 and cleared for
+ * every other value routed here.
+ *
+ * NOTE(review): the mask is spelled FPU_SW_Summary, a status-word constant,
+ * but it is applied to fpu_state.cwd. Presumably it shares its bit position
+ * with the 8087 interrupt-enable mask in the control word - confirm.
+ *
+ * Always returns 0.
+ */
+static int
+sf_FI(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    if (rmdat == 0xe1) {
+        fpu_state.cwd |= FPU_SW_Summary;
+    } else {
+        fpu_state.cwd &= ~FPU_SW_Summary;
+    }
+    wait(3, 0);
+    return 0;
+}
+#else
+/*
+ * FNSTSW AX handler: copies the current x87 status word into AX.
+ * Always returns 0.
+ */
+static int
+sf_FNSTSW_AX(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    AX = i387_get_status_word();
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fstcw_sw);
+        CONCURRENCY_CYCLES(x87_concurrency.fstcw_sw);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fstcw_sw * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fstcw_sw * cpu_multi);
+    }
+    return 0;
+}
+#endif
+
+/*
+ * FRSTOR handler (16-bit addressing): reloads the x87 environment via
+ * fpu_load_environment() and then the eight 10-byte registers stored
+ * directly after it. A register whose freshly loaded tag is empty stays
+ * tagged empty; every other register gets a tag derived from its value.
+ *
+ * Returns cpu_state.abrt.
+ */
+static int
+sf_FRSTOR_a16(uint32_t fetchdat)
+{
+    floatx80 reg;
+    int offset;
+
+    FP_ENTER();
+    fetch_ea_16(fetchdat);
+    SEG_CHECK_READ(cpu_state.ea_seg);
+    offset = fpu_load_environment();
+    for (int n = 0; n < 8; n++) {
+        int tag;
+
+        /* each register image is a 64-bit fraction followed by a 16-bit exponent word */
+        reg.fraction = readmemq(easeg, offset + (n * 10));
+        reg.exp = readmemw(easeg, offset + (n * 10) + 8);
+        if (IS_TAG_EMPTY(n)) {
+            tag = X87_TAG_EMPTY;
+        } else {
+            tag = FPU_tagof(reg);
+        }
+        FPU_save_regi_tag(reg, tag, n);
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.frstor);
+        CONCURRENCY_CYCLES(x87_concurrency.frstor);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.frstor * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.frstor * cpu_multi);
+    }
+    return cpu_state.abrt;
+}
+#ifndef FPU_8087
+/*
+ * FRSTOR handler (32-bit addressing): reloads the x87 environment via
+ * fpu_load_environment() and then the eight 10-byte registers stored
+ * directly after it. A register whose freshly loaded tag is empty stays
+ * tagged empty; every other register gets a tag derived from its value.
+ *
+ * Returns cpu_state.abrt.
+ */
+static int
+sf_FRSTOR_a32(uint32_t fetchdat)
+{
+    floatx80 reg;
+    int offset;
+
+    FP_ENTER();
+    fetch_ea_32(fetchdat);
+    SEG_CHECK_READ(cpu_state.ea_seg);
+    offset = fpu_load_environment();
+    for (int n = 0; n < 8; n++) {
+        int tag;
+
+        /* each register image is a 64-bit fraction followed by a 16-bit exponent word */
+        reg.fraction = readmemq(easeg, offset + (n * 10));
+        reg.exp = readmemw(easeg, offset + (n * 10) + 8);
+        if (IS_TAG_EMPTY(n)) {
+            tag = X87_TAG_EMPTY;
+        } else {
+            tag = FPU_tagof(reg);
+        }
+        FPU_save_regi_tag(reg, tag, n);
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.frstor);
+        CONCURRENCY_CYCLES(x87_concurrency.frstor);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.frstor * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.frstor * cpu_multi);
+    }
+    return cpu_state.abrt;
+}
+#endif
+
+/*
+ * FNSAVE handler (16-bit addressing): writes the x87 environment via
+ * fpu_save_environment(), followed by the eight registers in stack order
+ * (10 bytes each), and then reinitialises the FPU state.
+ *
+ * The reinitialisation sets the control word to 0x3FF on 8087 builds and
+ * 0x37F otherwise, zeroes the status word, top-of-stack and the
+ * instruction/operand pointers, sets the tag word to 0xffff and clears
+ * cpu_state.ismmx.
+ *
+ * Returns cpu_state.abrt.
+ */
+static int
+sf_FNSAVE_a16(uint32_t fetchdat)
+{
+    floatx80 stn;
+    int offset;
+
+    FP_ENTER();
+    fetch_ea_16(fetchdat);
+    SEG_CHECK_WRITE(cpu_state.ea_seg);
+    offset = fpu_save_environment();
+    /* save all registers in stack order. */
+    for (int m = 0; m < 8; m++) {
+        stn = FPU_read_regi(m);
+        writememq(easeg, offset + (m * 10), stn.fraction);
+        writememw(easeg, offset + (m * 10) + 8, stn.exp);
+    }
+
+    /* reset the control word; the status word is cleared separately below */
+#ifdef FPU_8087
+    fpu_state.cwd = 0x3FF;
+#else
+    fpu_state.cwd = 0x37F;
+#endif
+    fpu_state.swd = 0;
+    fpu_state.tos = 0;
+    fpu_state.tag = 0xffff;
+    cpu_state.ismmx = 0;
+    fpu_state.foo = 0;
+    fpu_state.fds = 0;
+    fpu_state.fdp = 0;
+    fpu_state.fcs = 0;
+    fpu_state.fip = 0;
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fsave) : (x87_timings.fsave * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fsave) : (x87_concurrency.fsave * cpu_multi));
+    return cpu_state.abrt;
+}
+#ifndef FPU_8087
+/*
+ * FNSAVE handler (32-bit addressing): writes the x87 environment via
+ * fpu_save_environment(), followed by the eight registers in stack order
+ * (10 bytes each), and then reinitialises the FPU state.
+ *
+ * The reinitialisation resets the control word, zeroes the status word,
+ * top-of-stack and the instruction/operand pointers, sets the tag word to
+ * 0xffff and clears cpu_state.ismmx.
+ *
+ * Returns cpu_state.abrt.
+ */
+static int
+sf_FNSAVE_a32(uint32_t fetchdat)
+{
+    floatx80 stn;
+    int offset;
+
+    FP_ENTER();
+    fetch_ea_32(fetchdat);
+    SEG_CHECK_WRITE(cpu_state.ea_seg);
+    offset = fpu_save_environment();
+    /* save all registers in stack order. */
+    for (int m = 0; m < 8; m++) {
+        stn = FPU_read_regi(m);
+        writememq(easeg, offset + (m * 10), stn.fraction);
+        writememw(easeg, offset + (m * 10) + 8, stn.exp);
+    }
+
+    /*
+     * Reset the control word. This function is only compiled when FPU_8087
+     * is not defined, so the first branch is kept purely to mirror the
+     * 16-bit variant.
+     */
+#ifdef FPU_8087
+    fpu_state.cwd = 0x3FF;
+#else
+    fpu_state.cwd = 0x37F;
+#endif
+    fpu_state.swd = 0;
+    fpu_state.tos = 0;
+    fpu_state.tag = 0xffff;
+    cpu_state.ismmx = 0;
+    fpu_state.foo = 0;
+    fpu_state.fds = 0;
+    fpu_state.fdp = 0;
+    fpu_state.fcs = 0;
+    fpu_state.fip = 0;
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fsave) : (x87_timings.fsave * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fsave) : (x87_concurrency.fsave * cpu_multi));
+    return cpu_state.abrt;
+}
+#endif
+
+/*
+ * FNCLEX handler: clears the six exception flags, the stack-fault flag and
+ * the summary (ES) and busy (B) bits of the status word. The cost charged is
+ * the fnop entry of the timing tables.
+ * Always returns 0.
+ */
+static int
+sf_FNCLEX(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+
+    fpu_state.swd &= ~(FPU_SW_Invalid | FPU_SW_Denormal_Op | FPU_SW_Zero_Div |
+               FPU_SW_Overflow | FPU_SW_Underflow | FPU_SW_Precision |
+               FPU_SW_Stack_Fault | FPU_SW_Summary | FPU_SW_Backward);
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fnop);
+        CONCURRENCY_CYCLES(x87_concurrency.fnop);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fnop * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fnop * cpu_multi);
+    }
+    return 0;
+}
+
+/*
+ * FNINIT handler: puts the FPU state back to its initial values. The
+ * control word becomes 0x3FF on 8087 builds and 0x37F otherwise; the status
+ * word, top-of-stack, opcode and instruction/operand pointers are zeroed;
+ * the tag word is set to 0xffff; cpu_state.ismmx is cleared. The handler
+ * finishes by invoking CPU_BLOCK_END().
+ * Always returns 0.
+ */
+static int
+sf_FNINIT(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+
+    /* control, status and tag words */
+#ifdef FPU_8087
+    fpu_state.cwd = 0x3FF;
+#else
+    fpu_state.cwd = 0x37F;
+#endif
+    fpu_state.swd = 0;
+    fpu_state.tag = 0xffff;
+    fpu_state.tos = 0;
+
+    /* last-instruction bookkeeping */
+    fpu_state.fip = 0;
+    fpu_state.fcs = 0;
+    fpu_state.fdp = 0;
+    fpu_state.fds = 0;
+    fpu_state.foo = 0;
+
+    cpu_state.ismmx = 0;
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.finit);
+        CONCURRENCY_CYCLES(x87_concurrency.finit);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.finit * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.finit * cpu_multi);
+    }
+    CPU_BLOCK_END();
+    return 0;
+}
+
+/*
+ * FLDENV handler (16-bit addressing): reloads the x87 environment via
+ * fpu_load_environment(), then recomputes the tag of every register that
+ * the loaded tag word does not mark as empty.
+ *
+ * Returns cpu_state.abrt.
+ */
+static int
+sf_FLDENV_a16(uint32_t fetchdat)
+{
+    FP_ENTER();
+    fetch_ea_16(fetchdat);
+    SEG_CHECK_READ(cpu_state.ea_seg);
+    fpu_load_environment();
+
+    /* walk the registers in stack order; empty slots keep their tag */
+    for (int n = 0; n < 8; n++) {
+        int fresh;
+
+        if (IS_TAG_EMPTY(n)) {
+            continue;
+        }
+        fresh = FPU_tagof(FPU_read_regi(n));
+        FPU_settagi(fresh, n);
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fldenv);
+        CONCURRENCY_CYCLES(x87_concurrency.fldenv);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fldenv * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fldenv * cpu_multi);
+    }
+    return cpu_state.abrt;
+}
+#ifndef FPU_8087
+/*
+ * FLDENV handler (32-bit addressing): reloads the x87 environment via
+ * fpu_load_environment(), then recomputes the tag of every register that
+ * the loaded tag word does not mark as empty.
+ *
+ * Returns cpu_state.abrt.
+ */
+static int
+sf_FLDENV_a32(uint32_t fetchdat)
+{
+    FP_ENTER();
+    fetch_ea_32(fetchdat);
+    SEG_CHECK_READ(cpu_state.ea_seg);
+    fpu_load_environment();
+
+    /* walk the registers in stack order; empty slots keep their tag */
+    for (int n = 0; n < 8; n++) {
+        int fresh;
+
+        if (IS_TAG_EMPTY(n)) {
+            continue;
+        }
+        fresh = FPU_tagof(FPU_read_regi(n));
+        FPU_settagi(fresh, n);
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fldenv);
+        CONCURRENCY_CYCLES(x87_concurrency.fldenv);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fldenv * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fldenv * cpu_multi);
+    }
+    return cpu_state.abrt;
+}
+#endif
+
+/*
+ * FNSTENV handler (16-bit addressing): writes the x87 environment via
+ * fpu_save_environment(), then masks all floating-point exceptions in the
+ * control word and clears the B and ES bits of the status word.
+ *
+ * Returns cpu_state.abrt.
+ */
+static int
+sf_FNSTENV_a16(uint32_t fetchdat)
+{
+    FP_ENTER();
+    fetch_ea_16(fetchdat);
+    SEG_CHECK_WRITE(cpu_state.ea_seg);
+    fpu_save_environment();
+
+    /* the stored image keeps the old values; only the live state is altered */
+    fpu_state.cwd |= FPU_CW_Exceptions_Mask;
+    fpu_state.swd &= ~(FPU_SW_Backward|FPU_SW_Summary);
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fstenv);
+        CONCURRENCY_CYCLES(x87_concurrency.fstenv);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fstenv * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fstenv * cpu_multi);
+    }
+    return cpu_state.abrt;
+}
+#ifndef FPU_8087
+/*
+ * FNSTENV handler (32-bit addressing): writes the x87 environment via
+ * fpu_save_environment(), then masks all floating-point exceptions in the
+ * control word and clears the B and ES bits of the status word.
+ *
+ * Returns cpu_state.abrt.
+ */
+static int
+sf_FNSTENV_a32(uint32_t fetchdat)
+{
+    FP_ENTER();
+    fetch_ea_32(fetchdat);
+    SEG_CHECK_WRITE(cpu_state.ea_seg);
+    fpu_save_environment();
+
+    /* the stored image keeps the old values; only the live state is altered */
+    fpu_state.cwd |= FPU_CW_Exceptions_Mask;
+    fpu_state.swd &= ~(FPU_SW_Backward|FPU_SW_Summary);
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fstenv);
+        CONCURRENCY_CYCLES(x87_concurrency.fstenv);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fstenv * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fstenv * cpu_multi);
+    }
+    return cpu_state.abrt;
+}
+#endif
+
+/*
+ * FNOP handler: advances past the instruction and charges the fnop cost
+ * without touching any FPU state in this function.
+ * Always returns 0.
+ */
+static int
+sf_FNOP(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fnop);
+        CONCURRENCY_CYCLES(x87_concurrency.fnop);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fnop * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fnop * cpu_multi);
+    }
+    return 0;
+}
--- a/src/cpu/softfloat/x87_ops_trans.h
+++ b/src/cpu/softfloat/x87_ops_trans.h
@@ -0,0 +1,418 @@
+/*
+ * F2XM1 handler: replaces ST(0) with the value returned by f2xm1() for it.
+ * The computation runs with the precision-control bits forced to 80-bit.
+ * An empty ST(0) is reported through FPU_stack_underflow(); the result is
+ * only written back when FPU_exception() returns zero.
+ * Always returns 0.
+ */
+static int
+sf_F2XM1(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+    } else {
+        struct float_status_t status;
+        floatx80 answer;
+
+        status = i387cw_to_softfloat_status_word(i387_get_control_word() | FPU_PR_80_BITS);
+        answer = f2xm1(FPU_read_regi(0), &status);
+        if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+            FPU_save_regi(answer, 0);
+        }
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.f2xm1);
+        CONCURRENCY_CYCLES(x87_concurrency.f2xm1);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.f2xm1 * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.f2xm1 * cpu_multi);
+    }
+    return 0;
+}
+
+/*
+ * FYL2X handler: passes ST(0) and ST(1) to fyl2x(), pops the stack and
+ * stores the result in the new ST(0). The computation runs with the
+ * precision-control bits forced to 80-bit. If either operand slot is empty
+ * the condition is reported through FPU_stack_underflow(); pop and
+ * write-back only happen when FPU_exception() returns zero.
+ * Always returns 0.
+ */
+static int
+sf_FYL2X(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(1)) {
+        FPU_stack_underflow(fetchdat, 1, 1);
+    } else {
+        struct float_status_t status;
+        floatx80 answer;
+
+        status = i387cw_to_softfloat_status_word(i387_get_control_word() | FPU_PR_80_BITS);
+        answer = fyl2x(FPU_read_regi(0), FPU_read_regi(1), &status);
+        if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+            FPU_pop();
+            FPU_save_regi(answer, 0);
+        }
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fyl2x);
+        CONCURRENCY_CYCLES(x87_concurrency.fyl2x);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fyl2x * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fyl2x * cpu_multi);
+    }
+    return 0;
+}
+
+/*
+ * FPTAN handler: runs ftan() on ST(0), stores the result in ST(0) and
+ * pushes a second value on top of it.
+ *
+ *  - ST(0) empty, or the slot the push would use not empty: a stack
+ *    underflow/overflow exception is raised; if the invalid-operation
+ *    exception is masked, the default NaN is written to both slots.
+ *  - ftan() returns -1: C2 is set and the stack is left unchanged.
+ *  - otherwise, when FPU_exception() returns zero: the result goes to the
+ *    old ST(0); the pushed value is the result itself if it is a NaN and
+ *    Const_1 if not.
+ *
+ * C1 and C2 are cleared on entry. Always returns 0.
+ */
+static int
+sf_FPTAN(uint32_t fetchdat)
+{
+    const floatx80 floatx80_default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+    struct float_status_t status;
+    floatx80 y;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+    clear_C2();
+
+    if (IS_TAG_EMPTY(0) || !IS_TAG_EMPTY(-1)) {
+        if (IS_TAG_EMPTY(0)) {
+            FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        } else {
+            FPU_exception(fetchdat, FPU_EX_Stack_Overflow, 0);
+        }
+
+        /* masked response: both result slots receive the default NaN */
+        if (is_IA_masked()) {
+            FPU_save_regi(floatx80_default_nan, 0);
+            FPU_push();
+            FPU_save_regi(floatx80_default_nan, 0);
+        }
+    } else {
+        status = i387cw_to_softfloat_status_word(i387_get_control_word() | FPU_PR_80_BITS);
+        y = FPU_read_regi(0);
+        if (ftan(&y, &status) == -1) {
+            fpu_state.swd |= C2;
+        } else {
+            floatx80 pushed;
+
+            /* a NaN result is duplicated; any other result is paired with 1.0 */
+            if (floatx80_is_nan(y)) {
+                pushed = y;
+            } else {
+                pushed = Const_1;
+            }
+            if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+                FPU_save_regi(y, 0);
+                FPU_push();
+                FPU_save_regi(pushed, 0);
+            }
+        }
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fptan);
+        CONCURRENCY_CYCLES(x87_concurrency.fptan);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fptan * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fptan * cpu_multi);
+    }
+    return 0;
+}
+
+/*
+ * FPATAN handler: passes ST(0) and ST(1) to fpatan(), pops the stack and
+ * stores the result in the new ST(0). The computation runs with the
+ * precision-control bits forced to 80-bit.
+ *
+ * C1 is cleared on entry, as in the other transcendental handlers, so that
+ * a stale C1 from an earlier instruction cannot survive this one.
+ *
+ * If either operand slot is empty the condition is reported through
+ * FPU_stack_underflow(); pop and write-back only happen when
+ * FPU_exception() returns zero. Always returns 0.
+ */
+static int
+sf_FPATAN(uint32_t fetchdat)
+{
+    floatx80 a, b, result;
+    struct float_status_t status;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(1)) {
+        FPU_stack_underflow(fetchdat, 1, 1);
+        goto next_ins;
+    }
+    a = FPU_read_regi(0);
+    b = FPU_read_regi(1);
+    status = i387cw_to_softfloat_status_word(i387_get_control_word() | FPU_PR_80_BITS);
+    result = fpatan(a, b, &status);
+    if (! FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_pop();
+        FPU_save_regi(result, 0);
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fpatan) : (x87_timings.fpatan * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fpatan) : (x87_concurrency.fpatan * cpu_multi));
+    return 0;
+}
+
+/*
+ * FXTRACT handler: splits ST(0) with floatx80_extract(), stores the value it
+ * returns (the exponent) in the old ST(0) and pushes the updated operand
+ * (the fraction) on top.
+ *
+ * Stack check, using the same pattern as the other push-producing handlers
+ * in this file: if ST(0) is empty a stack underflow is raised, and if the
+ * slot the push would use is still occupied a stack overflow is raised. In
+ * both cases, when the invalid-operation exception is masked, the default
+ * NaN is written to both result slots.
+ *
+ * C1 is cleared on entry. Results are only written back when
+ * FPU_exception() returns zero. Always returns 0.
+ */
+static int
+sf_FXTRACT(uint32_t fetchdat)
+{
+    const floatx80 floatx80_default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+    struct float_status_t status;
+    floatx80 a, b;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || !IS_TAG_EMPTY(-1)) {
+        if (IS_TAG_EMPTY(0))
+            FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        else
+            FPU_exception(fetchdat, FPU_EX_Stack_Overflow, 0);
+
+        /* The masked response */
+        if (is_IA_masked()) {
+            FPU_save_regi(floatx80_default_nan, 0);
+            FPU_push();
+            FPU_save_regi(floatx80_default_nan, 0);
+        }
+        goto next_ins;
+    }
+
+    status = i387cw_to_softfloat_status_word(i387_get_control_word());
+    a = FPU_read_regi(0);
+    b = floatx80_extract(&a, &status);
+    if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+        FPU_save_regi(b, 0); // exponent
+        FPU_push();
+        FPU_save_regi(a, 0); // fraction
+    }
+
+next_ins:
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fxtract) : (x87_timings.fxtract * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fxtract) : (x87_concurrency.fxtract * cpu_multi));
+    return 0;
+}
+
+/*
+ * FPREM1 handler: computes the remainder of ST(0) by ST(1) with
+ * floatx80_ieee754_remainder() and stores it in ST(0).
+ *
+ * When FPU_exception() returns zero and the helper's return value is not
+ * negative, the condition codes are rewritten: a positive return value sets
+ * only C2; a zero return value maps quotient bits 0, 1 and 2 to C1, C3 and
+ * C0 respectively.
+ *
+ * C1 and C2 are cleared on entry. Empty operands are reported through
+ * FPU_stack_underflow(). Always returns 0.
+ */
+static int
+sf_FPREM1(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+    clear_C2();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(1)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+    } else {
+        struct float_status_t status;
+        floatx80 dividend, divisor, rem;
+        uint64_t quotient = 0;
+        int outcome;
+
+        status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        dividend = FPU_read_regi(0);
+        divisor = FPU_read_regi(1);
+        outcome = floatx80_ieee754_remainder(dividend, divisor, &rem, &quotient, &status);
+        if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+            if (outcome >= 0) {
+                int cc;
+
+                if (outcome != 0) {
+                    cc = C2;
+                } else {
+                    cc = ((quotient & 1) ? C1 : 0)
+                       | ((quotient & 2) ? C3 : 0)
+                       | ((quotient & 4) ? C0 : 0);
+                }
+                setcc(cc);
+            }
+            FPU_save_regi(rem, 0);
+        }
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fprem1);
+        CONCURRENCY_CYCLES(x87_concurrency.fprem1);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fprem1 * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fprem1 * cpu_multi);
+    }
+    return 0;
+}
+
+/*
+ * FPREM handler: computes the remainder of ST(0) by ST(1) with
+ * floatx80_remainder() and stores it in ST(0).
+ *
+ * When FPU_exception() returns zero and the helper's return value is not
+ * negative, the condition codes are rewritten: a positive return value sets
+ * only C2; a zero return value maps quotient bits 0, 1 and 2 to C1, C3 and
+ * C0 respectively.
+ *
+ * C1 and C2 are cleared on entry. Empty operands are reported through
+ * FPU_stack_underflow(). Always returns 0.
+ */
+static int
+sf_FPREM(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+    clear_C2();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(1)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+    } else {
+        struct float_status_t status;
+        floatx80 dividend, divisor, rem;
+        uint64_t quotient = 0;
+        int outcome;
+
+        status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        dividend = FPU_read_regi(0);
+        divisor = FPU_read_regi(1);
+        outcome = floatx80_remainder(dividend, divisor, &rem, &quotient, &status);
+        if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+            if (outcome >= 0) {
+                int cc;
+
+                if (outcome != 0) {
+                    cc = C2;
+                } else {
+                    cc = ((quotient & 1) ? C1 : 0)
+                       | ((quotient & 2) ? C3 : 0)
+                       | ((quotient & 4) ? C0 : 0);
+                }
+                setcc(cc);
+            }
+            FPU_save_regi(rem, 0);
+        }
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fprem);
+        CONCURRENCY_CYCLES(x87_concurrency.fprem);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fprem * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fprem * cpu_multi);
+    }
+    return 0;
+}
+
+/*
+ * FYL2XP1 handler: passes ST(0) and ST(1) to fyl2xp1(), writes the result
+ * to ST(1) and pops the stack so that it becomes the new ST(0). The
+ * computation runs with the precision-control bits forced to 80-bit. If
+ * either operand slot is empty the condition is reported through
+ * FPU_stack_underflow(); write-back and pop only happen when
+ * FPU_exception() returns zero.
+ * Always returns 0.
+ */
+static int
+sf_FYL2XP1(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(1)) {
+        FPU_stack_underflow(fetchdat, 1, 1);
+    } else {
+        struct float_status_t status;
+        floatx80 answer;
+
+        status = i387cw_to_softfloat_status_word(i387_get_control_word() | FPU_PR_80_BITS);
+        answer = fyl2xp1(FPU_read_regi(0), FPU_read_regi(1), &status);
+        if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+            FPU_save_regi(answer, 1);
+            FPU_pop();
+        }
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fyl2xp1);
+        CONCURRENCY_CYCLES(x87_concurrency.fyl2xp1);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fyl2xp1 * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fyl2xp1 * cpu_multi);
+    }
+    return 0;
+}
+
+#ifndef FPU_8087
+/*
+ * FSINCOS handler: runs fsincos() on ST(0), stores the sine in the old
+ * ST(0) and pushes the cosine on top.
+ *
+ *  - ST(0) empty, or the slot the push would use not empty: a stack
+ *    underflow/overflow exception is raised; if the invalid-operation
+ *    exception is masked, the default NaN is written to both slots.
+ *  - fsincos() returns -1: C2 is set and the stack is left unchanged.
+ *  - otherwise both results are written when FPU_exception() returns zero.
+ *
+ * C1 and C2 are cleared on entry. Always returns 0.
+ */
+static int
+sf_FSINCOS(uint32_t fetchdat)
+{
+    const floatx80 floatx80_default_nan = packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction);
+    struct float_status_t status;
+    floatx80 arg, sine, cosine;
+
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+    clear_C2();
+
+    if (IS_TAG_EMPTY(0) || !IS_TAG_EMPTY(-1)) {
+        if (IS_TAG_EMPTY(0)) {
+            FPU_exception(fetchdat, FPU_EX_Stack_Underflow, 0);
+        } else {
+            FPU_exception(fetchdat, FPU_EX_Stack_Overflow, 0);
+        }
+
+        /* masked response: both result slots receive the default NaN */
+        if (is_IA_masked()) {
+            FPU_save_regi(floatx80_default_nan, 0);
+            FPU_push();
+            FPU_save_regi(floatx80_default_nan, 0);
+        }
+    } else {
+        status = i387cw_to_softfloat_status_word(i387_get_control_word() | FPU_PR_80_BITS);
+        arg = FPU_read_regi(0);
+        if (fsincos(arg, &sine, &cosine, &status) == -1) {
+            fpu_state.swd |= C2;
+        } else if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+            FPU_save_regi(sine, 0);
+            FPU_push();
+            FPU_save_regi(cosine, 0);
+        }
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fsincos);
+        CONCURRENCY_CYCLES(x87_concurrency.fsincos);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fsincos * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fsincos * cpu_multi);
+    }
+    return 0;
+}
+#endif
+
+/*
+ * FSCALE handler: passes ST(0) and ST(1) to floatx80_scale() and stores the
+ * result in ST(0); the stack depth does not change. If either operand slot
+ * is empty the condition is reported through FPU_stack_underflow(); the
+ * result is only written back when FPU_exception() returns zero.
+ * C1 is cleared on entry. Always returns 0.
+ */
+static int
+sf_FSCALE(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+
+    if (IS_TAG_EMPTY(0) || IS_TAG_EMPTY(1)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+    } else {
+        struct float_status_t status;
+        floatx80 scaled;
+
+        status = i387cw_to_softfloat_status_word(i387_get_control_word());
+        scaled = floatx80_scale(FPU_read_regi(0), FPU_read_regi(1), &status);
+        if (!FPU_exception(fetchdat, status.float_exception_flags, 0)) {
+            FPU_save_regi(scaled, 0);
+        }
+    }
+
+    /* FPU types below FPU_487SX have their cycle counts scaled by cpu_multi. */
+    if (fpu_type >= FPU_487SX) {
+        CLOCK_CYCLES_FPU(x87_timings.fscale);
+        CONCURRENCY_CYCLES(x87_concurrency.fscale);
+    } else {
+        CLOCK_CYCLES_FPU(x87_timings.fscale * cpu_multi);
+        CONCURRENCY_CYCLES(x87_concurrency.fscale * cpu_multi);
+    }
+    return 0;
+}
+
+#ifndef FPU_8087
+/*
+ * FSIN (softfloat path): replace ST(0) with the value produced by fsin().
+ *
+ * C1 and C2 are cleared up front.  An empty ST(0) is reported through
+ * FPU_stack_underflow().  When fsin() returns -1 the register is left
+ * untouched and C2 is set in the status word (presumably "operand out of
+ * range" - confirm against fsin()); otherwise the result is written back
+ * only when FPU_exception() returns zero for the accumulated flags.
+ *
+ * Returns 0 on every path visible in this function.
+ */
+static int
+sf_FSIN(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+    clear_C2();
+
+    if (IS_TAG_EMPTY(0)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+    } else {
+        /* The control word is OR-ed with FPU_PR_80_BITS before conversion. */
+        struct float_status_t fp_status = i387cw_to_softfloat_status_word(i387_get_control_word() | FPU_PR_80_BITS);
+        floatx80              st0       = FPU_read_regi(0);
+
+        if (fsin(&st0, &fp_status) == -1)
+            fpu_state.swd |= C2;
+        else if (!FPU_exception(fetchdat, fp_status.float_exception_flags, 0))
+            FPU_save_regi(st0, 0);
+    }
+
+    /* Cycle accounting: FPU types below FPU_487SX scale by cpu_multi. */
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fsin_cos) : (x87_timings.fsin_cos * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fsin_cos) : (x87_concurrency.fsin_cos * cpu_multi));
+    return 0;
+}
+
+/*
+ * FCOS (softfloat path): replace ST(0) with the value produced by fcos().
+ *
+ * C1 and C2 are cleared up front.  An empty ST(0) is reported through
+ * FPU_stack_underflow().  When fcos() returns -1 the register is left
+ * untouched and C2 is set in the status word (presumably "operand out of
+ * range" - confirm against fcos()); otherwise the result is written back
+ * only when FPU_exception() returns zero for the accumulated flags.
+ *
+ * Returns 0 on every path visible in this function.
+ */
+static int
+sf_FCOS(uint32_t fetchdat)
+{
+    FP_ENTER();
+    cpu_state.pc++;
+    clear_C1();
+    clear_C2();
+
+    if (IS_TAG_EMPTY(0)) {
+        FPU_stack_underflow(fetchdat, 0, 0);
+    } else {
+        /* The control word is OR-ed with FPU_PR_80_BITS before conversion. */
+        struct float_status_t fp_status = i387cw_to_softfloat_status_word(i387_get_control_word() | FPU_PR_80_BITS);
+        floatx80              st0       = FPU_read_regi(0);
+
+        if (fcos(&st0, &fp_status) == -1)
+            fpu_state.swd |= C2;
+        else if (!FPU_exception(fetchdat, fp_status.float_exception_flags, 0))
+            FPU_save_regi(st0, 0);
+    }
+
+    /* Cycle accounting: FPU types below FPU_487SX scale by cpu_multi. */
+    CLOCK_CYCLES_FPU((fpu_type >= FPU_487SX) ? (x87_timings.fsin_cos) : (x87_timings.fsin_cos * cpu_multi));
+    CONCURRENCY_CYCLES((fpu_type >= FPU_487SX) ? (x87_concurrency.fsin_cos) : (x87_concurrency.fsin_cos * cpu_multi));
+    return 0;
+}
+#endif