diff --git a/CMakeLists.txt b/CMakeLists.txt
index e542e217c5..b002641415 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -397,6 +397,7 @@ target_link_libraries(core_interface INTERFACE warn_interface)
 if(MSVC)
   try_append_cxx_flags("/W3" TARGET warn_interface SKIP_LINK)
   try_append_cxx_flags("/wd4018" TARGET warn_interface SKIP_LINK)
+  try_append_cxx_flags("/wd4146" TARGET warn_interface SKIP_LINK)
   try_append_cxx_flags("/wd4244" TARGET warn_interface SKIP_LINK)
   try_append_cxx_flags("/wd4267" TARGET warn_interface SKIP_LINK)
   try_append_cxx_flags("/wd4715" TARGET warn_interface SKIP_LINK)
diff --git a/src/arith_uint256.cpp b/src/arith_uint256.cpp
index 0d5b3d5b0e..850fc624b8 100644
--- a/src/arith_uint256.cpp
+++ b/src/arith_uint256.cpp
@@ -229,3 +229,7 @@ arith_uint256 UintToArith256(const uint256 &a)
         b.pn[x] = ReadLE32(a.begin() + x*4);
     return b;
 }
+
+// Explicit instantiations for base_uint<6144> (used in test/fuzz/muhash.cpp).
+template base_uint<6144>& base_uint<6144>::operator*=(const base_uint<6144>& b);
+template base_uint<6144>& base_uint<6144>::operator/=(const base_uint<6144>& b);
diff --git a/src/bench/crypto_hash.cpp b/src/bench/crypto_hash.cpp
index a0799aea7a..2f1ff56438 100644
--- a/src/bench/crypto_hash.cpp
+++ b/src/bench/crypto_hash.cpp
@@ -249,6 +249,19 @@ static void MuHashPrecompute(benchmark::Bench& bench)
     });
 }
 
+static void MuHashFinalize(benchmark::Bench& bench)
+{
+    FastRandomContext rng(true);
+    MuHash3072 acc{rng.randbytes(32)};
+    acc /= MuHash3072{rng.rand256()};
+
+    bench.run([&] {
+        uint256 out;
+        acc.Finalize(out);
+        acc /= MuHash3072{out};
+    });
+}
+
 BENCHMARK(BenchRIPEMD160, benchmark::PriorityLevel::HIGH);
 BENCHMARK(SHA1, benchmark::PriorityLevel::HIGH);
 BENCHMARK(SHA256_STANDARD, benchmark::PriorityLevel::HIGH);
@@ -272,3 +285,4 @@ BENCHMARK(MuHash, benchmark::PriorityLevel::HIGH);
 BENCHMARK(MuHashMul, benchmark::PriorityLevel::HIGH);
 BENCHMARK(MuHashDiv, benchmark::PriorityLevel::HIGH);
 BENCHMARK(MuHashPrecompute, benchmark::PriorityLevel::HIGH);
+BENCHMARK(MuHashFinalize, benchmark::PriorityLevel::HIGH);
diff --git a/src/crypto/muhash.cpp b/src/crypto/muhash.cpp
index 9c35b0689d..4f502269f5 100644
--- a/src/crypto/muhash.cpp
+++ b/src/crypto/muhash.cpp
@@ -7,7 +7,9 @@
 #include <crypto/chacha20.h>
 #include <crypto/common.h>
 #include <hash.h>
+#include <util/check.h>
 
+#include <bit>
 #include <cassert>
 #include <cstdio>
 #include <limits>
@@ -15,10 +17,22 @@
 namespace {
 
 using limb_t = Num3072::limb_t;
+using signed_limb_t = Num3072::signed_limb_t;
 using double_limb_t = Num3072::double_limb_t;
+using signed_double_limb_t = Num3072::signed_double_limb_t;
 constexpr int LIMB_SIZE = Num3072::LIMB_SIZE;
+constexpr int SIGNED_LIMB_SIZE = Num3072::SIGNED_LIMB_SIZE;
+constexpr int LIMBS = Num3072::LIMBS;
+constexpr int SIGNED_LIMBS = Num3072::SIGNED_LIMBS;
+constexpr int FINAL_LIMB_POSITION = 3072 / SIGNED_LIMB_SIZE;
+constexpr int FINAL_LIMB_MODULUS_BITS = 3072 % SIGNED_LIMB_SIZE;
+constexpr limb_t MAX_LIMB = (limb_t)(-1);
+constexpr limb_t MAX_SIGNED_LIMB = (((limb_t)1) << SIGNED_LIMB_SIZE) - 1;
 /** 2^3072 - 1103717, the largest 3072-bit safe prime number, is used as the modulus. */
 constexpr limb_t MAX_PRIME_DIFF = 1103717;
+/** The modular inverse of (2**3072 - MAX_PRIME_DIFF) mod (MAX_SIGNED_LIMB + 1). */
+constexpr limb_t MODULUS_INVERSE = limb_t(0x70a1421da087d93);
+
 
 /** Extract the lowest limb of [c0,c1,c2] into n, and left shift the number by 1 limb. */
 inline void extract3(limb_t& c0, limb_t& c1, limb_t& c2, limb_t& n)
@@ -72,23 +86,6 @@ inline void muladd3(limb_t& c0, limb_t& c1, limb_t& c2, const limb_t& a, const l
     c2 += (c1 < th) ? 1 : 0;
 }
 
-/** [c0,c1,c2] += 2 * a * b */
-inline void muldbladd3(limb_t& c0, limb_t& c1, limb_t& c2, const limb_t& a, const limb_t& b)
-{
-    double_limb_t t = (double_limb_t)a * b;
-    limb_t th = t >> LIMB_SIZE;
-    limb_t tl = t;
-
-    c0 += tl;
-    limb_t tt = th + ((c0 < tl) ? 1 : 0);
-    c1 += tt;
-    c2 += (c1 < tt) ? 1 : 0;
-    c0 += tl;
-    th += (c0 < tl) ? 1 : 0;
-    c1 += th;
-    c2 += (c1 < th) ? 1 : 0;
-}
-
 /**
  * Add limb a to [c0,c1]: [c0,c1] += a. Then extract the lowest
  * limb of [c0,c1] into n, and left shift the number by 1 limb.
@@ -103,8 +100,7 @@ inline void addnextract2(limb_t& c0, limb_t& c1, const limb_t& a, limb_t& n)
         c1 += 1;
 
         // Handle case when c1 has overflown
-        if (c1 == 0)
-            c2 = 1;
+        if (c1 == 0) c2 = 1;
     }
 
     // extract
@@ -113,13 +109,6 @@ inline void addnextract2(limb_t& c0, limb_t& c1, const limb_t& a, limb_t& n)
     c1 = c2;
 }
 
-/** in_out = in_out^(2^sq) * mul */
-inline void square_n_mul(Num3072& in_out, const int sq, const Num3072& mul)
-{
-    for (int j = 0; j < sq; ++j) in_out.Square();
-    in_out.Multiply(mul);
-}
-
 } // namespace
 
 /** Indicates whether d is larger than the modulus. */
@@ -141,41 +130,326 @@ void Num3072::FullReduce()
     }
 }
 
-Num3072 Num3072::GetInverse() const
+namespace {
+/** A type representing a number in signed limb representation. */
+struct Num3072Signed
 {
-    // For fast exponentiation a sliding window exponentiation with repunit
-    // precomputation is utilized. See "Fast Point Decompression for Standard
-    // Elliptic Curves" (Brumley, Järvinen, 2008).
+    /** The represented value is sum(limbs[i]*2^(SIGNED_LIMB_SIZE*i), i=0..SIGNED_LIMBS-1).
+     *  Note that limbs may be negative, or exceed 2^SIGNED_LIMB_SIZE-1. */
+    signed_limb_t limbs[SIGNED_LIMBS];
 
-    Num3072 p[12]; // p[i] = a^(2^(2^i)-1)
-    Num3072 out;
-
-    p[0] = *this;
-
-    for (int i = 0; i < 11; ++i) {
-        p[i + 1] = p[i];
-        for (int j = 0; j < (1 << i); ++j) p[i + 1].Square();
-        p[i + 1].Multiply(p[i]);
+    /** Construct a Num3072Signed with value 0. */
+    Num3072Signed()
+    {
+        memset(limbs, 0, sizeof(limbs));
     }
 
-    out = p[11];
+    /** Convert a Num3072 to a Num3072Signed. Output will be normalized and in
+     *  range 0..2^3072-1. */
+    void FromNum3072(const Num3072& in)
+    {
+        double_limb_t c = 0;
+        int b = 0, outpos = 0;
+        for (int i = 0; i < LIMBS; ++i) {
+            c += double_limb_t{in.limbs[i]} << b;
+            b += LIMB_SIZE;
+            while (b >= SIGNED_LIMB_SIZE) {
+                limbs[outpos++] = limb_t(c) & MAX_SIGNED_LIMB;
+                c >>= SIGNED_LIMB_SIZE;
+                b -= SIGNED_LIMB_SIZE;
+            }
+        }
+        Assume(outpos == SIGNED_LIMBS - 1);
+        limbs[SIGNED_LIMBS - 1] = c;
+        c >>= SIGNED_LIMB_SIZE;
+        Assume(c == 0);
+    }
 
-    square_n_mul(out, 512, p[9]);
-    square_n_mul(out, 256, p[8]);
-    square_n_mul(out, 128, p[7]);
-    square_n_mul(out, 64, p[6]);
-    square_n_mul(out, 32, p[5]);
-    square_n_mul(out, 8, p[3]);
-    square_n_mul(out, 2, p[1]);
-    square_n_mul(out, 1, p[0]);
-    square_n_mul(out, 5, p[2]);
-    square_n_mul(out, 3, p[0]);
-    square_n_mul(out, 2, p[0]);
-    square_n_mul(out, 4, p[0]);
-    square_n_mul(out, 4, p[1]);
-    square_n_mul(out, 3, p[0]);
+    /** Convert a Num3072Signed to a Num3072. Input must be in range 0..modulus-1. */
+    void ToNum3072(Num3072& out) const
+    {
+        double_limb_t c = 0;
+        int b = 0, outpos = 0;
+        for (int i = 0; i < SIGNED_LIMBS; ++i) {
+            c += double_limb_t(limbs[i]) << b;
+            b += SIGNED_LIMB_SIZE;
+            if (b >= LIMB_SIZE) {
+                out.limbs[outpos++] = c;
+                c >>= LIMB_SIZE;
+                b -= LIMB_SIZE;
+            }
+        }
+        Assume(outpos == LIMBS);
+        Assume(c == 0);
+    }
 
-    return out;
+    /** Take a Num3072Signed in range 1-2*2^3072..2^3072-1, and:
+     *  - optionally negate it (if negate is true)
+     *  - reduce it modulo the modulus (2^3072 - MAX_PRIME_DIFF)
+     *  - produce output with all limbs in range 0..2^SIGNED_LIMB_SIZE-1
+     */
+    void Normalize(bool negate)
+    {
+        // Add modulus if this was negative. This brings the range of *this to 1-2^3072..2^3072-1.
+        signed_limb_t cond_add = limbs[SIGNED_LIMBS-1] >> (LIMB_SIZE-1); // -1 if this is negative; 0 otherwise
+        limbs[0] += signed_limb_t(-MAX_PRIME_DIFF) & cond_add;
+        limbs[FINAL_LIMB_POSITION] += (signed_limb_t(1) << FINAL_LIMB_MODULUS_BITS) & cond_add;
+        // Next negate all limbs if negate was set. This does not change the range of *this.
+        signed_limb_t cond_negate = -signed_limb_t(negate); // -1 if this negate is true; 0 otherwise
+        for (int i = 0; i < SIGNED_LIMBS; ++i) {
+            limbs[i] = (limbs[i] ^ cond_negate) - cond_negate;
+        }
+        // Perform carry (make all limbs except the top one be in range 0..2^SIGNED_LIMB_SIZE-1).
+        for (int i = 0; i < SIGNED_LIMBS - 1; ++i) {
+            limbs[i + 1] += limbs[i] >> SIGNED_LIMB_SIZE;
+            limbs[i] &= MAX_SIGNED_LIMB;
+        }
+        // Again add modulus if *this was negative. This brings the range of *this to 0..2^3072-1.
+        cond_add = limbs[SIGNED_LIMBS-1] >> (LIMB_SIZE-1); // -1 if this is negative; 0 otherwise
+        limbs[0] += signed_limb_t(-MAX_PRIME_DIFF) & cond_add;
+        limbs[FINAL_LIMB_POSITION] += (signed_limb_t(1) << FINAL_LIMB_MODULUS_BITS) & cond_add;
+        // Perform another carry. Now all limbs are in range 0..2^SIGNED_LIMB_SIZE-1.
+        for (int i = 0; i < SIGNED_LIMBS - 1; ++i) {
+            limbs[i + 1] += limbs[i] >> SIGNED_LIMB_SIZE;
+            limbs[i] &= MAX_SIGNED_LIMB;
+        }
+    }
+};
+
+/** 2x2 transformation matrix with signed_limb_t elements. */
+struct SignedMatrix
+{
+    signed_limb_t u, v, q, r;
+};
+
+/** Compute the transformation matrix for SIGNED_LIMB_SIZE divsteps.
+ *
+ * eta: initial eta value
+ * f:   bottom SIGNED_LIMB_SIZE bits of initial f value
+ * g:   bottom SIGNED_LIMB_SIZE bits of initial g value
+ * out: resulting transformation matrix, scaled by 2^SIGNED_LIMB_SIZE
+ * return: eta value after SIGNED_LIMB_SIZE divsteps
+ */
+inline limb_t ComputeDivstepMatrix(signed_limb_t eta, limb_t f, limb_t g, SignedMatrix& out)
+{
+    /** inv256[i] = -1/(2*i+1) (mod 256) */
+    static const uint8_t NEGINV256[128] = {
+        0xFF, 0x55, 0x33, 0x49, 0xC7, 0x5D, 0x3B, 0x11, 0x0F, 0xE5, 0xC3, 0x59,
+        0xD7, 0xED, 0xCB, 0x21, 0x1F, 0x75, 0x53, 0x69, 0xE7, 0x7D, 0x5B, 0x31,
+        0x2F, 0x05, 0xE3, 0x79, 0xF7, 0x0D, 0xEB, 0x41, 0x3F, 0x95, 0x73, 0x89,
+        0x07, 0x9D, 0x7B, 0x51, 0x4F, 0x25, 0x03, 0x99, 0x17, 0x2D, 0x0B, 0x61,
+        0x5F, 0xB5, 0x93, 0xA9, 0x27, 0xBD, 0x9B, 0x71, 0x6F, 0x45, 0x23, 0xB9,
+        0x37, 0x4D, 0x2B, 0x81, 0x7F, 0xD5, 0xB3, 0xC9, 0x47, 0xDD, 0xBB, 0x91,
+        0x8F, 0x65, 0x43, 0xD9, 0x57, 0x6D, 0x4B, 0xA1, 0x9F, 0xF5, 0xD3, 0xE9,
+        0x67, 0xFD, 0xDB, 0xB1, 0xAF, 0x85, 0x63, 0xF9, 0x77, 0x8D, 0x6B, 0xC1,
+        0xBF, 0x15, 0xF3, 0x09, 0x87, 0x1D, 0xFB, 0xD1, 0xCF, 0xA5, 0x83, 0x19,
+        0x97, 0xAD, 0x8B, 0xE1, 0xDF, 0x35, 0x13, 0x29, 0xA7, 0x3D, 0x1B, 0xF1,
+        0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01
+    };
+    // Coefficients of returned SignedMatrix; starts off as identity matrix. */
+    limb_t u = 1, v = 0, q = 0, r = 1;
+    // The number of divsteps still left.
+    int i = SIGNED_LIMB_SIZE;
+    while (true) {
+        /* Use a sentinel bit to count zeros only up to i. */
+        int zeros = std::countr_zero(g | (MAX_LIMB << i));
+        /* Perform zeros divsteps at once; they all just divide g by two. */
+        g >>= zeros;
+        u <<= zeros;
+        v <<= zeros;
+        eta -= zeros;
+        i -= zeros;
+         /* We're done once we've performed SIGNED_LIMB_SIZE divsteps. */
+        if (i == 0) break;
+        /* If eta is negative, negate it and replace f,g with g,-f. */
+        if (eta < 0) {
+            limb_t tmp;
+            eta = -eta;
+            tmp = f; f = g; g = -tmp;
+            tmp = u; u = q; q = -tmp;
+            tmp = v; v = r; r = -tmp;
+        }
+        /* eta is now >= 0. In what follows we're going to cancel out the bottom bits of g. No more
+         * than i can be cancelled out (as we'd be done before that point), and no more than eta+1
+         * can be done as its sign will flip once that happens. */
+        int limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
+        /* m is a mask for the bottom min(limit, 8) bits (our table only supports 8 bits). */
+        limb_t m = (MAX_LIMB >> (LIMB_SIZE - limit)) & 255U;
+        /* Find what multiple of f must be added to g to cancel its bottom min(limit, 8) bits. */
+        limb_t w = (g * NEGINV256[(f >> 1) & 127]) & m;
+        /* Do so. */
+        g += f * w;
+        q += u * w;
+        r += v * w;
+    }
+    out.u = (signed_limb_t)u;
+    out.v = (signed_limb_t)v;
+    out.q = (signed_limb_t)q;
+    out.r = (signed_limb_t)r;
+    return eta;
+}
+
+/** Apply matrix t/2^SIGNED_LIMB_SIZE to vector [d,e], modulo modulus.
+ *
+ * On input and output, d and e are in range 1-2*modulus..modulus-1.
+ */
+inline void UpdateDE(Num3072Signed& d, Num3072Signed& e, const SignedMatrix& t)
+{
+    const signed_limb_t u = t.u, v=t.v, q=t.q, r=t.r;
+
+    /* [md,me] start as zero; plus [u,q] if d is negative; plus [v,r] if e is negative. */
+    signed_limb_t sd = d.limbs[SIGNED_LIMBS - 1] >> (LIMB_SIZE - 1);
+    signed_limb_t se = e.limbs[SIGNED_LIMBS - 1] >> (LIMB_SIZE - 1);
+    signed_limb_t md = (u & sd) + (v & se);
+    signed_limb_t me = (q & sd) + (r & se);
+    /* Begin computing t*[d,e]. */
+    signed_limb_t di = d.limbs[0], ei = e.limbs[0];
+    signed_double_limb_t cd = (signed_double_limb_t)u * di + (signed_double_limb_t)v * ei;
+    signed_double_limb_t ce = (signed_double_limb_t)q * di + (signed_double_limb_t)r * ei;
+    /* Correct md,me so that t*[d,e]+modulus*[md,me] has SIGNED_LIMB_SIZE zero bottom bits. */
+    md -= (MODULUS_INVERSE * limb_t(cd) + md) & MAX_SIGNED_LIMB;
+    me -= (MODULUS_INVERSE * limb_t(ce) + me) & MAX_SIGNED_LIMB;
+    /* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */
+    cd -= (signed_double_limb_t)1103717 * md;
+    ce -= (signed_double_limb_t)1103717 * me;
+    /* Verify that the low SIGNED_LIMB_SIZE bits of the computation are indeed zero, and then throw them away. */
+    Assume((cd & MAX_SIGNED_LIMB) == 0);
+    Assume((ce & MAX_SIGNED_LIMB) == 0);
+    cd >>= SIGNED_LIMB_SIZE;
+    ce >>= SIGNED_LIMB_SIZE;
+    /* Now iteratively compute limb i=1..SIGNED_LIMBS-2 of t*[d,e]+modulus*[md,me], and store them in output
+     * limb i-1 (shifting down by SIGNED_LIMB_SIZE bits). The corresponding limbs in modulus are all zero,
+     * so modulus/md/me are not actually involved here. */
+    for (int i = 1; i < SIGNED_LIMBS - 1; ++i) {
+        di = d.limbs[i];
+        ei = e.limbs[i];
+        cd += (signed_double_limb_t)u * di + (signed_double_limb_t)v * ei;
+        ce += (signed_double_limb_t)q * di + (signed_double_limb_t)r * ei;
+        d.limbs[i - 1] = (signed_limb_t)cd & MAX_SIGNED_LIMB; cd >>= SIGNED_LIMB_SIZE;
+        e.limbs[i - 1] = (signed_limb_t)ce & MAX_SIGNED_LIMB; ce >>= SIGNED_LIMB_SIZE;
+    }
+    /* Compute limb SIGNED_LIMBS-1 of t*[d,e]+modulus*[md,me], and store it in output limb SIGNED_LIMBS-2. */
+    di = d.limbs[SIGNED_LIMBS - 1];
+    ei = e.limbs[SIGNED_LIMBS - 1];
+    cd += (signed_double_limb_t)u * di + (signed_double_limb_t)v * ei;
+    ce += (signed_double_limb_t)q * di + (signed_double_limb_t)r * ei;
+    cd += (signed_double_limb_t)md << FINAL_LIMB_MODULUS_BITS;
+    ce += (signed_double_limb_t)me << FINAL_LIMB_MODULUS_BITS;
+    d.limbs[SIGNED_LIMBS - 2] = (signed_limb_t)cd & MAX_SIGNED_LIMB; cd >>= SIGNED_LIMB_SIZE;
+    e.limbs[SIGNED_LIMBS - 2] = (signed_limb_t)ce & MAX_SIGNED_LIMB; ce >>= SIGNED_LIMB_SIZE;
+    /* What remains goes into output limb SINGED_LIMBS-1 */
+    d.limbs[SIGNED_LIMBS - 1] = (signed_limb_t)cd;
+    e.limbs[SIGNED_LIMBS - 1] = (signed_limb_t)ce;
+}
+
+/** Apply matrix t/2^SIGNED_LIMB_SIZE to vector (f,g).
+ *
+ * The matrix t must be chosen such that t*(f,g) results in multiples of 2^SIGNED_LIMB_SIZE.
+ * This is the case for matrices computed by ComputeDivstepMatrix().
+ */
+inline void UpdateFG(Num3072Signed& f, Num3072Signed& g, const SignedMatrix& t, int len)
+{
+    const signed_limb_t u = t.u, v=t.v, q=t.q, r=t.r;
+
+    signed_limb_t fi, gi;
+    signed_double_limb_t cf, cg;
+    /* Start computing t*[f,g]. */
+    fi = f.limbs[0];
+    gi = g.limbs[0];
+    cf = (signed_double_limb_t)u * fi + (signed_double_limb_t)v * gi;
+    cg = (signed_double_limb_t)q * fi + (signed_double_limb_t)r * gi;
+    /* Verify that the bottom SIGNED_LIMB_BITS bits of the result are zero, and then throw them away. */
+    Assume((cf & MAX_SIGNED_LIMB) == 0);
+    Assume((cg & MAX_SIGNED_LIMB) == 0);
+    cf >>= SIGNED_LIMB_SIZE;
+    cg >>= SIGNED_LIMB_SIZE;
+    /* Now iteratively compute limb i=1..SIGNED_LIMBS-1 of t*[f,g], and store them in output limb i-1 (shifting
+     * down by SIGNED_LIMB_BITS bits). */
+    for (int i = 1; i < len; ++i) {
+        fi = f.limbs[i];
+        gi = g.limbs[i];
+        cf += (signed_double_limb_t)u * fi + (signed_double_limb_t)v * gi;
+        cg += (signed_double_limb_t)q * fi + (signed_double_limb_t)r * gi;
+        f.limbs[i - 1] = (signed_limb_t)cf & MAX_SIGNED_LIMB; cf >>= SIGNED_LIMB_SIZE;
+        g.limbs[i - 1] = (signed_limb_t)cg & MAX_SIGNED_LIMB; cg >>= SIGNED_LIMB_SIZE;
+    }
+    /* What remains is limb SIGNED_LIMBS of t*[f,g]; store it as output limb SIGNED_LIMBS-1. */
+    f.limbs[len - 1] = (signed_limb_t)cf;
+    g.limbs[len - 1] = (signed_limb_t)cg;
+
+}
+} // namespace
+
+Num3072 Num3072::GetInverse() const
+{
+    // Compute a modular inverse based on a variant of the safegcd algorithm:
+    // - Paper: https://gcd.cr.yp.to/papers.html
+    // - Inspired by this code in libsecp256k1:
+    //   https://github.com/bitcoin-core/secp256k1/blob/master/src/modinv32_impl.h
+    // - Explanation of the algorithm:
+    //   https://github.com/bitcoin-core/secp256k1/blob/master/doc/safegcd_implementation.md
+
+    // Local variables d, e, f, g:
+    // - f and g are the variables whose gcd we compute (despite knowing the answer is 1):
+    //   - f is always odd, and initialized as modulus
+    //   - g is initialized as *this (called x in what follows)
+    // - d and e are the numbers for which at every step it is the case that:
+    //   - f = d * x mod modulus; d is initialized as 0
+    //   - g = e * x mod modulus; e is initialized as 1
+    Num3072Signed d, e, f, g;
+    e.limbs[0] = 1;
+    // F is initialized as modulus, which in signed limb representation can be expressed
+    // simply as 2^3072 + -MAX_PRIME_DIFF.
+    f.limbs[0] = -MAX_PRIME_DIFF;
+    f.limbs[FINAL_LIMB_POSITION] = ((limb_t)1) << FINAL_LIMB_MODULUS_BITS;
+    g.FromNum3072(*this);
+    int len = SIGNED_LIMBS; //!< The number of significant limbs in f and g
+    signed_limb_t eta = -1; //!< State to track knowledge about ratio of f and g
+    // Perform divsteps on [f,g] until g=0 is reached, keeping (d,e) synchronized with them.
+    while (true) {
+        // Compute transformation matrix t that represents the next SIGNED_LIMB_SIZE divsteps
+        // to apply. This can be computed from just the bottom limb of f and g, and eta.
+        SignedMatrix t;
+        eta = ComputeDivstepMatrix(eta, f.limbs[0], g.limbs[0], t);
+        // Apply that transformation matrix to the full [f,g] vector.
+        UpdateFG(f, g, t, len);
+        // Apply that transformation matrix to the full [d,e] vector (mod modulus).
+        UpdateDE(d, e, t);
+
+        // Check if g is zero.
+        if (g.limbs[0] == 0) {
+            signed_limb_t cond = 0;
+            for (int j = 1; j < len; ++j) {
+                cond |= g.limbs[j];
+            }
+            // If so, we're done.
+            if (cond == 0) break;
+        }
+
+        // Check if the top limbs of both f and g are both 0 or -1.
+        signed_limb_t fn = f.limbs[len - 1], gn = g.limbs[len - 1];
+        signed_limb_t cond = ((signed_limb_t)len - 2) >> (LIMB_SIZE - 1);
+        cond |= fn ^ (fn >> (LIMB_SIZE - 1));
+        cond |= gn ^ (gn >> (LIMB_SIZE - 1));
+        if (cond == 0) {
+            // If so, drop the top limb, shrinking the size of f and g, by
+            // propagating the sign to the previous limb.
+            f.limbs[len - 2] |= (limb_t)f.limbs[len - 1] << SIGNED_LIMB_SIZE;
+            g.limbs[len - 2] |= (limb_t)g.limbs[len - 1] << SIGNED_LIMB_SIZE;
+            --len;
+        }
+    }
+    // At some point, [f,g] will have been rewritten into [f',0], such that gcd(f,g) = gcd(f',0).
+    // This is proven in the paper. As f started out being modulus, a prime number, we know that
+    // gcd is 1, and thus f' is 1 or -1.
+    Assume((f.limbs[0] & MAX_SIGNED_LIMB) == 1 || (f.limbs[0] & MAX_SIGNED_LIMB) == MAX_SIGNED_LIMB);
+    // As we've maintained the invariant that f = d * x mod modulus, we get d/f mod modulus is the
+    // modular inverse of x we're looking for. As f is 1 or -1, it is also true that d/f = d*f.
+    // Normalize d to prepare it for output, while negating it if f is negative.
+    d.Normalize(f.limbs[len - 1] >> (LIMB_SIZE  - 1));
+    Num3072 ret;
+    d.ToNum3072(ret);
+    return ret;
 }
 
 void Num3072::Multiply(const Num3072& a)
@@ -215,43 +489,6 @@ void Num3072::Multiply(const Num3072& a)
     if (c0) this->FullReduce();
 }
 
-void Num3072::Square()
-{
-    limb_t c0 = 0, c1 = 0, c2 = 0;
-    Num3072 tmp;
-
-    /* Compute limbs 0..N-2 of this*this into tmp, including one reduction. */
-    for (int j = 0; j < LIMBS - 1; ++j) {
-        limb_t d0 = 0, d1 = 0, d2 = 0;
-        for (int i = 0; i < (LIMBS - 1 - j) / 2; ++i) muldbladd3(d0, d1, d2, this->limbs[i + j + 1], this->limbs[LIMBS - 1 - i]);
-        if ((j + 1) & 1) muladd3(d0, d1, d2, this->limbs[(LIMBS - 1 - j) / 2 + j + 1], this->limbs[LIMBS - 1 - (LIMBS - 1 - j) / 2]);
-        mulnadd3(c0, c1, c2, d0, d1, d2, MAX_PRIME_DIFF);
-        for (int i = 0; i < (j + 1) / 2; ++i) muldbladd3(c0, c1, c2, this->limbs[i], this->limbs[j - i]);
-        if ((j + 1) & 1) muladd3(c0, c1, c2, this->limbs[(j + 1) / 2], this->limbs[j - (j + 1) / 2]);
-        extract3(c0, c1, c2, tmp.limbs[j]);
-    }
-
-    assert(c2 == 0);
-    for (int i = 0; i < LIMBS / 2; ++i) muldbladd3(c0, c1, c2, this->limbs[i], this->limbs[LIMBS - 1 - i]);
-    extract3(c0, c1, c2, tmp.limbs[LIMBS - 1]);
-
-    /* Perform a second reduction. */
-    muln2(c0, c1, MAX_PRIME_DIFF);
-    for (int j = 0; j < LIMBS; ++j) {
-        addnextract2(c0, c1, tmp.limbs[j], this->limbs[j]);
-    }
-
-    assert(c1 == 0);
-    assert(c0 == 0 || c0 == 1);
-
-    /* Perform up to two more reductions if the internal state has already
-     * overflown the MAX of Num3072 or if it is larger than the modulus or
-     * if both are the case.
-     * */
-    if (this->IsOverflow()) this->FullReduce();
-    if (c0) this->FullReduce();
-}
-
 void Num3072::SetToOne()
 {
     this->limbs[0] = 1;
diff --git a/src/crypto/muhash.h b/src/crypto/muhash.h
index 222b866b6d..798df57372 100644
--- a/src/crypto/muhash.h
+++ b/src/crypto/muhash.h
@@ -22,14 +22,22 @@ public:
 
 #ifdef __SIZEOF_INT128__
     typedef unsigned __int128 double_limb_t;
+    typedef signed __int128 signed_double_limb_t;
     typedef uint64_t limb_t;
+    typedef int64_t signed_limb_t;
     static constexpr int LIMBS = 48;
+    static constexpr int SIGNED_LIMBS = 50;
     static constexpr int LIMB_SIZE = 64;
+    static constexpr int SIGNED_LIMB_SIZE = 62;
 #else
     typedef uint64_t double_limb_t;
+    typedef int64_t signed_double_limb_t;
     typedef uint32_t limb_t;
+    typedef int32_t signed_limb_t;
     static constexpr int LIMBS = 96;
+    static constexpr int SIGNED_LIMBS = 103;
     static constexpr int LIMB_SIZE = 32;
+    static constexpr int SIGNED_LIMB_SIZE = 30;
 #endif
     limb_t limbs[LIMBS];
 
@@ -37,6 +45,8 @@ public:
     static_assert(LIMB_SIZE * LIMBS == 3072, "Num3072 isn't 3072 bits");
     static_assert(sizeof(double_limb_t) == sizeof(limb_t) * 2, "bad size for double_limb_t");
     static_assert(sizeof(limb_t) * 8 == LIMB_SIZE, "LIMB_SIZE is incorrect");
+    static_assert(SIGNED_LIMB_SIZE * SIGNED_LIMBS > 3072, "SIGNED_LIMBS * SIGNED_LIMB_SIZE is too small");
+    static_assert(3072 / SIGNED_LIMB_SIZE == SIGNED_LIMBS - 1, "Bit 3072 must land in top signed limb");
 
     // Hard coded values in MuHash3072 constructor and Finalize
     static_assert(sizeof(limb_t) == 4 || sizeof(limb_t) == 8, "bad size for limb_t");
@@ -44,7 +54,6 @@ public:
     void Multiply(const Num3072& a);
     void Divide(const Num3072& a);
     void SetToOne();
-    void Square();
     void ToBytes(unsigned char (&out)[BYTE_SIZE]);
 
     Num3072() { this->SetToOne(); };
diff --git a/src/test/fuzz/muhash.cpp b/src/test/fuzz/muhash.cpp
index e74bcb962f..f1dd5ebaf9 100644
--- a/src/test/fuzz/muhash.cpp
+++ b/src/test/fuzz/muhash.cpp
@@ -2,13 +2,169 @@
 // Distributed under the MIT software license, see the accompanying
 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
 
+#include <arith_uint256.h>
 #include <crypto/muhash.h>
+#include <span.h>
+#include <uint256.h>
 #include <test/fuzz/FuzzedDataProvider.h>
 #include <test/fuzz/fuzz.h>
 #include <test/fuzz/util.h>
 
+#include <algorithm>
+#include <array>
 #include <vector>
 
+namespace {
+
+/** Class to represent 6144-bit numbers using arith_uint256 code.
+ *
+ * 6144 is sufficient to represent the product of two 3072-bit numbers. */
+class arith_uint6144 : public base_uint<6144> {
+public:
+    arith_uint6144(uint64_t x) : base_uint{x} {}
+
+    /** Construct an arith_uint6144 from any multiple of 4 bytes in LE notation,
+     *  up to 768 bytes. */
+    arith_uint6144(Span<const uint8_t> bytes) : base_uint{}
+    {
+        assert(bytes.size() % 4 == 0);
+        assert(bytes.size() <= 768);
+        for (unsigned i = 0; i * 4 < bytes.size(); ++i) {
+            pn[i] = ReadLE32(bytes.data() + 4 * i);
+        }
+    }
+
+    /** Serialize an arithm_uint6144 to any multiply of 4 bytes in LE notation,
+     *  on the condition that the represented number fits. */
+    void Serialize(Span<uint8_t> bytes) {
+        assert(bytes.size() % 4 == 0);
+        assert(bytes.size() <= 768);
+        for (unsigned i = 0; i * 4 < bytes.size(); ++i) {
+            WriteLE32(bytes.data() + 4 * i, pn[i]);
+        }
+        for (unsigned i = bytes.size() / 4; i * 4 < 768; ++i) {
+            assert(pn[i] == 0);
+        }
+    };
+};
+
+/** The MuHash3072 modulus (2**3072 - 1103717) as 768 LE8 bytes. */
+constexpr std::array<const uint8_t, 768> MODULUS_BYTES = {
+    155,  40, 239, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+const arith_uint6144 ZERO{0};
+const arith_uint6144 ONE{1};
+const arith_uint6144 MODULUS{MODULUS_BYTES};
+
+/** Update value to be the modulus of the input modulo MODULUS. */
+void Reduce(arith_uint6144& value)
+{
+    arith_uint6144 tmp = value;
+    tmp /= MODULUS;
+    tmp *= MODULUS;
+    value -= tmp;
+}
+
+} // namespace
+
+FUZZ_TARGET(num3072_mul)
+{
+    // Test multiplication
+    FuzzedDataProvider provider{buffer.data(), buffer.size()};
+
+    // Read two 3072-bit numbers from fuzz input, and construct arith_uint6144
+    // and Num3072 objects with the read values.
+    uint16_t data_a_len = provider.ConsumeIntegralInRange(0, 384);
+    uint8_t data_a[384] = {0};
+    provider.ConsumeData(data_a, data_a_len);
+    arith_uint6144 a_uint{data_a};
+    Num3072 a_num{data_a};
+
+    uint8_t data_b[384] = {0};
+    provider.ConsumeData(data_b, 384);
+    arith_uint6144 b_uint{data_b};
+    Num3072 b_num{data_b};
+
+    // Multiply the first number with the second, in both representations.
+    a_num.Multiply(b_num);
+    a_uint *= b_uint;
+    Reduce(a_uint);
+
+    // Serialize both to bytes and compare.
+    uint8_t buf_num[384], buf_uint[384];
+    a_num.ToBytes(buf_num);
+    a_uint.Serialize(buf_uint);
+    assert(std::ranges::equal(buf_num, buf_uint));
+}
+
+FUZZ_TARGET(num3072_inv)
+{
+    // Test inversion
+
+    FuzzedDataProvider provider{buffer.data(), buffer.size()};
+
+    // Read a 3072-bit number from fuzz input, and construct arith_uint6144
+    // and Num3072 objects with the read values.
+    uint8_t data[384] = {0};
+    provider.ConsumeData(data, 384);
+    Num3072 num{data};
+    arith_uint6144 uint{data};
+
+    // Bail out if the number has no inverse.
+    if ((uint == ZERO) || (uint == MODULUS)) return;
+
+    // Compute the inverse of the Num3072 object.
+    Num3072 inv;
+    inv.SetToOne();
+    inv.Divide(num);
+
+    // Convert the computed inverse to arith_uint6144.
+    uint8_t buf[384];
+    inv.ToBytes(buf);
+    arith_uint6144 uint_inv{buf};
+
+    // Multiply the original and the inverse, and expect 1.
+    uint *= uint_inv;
+    Reduce(uint);
+    assert(uint == ONE);
+}
+
 FUZZ_TARGET(muhash)
 {
     FuzzedDataProvider fuzzed_data_provider{buffer.data(), buffer.size()};