diff --git a/ethosu/regor/common/scaling.cpp b/ethosu/regor/common/scaling.cpp index 1e5d4d0f0da6c4f363ecda5097eaedb789752d4b..1538a5bc64c9828db9299446b4dffbb7a5e433f1 100644 --- a/ethosu/regor/common/scaling.cpp +++ b/ethosu/regor/common/scaling.cpp @@ -36,13 +36,17 @@ bool QuantizedScale::operator!=(const QuantizedScale &other) const QuantizedScale::QuantizedScale(double scale_, bool reduced) { int exponent = 0; - int leftShift = reduced ? 15 : 31; double significand = std::frexp(scale_, &exponent); // convert from left to right-shift - scale = int32_t(std::round(significand * double(1LL << leftShift))); - // make sure reduced scale does not overflow - if ( reduced ) scale = ClampToType(scale); - shift = leftShift - exponent; + scale = int32_t(std::round(significand * double(1LL << 31))); + shift = 31 - exponent; + if ( reduced ) + { + scale = (scale >> 16) + (scale >> 15 & 1); + // make sure reduced scale does not overflow + scale = std::min(scale, 0x7FFF); + shift -= 16; + } // if shift is out of bounds [0,63], try to get back within bounds if ( shift > 63 ) {