mirror of
https://github.com/zebrajr/opencv.git
synced 2026-01-15 12:15:17 +00:00
Merge pull request #28211 from pratham-mcw:fix/neon-dotprod-expand-overflow
NEON: fix dot product accumulation causing NORM test failures on Windows ARM64
This commit is contained in:
@@ -888,9 +888,10 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
|
||||
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
||||
prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
|
||||
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
|
||||
int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
|
||||
int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
|
||||
int32x4_t s0 = vaddl_s16(vget_low_s16(p0), vget_low_s16(p1));
|
||||
return v_int32x4(vaddq_s32(s0, vaddl_s16(vget_high_s16(p0), vget_high_s16(p1))));
|
||||
}
|
||||
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user