Files
aya/engine/core/src/simd/simd_common.inl
2025-12-17 16:47:48 +00:00

155 lines
3.3 KiB
C++

#pragma once
// SIMD common platform independent implementation
#include "simd/simd.hpp"
namespace Aya
{
namespace simd
{
namespace details
{
AYA_SIMD_INLINE v4f inverseEstimate0Precision()
{
return splat(3e-04f);
}
AYA_SIMD_INLINE v4f inverseEstimate1Precision()
{
return splat(2e-07f);
}
AYA_SIMD_INLINE v4f inverseSqrtEstimate0Precision()
{
return splat(3.3e-05f);
}
AYA_SIMD_INLINE v4f inverseSqrtEstimate1Precision()
{
return splat(3e-07f);
}
} // namespace details
AYA_SIMD_INLINE v4f sumAcross2(v4fArg a, v4fArg b, v4fArg c)
{
return sumAcross2(a, b, c, c);
}
AYA_SIMD_INLINE v4f sumAcross3(v4fArg a, v4fArg b, v4fArg c)
{
return sumAcross3(a, b, c, c);
}
AYA_SIMD_INLINE v4f sumAcross4(v4fArg a, v4fArg b, v4fArg c)
{
return sumAcross4(a, b, c, c);
}
AYA_SIMD_INLINE v4f sumAcross2(v4fArg a, v4fArg b, v4fArg c, v4fArg d)
{
v4f a0c0a1c1 = zipLow(a, c);
v4f b0d0b1d1 = zipLow(b, d);
v4f a0b0c0d0, a1b1c1d1;
zip(a0b0c0d0, a1b1c1d1, a0c0a1c1, b0d0b1d1);
v4f sum = a0b0c0d0 + a1b1c1d1;
return sum;
}
AYA_SIMD_INLINE v4f sumAcross3(v4fArg a, v4fArg b, v4fArg c, v4fArg d)
{
v4f a0c0a1c1, a2c2xxxx;
zip(a0c0a1c1, a2c2xxxx, a, c);
v4f b0d0b1d1, b2d2xxxx;
zip(b0d0b1d1, b2d2xxxx, b, d);
v4f a0b0c0d0, a1b1c1d1;
zip(a0b0c0d0, a1b1c1d1, a0c0a1c1, b0d0b1d1);
v4f sum = a0b0c0d0 + a1b1c1d1;
v4f a2b2c2d2 = zipLow(a2c2xxxx, b2d2xxxx);
sum = sum + a2b2c2d2;
return sum;
}
AYA_SIMD_INLINE v4f sumAcross4(v4fArg a, v4fArg b, v4fArg c, v4fArg d)
{
v4f a0c0a1c1, a2c2a3c3;
zip(a0c0a1c1, a2c2a3c3, a, c);
v4f b0d0b1d1, b2d2b3d3;
zip(b0d0b1d1, b2d2b3d3, b, d);
v4f a0b0c0d0, a1b1c1d1;
zip(a0b0c0d0, a1b1c1d1, a0c0a1c1, b0d0b1d1);
v4f sum = a0b0c0d0 + a1b1c1d1;
v4f a2b2c2d2, a3b3c3d3;
zip(a2b2c2d2, a3b3c3d3, a2c2a3c3, b2d2b3d3);
sum = sum + (a2b2c2d2 + a3b3c3d3);
return sum;
}
template<class T>
AYA_SIMD_INLINE void transpose(T& a, T& b, T& c, T& d, const T& x, const T& y, const T& z, const T& w)
{
T x0z0x1z1, x2z2x3z3;
zip(x0z0x1z1, x2z2x3z3, x, z);
T y0w0y1w1, y2w2y3w3;
zip(y0w0y1w1, y2w2y3w3, y, w);
zip(a, b, x0z0x1z1, y0w0y1w1);
zip(c, d, x2z2x3z3, y2w2y3w3);
}
template<class T>
AYA_SIMD_INLINE void transpose4x3(T& x, T& y, T& z, const T& a, const T& b, const T& c, const T& d)
{
T dummy;
transpose(x, y, z, dummy, a, b, c, d);
}
template<class T>
AYA_SIMD_INLINE void transpose3x4(T& a, T& b, T& c, T& d, const T& x, const T& y, const T& z)
{
transpose(a, b, c, d, x, y, z, z);
}
template<class T>
AYA_SIMD_INLINE void transpose4x2(T& x, T& y, const T& a, const T& b, const T& c, const T& d)
{
T t0 = zipLow(a, c);
T t1 = zipLow(b, d);
zip(x, y, t0, t1);
}
template<class T>
AYA_SIMD_INLINE void transpose2x4(T& a, T& b, T& c, T& d, const T& x, const T& y)
{
zip(a, c, x, y);
b = moveHighLow(a, a);
d = moveHighLow(c, c);
}
template<class T>
AYA_SIMD_INLINE T gatherX(const T& a, const T& b, const T& c, const T& d)
{
T t0 = zipLow(a, c);
T t1 = zipLow(b, d);
return zipLow(t0, t1);
}
template<class T>
AYA_SIMD_INLINE T gatherX(const T& a, const T& b, const T& c)
{
T t0 = zipLow(a, c);
return zipLow(t0, b);
}
template<class T>
AYA_SIMD_INLINE T gatherX(const T& a, const T& b)
{
return zipLow(a, b);
}
} // namespace simd
} // namespace Aya