Initial commit

2025-12-17 16:47:48 +00:00
commit 13813f3363
4964 changed files with 1079753 additions and 0 deletions
--- a/engine/core/src/simd/simd_common.inl
+++ b/engine/core/src/simd/simd_common.inl
@@ -0,0 +1,154 @@
+#pragma once
+
+// SIMD common platform independent implementation
+
+#include "simd/simd.hpp"
+
+namespace Aya
+{
+
+namespace simd
+{
+
+namespace details
+{
+AYA_SIMD_INLINE v4f inverseEstimate0Precision()
+{
+    return splat(3e-04f);
+}
+
+AYA_SIMD_INLINE v4f inverseEstimate1Precision()
+{
+    return splat(2e-07f);
+}
+
+AYA_SIMD_INLINE v4f inverseSqrtEstimate0Precision()
+{
+    return splat(3.3e-05f);
+}
+
+AYA_SIMD_INLINE v4f inverseSqrtEstimate1Precision()
+{
+    return splat(3e-07f);
+}
+} // namespace details
+
+AYA_SIMD_INLINE v4f sumAcross2(v4fArg a, v4fArg b, v4fArg c)
+{
+    return sumAcross2(a, b, c, c);
+}
+
+AYA_SIMD_INLINE v4f sumAcross3(v4fArg a, v4fArg b, v4fArg c)
+{
+    return sumAcross3(a, b, c, c);
+}
+
+AYA_SIMD_INLINE v4f sumAcross4(v4fArg a, v4fArg b, v4fArg c)
+{
+    return sumAcross4(a, b, c, c);
+}
+
+AYA_SIMD_INLINE v4f sumAcross2(v4fArg a, v4fArg b, v4fArg c, v4fArg d)
+{
+    v4f a0c0a1c1 = zipLow(a, c);
+    v4f b0d0b1d1 = zipLow(b, d);
+    v4f a0b0c0d0, a1b1c1d1;
+    zip(a0b0c0d0, a1b1c1d1, a0c0a1c1, b0d0b1d1);
+    v4f sum = a0b0c0d0 + a1b1c1d1;
+    return sum;
+}
+
+AYA_SIMD_INLINE v4f sumAcross3(v4fArg a, v4fArg b, v4fArg c, v4fArg d)
+{
+    v4f a0c0a1c1, a2c2xxxx;
+    zip(a0c0a1c1, a2c2xxxx, a, c);
+    v4f b0d0b1d1, b2d2xxxx;
+    zip(b0d0b1d1, b2d2xxxx, b, d);
+    v4f a0b0c0d0, a1b1c1d1;
+    zip(a0b0c0d0, a1b1c1d1, a0c0a1c1, b0d0b1d1);
+    v4f sum = a0b0c0d0 + a1b1c1d1;
+    v4f a2b2c2d2 = zipLow(a2c2xxxx, b2d2xxxx);
+    sum = sum + a2b2c2d2;
+    return sum;
+}
+
+AYA_SIMD_INLINE v4f sumAcross4(v4fArg a, v4fArg b, v4fArg c, v4fArg d)
+{
+    v4f a0c0a1c1, a2c2a3c3;
+    zip(a0c0a1c1, a2c2a3c3, a, c);
+    v4f b0d0b1d1, b2d2b3d3;
+    zip(b0d0b1d1, b2d2b3d3, b, d);
+    v4f a0b0c0d0, a1b1c1d1;
+    zip(a0b0c0d0, a1b1c1d1, a0c0a1c1, b0d0b1d1);
+    v4f sum = a0b0c0d0 + a1b1c1d1;
+    v4f a2b2c2d2, a3b3c3d3;
+    zip(a2b2c2d2, a3b3c3d3, a2c2a3c3, b2d2b3d3);
+    sum = sum + (a2b2c2d2 + a3b3c3d3);
+    return sum;
+}
+
+template<class T>
+AYA_SIMD_INLINE void transpose(T& a, T& b, T& c, T& d, const T& x, const T& y, const T& z, const T& w)
+{
+    T x0z0x1z1, x2z2x3z3;
+    zip(x0z0x1z1, x2z2x3z3, x, z);
+    T y0w0y1w1, y2w2y3w3;
+    zip(y0w0y1w1, y2w2y3w3, y, w);
+
+    zip(a, b, x0z0x1z1, y0w0y1w1);
+    zip(c, d, x2z2x3z3, y2w2y3w3);
+}
+
+template<class T>
+AYA_SIMD_INLINE void transpose4x3(T& x, T& y, T& z, const T& a, const T& b, const T& c, const T& d)
+{
+    T dummy;
+    transpose(x, y, z, dummy, a, b, c, d);
+}
+
+template<class T>
+AYA_SIMD_INLINE void transpose3x4(T& a, T& b, T& c, T& d, const T& x, const T& y, const T& z)
+{
+    transpose(a, b, c, d, x, y, z, z);
+}
+
+template<class T>
+AYA_SIMD_INLINE void transpose4x2(T& x, T& y, const T& a, const T& b, const T& c, const T& d)
+{
+    T t0 = zipLow(a, c);
+    T t1 = zipLow(b, d);
+    zip(x, y, t0, t1);
+}
+
+template<class T>
+AYA_SIMD_INLINE void transpose2x4(T& a, T& b, T& c, T& d, const T& x, const T& y)
+{
+    zip(a, c, x, y);
+    b = moveHighLow(a, a);
+    d = moveHighLow(c, c);
+}
+
+template<class T>
+AYA_SIMD_INLINE T gatherX(const T& a, const T& b, const T& c, const T& d)
+{
+    T t0 = zipLow(a, c);
+    T t1 = zipLow(b, d);
+    return zipLow(t0, t1);
+}
+
+template<class T>
+AYA_SIMD_INLINE T gatherX(const T& a, const T& b, const T& c)
+{
+    T t0 = zipLow(a, c);
+    return zipLow(t0, b);
+}
+
+template<class T>
+AYA_SIMD_INLINE T gatherX(const T& a, const T& b)
+{
+    return zipLow(a, b);
+}
+
+} // namespace simd
+
+} // namespace Aya