ARM NEON SIMD support (#564)

* added neon functions * TK1 tests passed * code cleanup * Re-tested on TK1 * improve simd4i * fixed simd4f_bool neon * fix simd8i sse operator <= * restored rsqrt/VSX * fixed simd4f/neon/reciprocal_sqrt

ARM NEON SIMD support (#564)
* added neon functions * TK1 tests passed * code cleanup * Re-tested on TK1 * improve simd4i * fixed simd4f_bool neon * fix simd8i sse operator <= * restored rsqrt/VSX * fixed simd4f/neon/reciprocal_sqrt
5f0c1a1c · Evgeniy Fominov · Davis E. King · 63f4f73b · 5f0c1a1c · 5f0c1a1c
Commit 5f0c1a1c authored Apr 29, 2017 by Evgeniy Fominov Committed by Davis E. King Apr 29, 2017
Show whitespace changes
Inline Side-by-side

Showing with 204 additions and 6 deletions

simd4f.h dlib/simd/simd4f.h +115 -3

simd4i.h dlib/simd/simd4i.h +81 -1

simd_check.h dlib/simd/simd_check.h +8 -2

No files found.
--- a/dlib/simd/simd4f.h
+++ b/dlib/simd/simd4f.h
@@ -135,6 +135,65 @@ namespace dlib

    typedef simd4i simd4f_bool;

+#elif defined(DLIB_HAVE_NEON)
+
+    class simd4f
+    {
+    public:
+        typedef float type;
+
+        inline simd4f() {}
+        inline simd4f(float f) { x = vdupq_n_f32(f); }
+        inline simd4f(float r0, float r1, float r2, float r3)
+        {
+            float __attribute__ ((aligned (16))) data[4] = { r0, r1, r2, r3 };
+            x = vld1q_f32(data);
+        }
+        inline simd4f(const float32x4_t& val):x(val) {}
+        inline simd4f(const simd4i& val):x(vcvtq_f32_s32(val)) {}
+
+        inline simd4f& operator=(const simd4i& val)
+        {
+            x = simd4f(val);
+            return *this;
+        }
+
+        inline simd4f& operator=(const float& val)
+        {
+            x = simd4f(val);
+            return *this;
+        }
+
+        inline simd4f& operator=(const float32x4_t& val)
+        {
+            x = val;
+            return *this;
+        }
+
+        inline operator float32x4_t() const { return x; }
+
+        // truncate to 32bit integers
+        inline operator int32x4_t() const { return vcvtq_s32_f32(x); }
+
+        inline void load_aligned(const type* ptr)  { x = vld1q_f32(ptr); }
+        inline void store_aligned(type* ptr) const { vst1q_f32(ptr, x); }
+        inline void load(const type* ptr)          { x = vld1q_f32(ptr); }
+        inline void store(type* ptr)         const { vst1q_f32(ptr, x); }
+
+        inline unsigned int size() const { return 4; }
+        inline float operator[](unsigned int idx) const
+        {
+            float temp[4];
+            store(temp);
+            return temp[idx];
+        }
+
+    private:
+        float32x4_t x;
+    };
+
+
+    typedef simd4i simd4f_bool;
 #else
    class simd4f
    {
@@ -212,6 +271,7 @@ namespace dlib
        float x[4];
    };

+
    class simd4f_bool
    {
    public:
@@ -224,6 +284,7 @@ namespace dlib
    private:
        bool x[4];
    };
+
 #endif

 // ----------------------------------------------------------------------------------------
@@ -244,6 +305,8 @@ namespace dlib
        return _mm_add_ps(lhs, rhs); 
 #elif defined(DLIB_HAVE_VSX)
        return vec_add(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vaddq_f32(lhs, rhs);
 #else
        return simd4f(lhs[0]+rhs[0],
                      lhs[1]+rhs[1],
@@ -262,6 +325,8 @@ namespace dlib
        return _mm_sub_ps(lhs, rhs); 
 #elif defined(DLIB_HAVE_VSX)
        return vec_sub(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vsubq_f32(lhs, rhs);
 #else
        return simd4f(lhs[0]-rhs[0],
                      lhs[1]-rhs[1],
@@ -280,6 +345,8 @@ namespace dlib
        return _mm_mul_ps(lhs, rhs); 
 #elif defined(DLIB_HAVE_VSX)
        return vec_mul(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vmulq_f32(lhs, rhs);
 #else
        return simd4f(lhs[0]*rhs[0],
                      lhs[1]*rhs[1],
@@ -298,6 +365,12 @@ namespace dlib
        return _mm_div_ps(lhs, rhs); 
 #elif defined(DLIB_HAVE_VSX)
        return vec_div(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        float32x4_t reciprocal = vrecpeq_f32(rhs);
+        reciprocal = vmulq_f32(vrecpsq_f32(rhs, reciprocal), reciprocal);
+        reciprocal = vmulq_f32(vrecpsq_f32(rhs, reciprocal), reciprocal);
+        float32x4_t result = vmulq_f32(lhs,reciprocal);
+        return result;
 #else
        return simd4f(lhs[0]/rhs[0],
                      lhs[1]/rhs[1],
@@ -316,6 +389,8 @@ namespace dlib
        return _mm_cmpeq_ps(lhs, rhs);
 #elif defined(DLIB_HAVE_VSX)
        return vec_cmpeq(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vceqq_f32(lhs, rhs);
 #else
        return simd4f_bool(lhs[0]==rhs[0],
                           lhs[1]==rhs[1],
@@ -330,7 +405,7 @@ namespace dlib
    {
 #ifdef DLIB_HAVE_SSE2
        return _mm_cmpneq_ps(lhs, rhs);
-#elif defined(DLIB_HAVE_VSX)
+#elif defined(DLIB_HAVE_VSX) || defined(DLIB_HAVE_NEON)
        return ~(lhs==rhs);     // simd4f_bool is simd4i typedef, can use ~
 #else
        return simd4f_bool(lhs[0]!=rhs[0],
@@ -348,6 +423,8 @@ namespace dlib
        return _mm_cmplt_ps(lhs, rhs);
 #elif defined(DLIB_HAVE_VSX)
        return vec_cmplt(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vcltq_f32(lhs, rhs);
 #else
        return simd4f_bool(lhs[0]<rhs[0],
                           lhs[1]<rhs[1],
@@ -371,6 +448,8 @@ namespace dlib
        return _mm_cmple_ps(lhs, rhs);
 #elif defined(DLIB_HAVE_VSX)
        return vec_cmple(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vcleq_f32(lhs, rhs);
 #else
        return simd4f_bool(lhs[0]<=rhs[0],
                           lhs[1]<=rhs[1],
@@ -394,6 +473,8 @@ namespace dlib
        return _mm_min_ps(lhs, rhs);
 #elif defined(DLIB_HAVE_VSX)
        return vec_min(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vminq_f32(lhs, rhs);
 #else
        return simd4f(std::min(lhs[0],rhs[0]),
                      std::min(lhs[1],rhs[1]),
@@ -410,6 +491,8 @@ namespace dlib
        return _mm_max_ps(lhs, rhs);
 #elif defined(DLIB_HAVE_VSX)
        return vec_max(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vmaxq_f32(lhs, rhs);
 #else
        return simd4f(std::max(lhs[0],rhs[0]),
                      std::max(lhs[1],rhs[1]),
@@ -426,6 +509,11 @@ namespace dlib
        return _mm_rcp_ps(item);
 #elif defined(DLIB_HAVE_VSX)
        return vec_re(item());
+#elif defined(DLIB_HAVE_NEON)
+        float32x4_t estimate  = vrecpeq_f32(item);
+        estimate  = vmulq_f32(vrecpsq_f32(estimate , item), estimate );
+        estimate  = vmulq_f32(vrecpsq_f32(estimate , item), estimate );
+        return estimate ;
 #else
        return simd4f(1.0f/item[0],
                      1.0f/item[1],
@@ -442,6 +530,11 @@ namespace dlib
        return _mm_rsqrt_ps(item);
 #elif defined(DLIB_HAVE_VSX)
        return vec_rsqrt(item());
+#elif defined(DLIB_HAVE_NEON)
+        float32x4_t estimate = vrsqrteq_f32(item);
+        simd4f estimate2 = vmulq_f32(estimate, item);
+        estimate = vmulq_f32(estimate, vrsqrtsq_f32(estimate2, estimate));
+        return estimate;
 #else
        return simd4f(1.0f/std::sqrt(item[0]),
                      1.0f/std::sqrt(item[1]),
@@ -464,6 +557,9 @@ namespace dlib
        simd4f temp = _mm_add_ps(item,_mm_movehl_ps(item,item));
        simd4f temp2 = _mm_shuffle_ps(temp,temp,1);
        return _mm_cvtss_f32(_mm_add_ss(temp,temp2));
+#elif defined(DLIB_HAVE_NEON)
+        float32x2_t r = vadd_f32(vget_high_f32(item), vget_low_f32(item));
+        return vget_lane_f32(vpadd_f32(r, r), 0);
 #else
        return item[0]+item[1]+item[2]+item[3];
 #endif
@@ -488,6 +584,20 @@ namespace dlib
        return _mm_sqrt_ps(item);
 #elif defined(DLIB_HAVE_VSX)
        return vec_sqrt(item());
+#elif defined(DLIB_HAVE_NEON)
+        float32x4_t q_step_0 = vrsqrteq_f32(item);
+        float32x4_t q_step_parm0 = vmulq_f32(item, q_step_0);
+        float32x4_t q_step_result0 = vrsqrtsq_f32(q_step_parm0, q_step_0);
+        float32x4_t q_step_1 = vmulq_f32(q_step_0, q_step_result0);
+        float32x4_t q_step_parm1 = vmulq_f32(item, q_step_1);
+        float32x4_t q_step_result1 = vrsqrtsq_f32(q_step_parm1, q_step_1);
+        float32x4_t q_step_2 = vmulq_f32(q_step_1, q_step_result1);
+        float32x4_t res3 = vmulq_f32(item, q_step_2);
+
+        // normalize sqrt(0)=0
+        uint32x4_t zcomp = vceqq_f32(vdupq_n_f32(0), item);
+        float32x4_t rcorr = vbslq_f32(zcomp, item, res3);
+        return rcorr;
 #else
        return simd4f(std::sqrt(item[0]),
                      std::sqrt(item[1]),
@@ -502,7 +612,7 @@ namespace dlib
    {
 #ifdef DLIB_HAVE_SSE41
        return _mm_ceil_ps(item);
-#elif defined(DLIB_HAVE_SSE2)
+#elif defined(DLIB_HAVE_SSE2)  || defined(DLIB_HAVE_NEON)
        float temp[4];
        item.store(temp);
        temp[0] = std::ceil(temp[0]);
@@ -528,7 +638,7 @@ namespace dlib
    {
 #ifdef DLIB_HAVE_SSE41
        return _mm_floor_ps(item);
-#elif defined(DLIB_HAVE_SSE2)
+#elif defined(DLIB_HAVE_SSE2) || defined(DLIB_HAVE_NEON)
        float temp[4];
        item.store(temp);
        temp[0] = std::floor(temp[0]);
@@ -557,6 +667,8 @@ namespace dlib
        return _mm_blendv_ps(b,a,cmp);
 #elif defined(DLIB_HAVE_SSE2)
        return _mm_or_ps(_mm_and_ps(cmp,a) , _mm_andnot_ps(cmp,b));
+#elif defined(DLIB_HAVE_NEON)
+        return vbslq_f32(cmp, a, b);
 #else
        return simd4f(cmp[0]?a[0]:b[0],
                      cmp[1]?a[1]:b[1],

--- a/dlib/simd/simd4i.h
+++ b/dlib/simd/simd4i.h
@@ -90,6 +90,49 @@ namespace dlib

    };

+#elif defined(DLIB_HAVE_NEON)
+
+    class simd4i
+    {
+    public:
+        typedef int32 type;
+
+        inline simd4i() {}
+        inline simd4i(int32 f) { x = vdupq_n_s32(f); }
+        inline simd4i(int32 r0, int32 r1, int32 r2, int32 r3)
+        {
+            int32 __attribute__((aligned(16))) data[4] = { r0, r1, r2, r3 };
+            x = vld1q_s32(data);
+        }
+        inline simd4i(const int32x4_t& val):x(val) {}
+        inline simd4i(const uint32x4_t& val):x((int32x4_t)val) {}
+
+        inline simd4i& operator=(const int32x4_t& val)
+        {
+            x = val;
+            return *this;
+        }
+
+        inline operator int32x4_t() const { return x; }
+        inline operator uint32x4_t() const { return (uint32x4_t)x; }
+
+        inline void load_aligned(const type* ptr)  { x = vld1q_s32(ptr); }
+        inline void store_aligned(type* ptr) const { vst1q_s32(ptr, x); }
+        inline void load(const type* ptr)          { x = vld1q_s32(ptr); }
+        inline void store(type* ptr)         const { vst1q_s32(ptr, x); }
+
+        inline unsigned int size() const { return 4; }
+        inline int32 operator[](unsigned int idx) const
+        {
+            int32 temp[4];
+            store(temp);
+            return temp[idx];
+        }
+
+    private:
+        int32x4_t x;
+    };
+
 #else

    class simd4i
@@ -165,6 +208,8 @@ namespace dlib
        return _mm_add_epi32(lhs, rhs); 
 #elif defined(DLIB_HAVE_VSX)
        return vec_add(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vaddq_s32(lhs, rhs);
 #else
        return simd4i(lhs[0]+rhs[0],
                      lhs[1]+rhs[1],
@@ -183,6 +228,8 @@ namespace dlib
        return _mm_sub_epi32(lhs, rhs); 
 #elif defined(DLIB_HAVE_VSX)
        return vec_sub(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vsubq_s32(lhs, rhs);
 #else
        return simd4i(lhs[0]-rhs[0],
                      lhs[1]-rhs[1],
@@ -210,6 +257,8 @@ namespace dlib
        vector int a = lhs(), b = rhs();
        asm("vmuluwm %0, %0, %1\n\t" : "+&v" (a) : "v" (b) );
        return simd4i(a);
+#elif defined(DLIB_HAVE_NEON)
+        return vmulq_s32(lhs, rhs);
 #else
        return simd4i(lhs[0]*rhs[0],
                      lhs[1]*rhs[1],
@@ -228,6 +277,8 @@ namespace dlib
        return _mm_and_si128(lhs, rhs); 
 #elif defined(DLIB_HAVE_VSX)
        return vec_and(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vandq_s32(lhs, rhs);
 #else
        return simd4i(lhs[0]&rhs[0],
                      lhs[1]&rhs[1],
@@ -246,6 +297,8 @@ namespace dlib
        return _mm_or_si128(lhs, rhs); 
 #elif defined(DLIB_HAVE_VSX)
        return vec_or(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vorrq_s32(lhs, rhs);
 #else
        return simd4i(lhs[0]|rhs[0],
                      lhs[1]|rhs[1],
@@ -264,6 +317,8 @@ namespace dlib
        return _mm_xor_si128(lhs, rhs); 
 #elif defined(DLIB_HAVE_VSX)
        return vec_xor(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return veorq_s32(lhs, rhs);
 #else
        return simd4i(lhs[0]^rhs[0],
                      lhs[1]^rhs[1],
@@ -282,6 +337,8 @@ namespace dlib
        return _mm_xor_si128(lhs, _mm_set1_epi32(0xFFFFFFFF)); 
 #elif defined(DLIB_HAVE_VSX)
        return vec_xor(lhs(), vec_splats(~0));
+#elif defined(DLIB_HAVE_NEON)
+        return vmvnq_s32(lhs);
 #else
        return simd4i(~lhs[0],
                      ~lhs[1],
@@ -298,6 +355,8 @@ namespace dlib
        return _mm_sll_epi32(lhs,_mm_cvtsi32_si128(rhs));
 #elif defined(DLIB_HAVE_VSX)
        return vec_sl(lhs(), vec_splats((uint32_t)rhs));         
+#elif defined(DLIB_HAVE_NEON)
+        return vshlq_s32(lhs, simd4i(rhs));
 #else
        return simd4i(lhs[0]<<rhs,
                      lhs[1]<<rhs,
@@ -316,6 +375,12 @@ namespace dlib
        return _mm_sra_epi32(lhs,_mm_cvtsi32_si128(rhs));
 #elif defined(DLIB_HAVE_VSX)
        return vec_sr(lhs(), vec_splats((uint32_t)rhs)); 
+#elif defined(DLIB_HAVE_NEON)
+        int32 _lhs[4]; lhs.store(_lhs);
+        return simd4i(_lhs[0]>>rhs,
+                      _lhs[1]>>rhs,
+                      _lhs[2]>>rhs,
+                      _lhs[3]>>rhs);
 #else
        return simd4i(lhs[0]>>rhs,
                      lhs[1]>>rhs,
@@ -334,6 +399,8 @@ namespace dlib
        return _mm_cmpeq_epi32(lhs, rhs); 
 #elif defined(DLIB_HAVE_VSX)
        return vec_cmpeq(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return (int32x4_t)vceqq_s32(lhs,rhs);
 #else
        return simd4i(lhs[0]==rhs[0] ? 0xFFFFFFFF : 0,
                      lhs[1]==rhs[1] ? 0xFFFFFFFF : 0,
@@ -346,7 +413,7 @@ namespace dlib

    inline simd4i operator!= (const simd4i& lhs, const simd4i& rhs) 
    { 
-#if defined(DLIB_HAVE_SSE2) || defined(DLIB_HAVE_VSX)
+#if defined(DLIB_HAVE_SSE2) || defined(DLIB_HAVE_VSX) || defined(DLIB_HAVE_NEON)
        return ~(lhs==rhs);
 #else
        return simd4i(lhs[0]!=rhs[0] ? 0xFFFFFFFF : 0,
@@ -364,6 +431,8 @@ namespace dlib
        return _mm_cmplt_epi32(lhs, rhs); 
 #elif defined(DLIB_HAVE_VSX)
        return vec_cmplt(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return (int32x4_t)vcltq_s32(lhs, rhs);
 #else
        return simd4i(lhs[0]<rhs[0] ? 0xFFFFFFFF : 0,
                      lhs[1]<rhs[1] ? 0xFFFFFFFF : 0,
@@ -385,6 +454,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return ~(lhs > rhs); 
+#elif defined(DLIB_HAVE_NEON)
+        return (int32x4_t)vcleq_s32(lhs, rhs);
 #else
        return simd4i(lhs[0]<=rhs[0] ? 0xFFFFFFFF : 0,
                      lhs[1]<=rhs[1] ? 0xFFFFFFFF : 0,
@@ -415,6 +486,8 @@ namespace dlib
                      std::min(_lhs[3],_rhs[3]));
 #elif defined(DLIB_HAVE_VSX)
        return vec_min(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return (int32x4_t)vminq_s32(lhs, rhs);
 #else
        return simd4i(std::min(lhs[0],rhs[0]),
                      std::min(lhs[1],rhs[1]),
@@ -438,6 +511,8 @@ namespace dlib
                      std::max(_lhs[3],_rhs[3]));
 #elif defined(DLIB_HAVE_VSX)
        return vec_max(lhs(), rhs());
+#elif defined(DLIB_HAVE_NEON)
+        return vmaxq_s32(lhs, rhs);
 #else
        return simd4i(std::max(lhs[0],rhs[0]),
                      std::max(lhs[1],rhs[1]),
@@ -458,6 +533,9 @@ namespace dlib
        int32 temp[4];
        item.store(temp);
        return temp[0]+temp[1]+temp[2]+temp[3];
+#elif defined(DLIB_HAVE_NEON)
+        int32x2_t r = vadd_s32(vget_high_s32(item), vget_low_s32(item));
+        return vget_lane_s32(vpadd_s32(r, r), 0);
 #else
        return item[0]+item[1]+item[2]+item[3];
 #endif
@@ -474,6 +552,8 @@ namespace dlib
        return ((cmp&a) | _mm_andnot_si128(cmp,b));
 #elif defined(DLIB_HAVE_VSX)
        return vec_sel(b(), a(), cmp.to_bool());
+#elif defined(DLIB_HAVE_NEON)
+        return vbslq_s32(cmp, a, b);
 #else
        return ((cmp&a) | (~cmp&b));
 #endif

--- a/dlib/simd/simd_check.h
+++ b/dlib/simd/simd_check.h
@@ -66,7 +66,11 @@
                #define DLIB_HAVE_POWER_VEC
            #endif
        #endif
-
+        #ifdef __ARM_NEON
+            #ifndef DLIB_HAVE_NEON
+                #define DLIB_HAVE_NEON
+            #endif
+        #endif
    #endif
 #endif

@@ -97,7 +101,9 @@
    #include <immintrin.h> // AVX
 //    #include <avx2intrin.h>
 #endif
-
+#ifdef DLIB_HAVE_NEON
+    #include <arm_neon.h> // ARM NEON
+#endif


 #endif // DLIB_SIMd_CHECK_Hh_