GCC/Clang compatible SIMD code ./dlib/simd/simd**_vec.h (#414)

* GCC/Clang compatible vector extension SIMD code * Minimal modifications to dlib for the simd_vec code to work, a few include changes and ifdefs * Changed tabbing to spaces * Allow type inference to binary ops on different types of same size * Added cmake option USE_AUTO_VECTOR, and fixed up preprocessor checks. It is required to build with gcc/clang auto vectorization * Changed to intrinsic version due to poor auto vectorization results. The simd8*_vec are just copies of the C code right now. * Removed _vec variants, added to existing defines. simd_check.h back in place and removed from dlib/simd.h

GCC/Clang compatible SIMD code ./dlib/simd/simd**_vec.h (#414)
* GCC/Clang compatible vector extension SIMD code * Minimal modifications to dlib for the simd_vec code to work, a few include changes and ifdefs * Changed tabbing to spaces * Allow type inference to binary ops on different types of same size * Added cmake option USE_AUTO_VECTOR, and fixed up preprocessor checks. It is required to build with gcc/clang auto vectorization * Changed to intrinsic version due to poor auto vectorization results. The simd8*_vec are just copies of the C code right now. * Removed _vec variants, added to existing defines. simd_check.h back in place and removed from dlib/simd.h
60092335 · David Miller · Davis E. King · f6ece5d2 · 60092335 · 60092335
Commit 60092335 authored Feb 01, 2017 by David Miller Committed by Davis E. King Feb 01, 2017
6 changed files
--- a/dlib/cmake
+++ b/dlib/cmake
@@ -32,9 +32,10 @@ endif()
 set(gcc_like_compilers GNU Clang  Intel)
 set(intel_archs x86_64 i386 i686)
 # Setup some options to allow a user to enable SSE and AVX instruction use.  
 if ((";${gcc_like_compilers};" MATCHES ";${CMAKE_CXX_COMPILER_ID};")  AND
-    (";${intel_archs};"        MATCHES ";${CMAKE_SYSTEM_PROCESSOR};"))
+    (";${intel_archs};"        MATCHES ";${CMAKE_SYSTEM_PROCESSOR};") AND NOT USE_AUTO_VECTOR)
    option(USE_SSE2_INSTRUCTIONS "Compile your program with SSE2 instructions" OFF)
    option(USE_SSE4_INSTRUCTIONS "Compile your program with SSE4 instructions" OFF)
    option(USE_AVX_INSTRUCTIONS  "Compile your program with AVX instructions"  OFF)

--- a/dlib/image_transforms/fhog.h
+++ b/dlib/image_transforms/fhog.h
@@ -11,8 +11,7 @@
 #include "assign_image.h"
 #include "draw.h"
 #include "interpolation.h"
-#include "../simd/simd4i.h"
+#include "../simd.h"
-#include "../simd/simd4f.h"
 namespace dlib
 {

--- a/dlib/simd/simd4f.h
+++ b/dlib/simd/simd4f.h
@@ -83,6 +83,58 @@ namespace dlib
    private:
        __m128 x;
    };
+#elif defined(DLIB_HAVE_VSX)
+    class simd4f  // 4-lane float vector used when DLIB_HAVE_VSX is defined (PowerPC vector intrinsics)
+    {    
+        typedef union {  // one storage, two views: a whole vector register (v) or four scalar lanes (x)
+            vector float v;
+            float x[4];
+        } v4f;
+        v4f x;
+    public:
+        inline simd4f() : x{0,0,0,0} {}  // default: every lane is 0
+        inline simd4f(const simd4f& v) : x(v.x) { }
+        inline simd4f(const vector float& v) : x{v} { }  // wrap a raw vector value, e.g. the result of a vec_* intrinsic
+        inline simd4f(const simd4i& v) {  // int -> float conversion, done lane by lane through the scalar view
+            x.x[0]=v[0]; x.x[1]=v[1]; x.x[2]=v[2]; x.x[3]=v[3];
+        }
+        inline simd4f(float f) : x{f,f,f,f} { }  // broadcast f into all four lanes
+        inline simd4f(float r0, float r1, float r2, float r3)
+             : x{r0,r1,r2,r3} { }
+        inline simd4f& operator=(const simd4f& v) { x = v.x; return *this; }
+        inline simd4f& operator=(const float& v) { *this = simd4f(v); return *this; }  // assigning a scalar broadcasts it
+        inline vector float operator() () const { return x.v; }  // raw vector view, for passing to vec_* intrinsics
+        inline float operator[](unsigned int idx) const { return x.x[idx]; }  // read one lane; idx is not range-checked here
+        inline void load_aligned(const float* ptr)  { x.v = vec_ld(0, ptr); }  // NOTE(review): vec_ld presumably needs a 16-byte aligned ptr - confirm
+        inline void store_aligned(float* ptr) const { vec_st(x.v, 0, ptr); }  // NOTE(review): same alignment assumption as load_aligned
+        inline void load(const float* ptr) { x.v = vec_vsx_ld(0, ptr); }  // uses the VSX load intrinsic instead of vec_ld
+        inline void store(float* ptr) const { vec_vsx_st(x.v, 0, ptr); }  // uses the VSX store intrinsic instead of vec_st
+        // Conversion to simd4i::rawarray: each float lane is assigned to a 32-bit
+        // integer lane, so the fractional part is truncated.
+        inline operator simd4i::rawarray() const 
+        { 
+            simd4i::rawarray temp;
+            temp.v.x[0] = x.x[0];
+            temp.v.x[1] = x.x[1];
+            temp.v.x[2] = x.x[2];
+            temp.v.x[3] = x.x[3];
+            return temp;
+        }
+    };
+    typedef simd4i simd4f_bool;
 #else
    class simd4f
    {
@@ -190,6 +242,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_add_ps(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_add(lhs(), rhs());
 #else
        return simd4f(lhs[0]+rhs[0],
                      lhs[1]+rhs[1],
@@ -206,6 +260,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_sub_ps(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_sub(lhs(), rhs());
 #else
        return simd4f(lhs[0]-rhs[0],
                      lhs[1]-rhs[1],
@@ -222,6 +278,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_mul_ps(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_mul(lhs(), rhs());
 #else
        return simd4f(lhs[0]*rhs[0],
                      lhs[1]*rhs[1],
@@ -238,6 +296,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_div_ps(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_div(lhs(), rhs());
 #else
        return simd4f(lhs[0]/rhs[0],
                      lhs[1]/rhs[1],
@@ -254,6 +314,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_cmpeq_ps(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_cmpeq(lhs(), rhs());
 #else
        return simd4f_bool(lhs[0]==rhs[0],
                           lhs[1]==rhs[1],
@@ -268,6 +330,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_cmpneq_ps(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return ~(lhs==rhs);     // simd4f_bool is simd4i typedef, can use ~
 #else
        return simd4f_bool(lhs[0]!=rhs[0],
                           lhs[1]!=rhs[1],
@@ -282,6 +346,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_cmplt_ps(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_cmplt(lhs(), rhs());
 #else
        return simd4f_bool(lhs[0]<rhs[0],
                           lhs[1]<rhs[1],
@@ -303,6 +369,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_cmple_ps(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_cmple(lhs(), rhs());
 #else
        return simd4f_bool(lhs[0]<=rhs[0],
                           lhs[1]<=rhs[1],
@@ -324,6 +392,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_min_ps(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_min(lhs(), rhs());
 #else
        return simd4f(std::min(lhs[0],rhs[0]),
                      std::min(lhs[1],rhs[1]),
@@ -338,6 +408,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_max_ps(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_max(lhs(), rhs());
 #else
        return simd4f(std::max(lhs[0],rhs[0]),
                      std::max(lhs[1],rhs[1]),
@@ -352,6 +424,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_rcp_ps(item); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_re(item());
 #else
        return simd4f(1.0f/item[0],
                      1.0f/item[1],
@@ -366,6 +440,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_rsqrt_ps(item); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_rsqrt(item());
 #else
        return simd4f(1.0f/std::sqrt(item[0]),
                      1.0f/std::sqrt(item[1]),
@@ -410,6 +486,8 @@ namespace dlib
    {
 #ifdef DLIB_HAVE_SSE2
        return _mm_sqrt_ps(item);
+#elif defined(DLIB_HAVE_VSX)
+        return vec_sqrt(item());
 #else
        return simd4f(std::sqrt(item[0]),
                      std::sqrt(item[1]),
@@ -434,6 +512,8 @@ namespace dlib
        simd4f temp2;
        temp2.load(temp);
        return temp2;
+#elif defined(DLIB_HAVE_VSX)
+        return vec_ceil(item());
 #else
        return simd4f(std::ceil(item[0]),
                      std::ceil(item[1]),
@@ -458,6 +538,8 @@ namespace dlib
        simd4f temp2;
        temp2.load(temp);
        return temp2;
+#elif defined(DLIB_HAVE_VSX)
+        return vec_floor(item());
 #else
        return simd4f(std::floor(item[0]),
                      std::floor(item[1]),

--- a/dlib/simd/simd4i.h
+++ b/dlib/simd/simd4i.h
@@ -44,6 +44,52 @@ namespace dlib
    private:
        __m128i x;
    };
+#elif defined(DLIB_HAVE_VSX)
+    class simd4i  // 4-lane 32-bit signed integer vector used when DLIB_HAVE_VSX is defined
+    {    
+        typedef union {  // one storage, three views: signed vector (v), boolean-mask vector (b), scalar lanes (x)
+            vector signed int v;
+            vector bool int b;
+            signed int x[4];
+        } v4i;
+        v4i x;
+    public:
+        inline simd4i() : x{0,0,0,0} { }  // default: every lane is 0
+        inline simd4i(const simd4i& v) : x(v.x) { }
+        inline simd4i(const vector int& v) : x{v} { }  // wrap a raw integer vector value
+        inline simd4i(const vector bool int& b) { x.b=b; }  // wrap a boolean-mask vector by writing it through the b view
+        inline simd4i(int32 f) : x{f,f,f,f} { }  // broadcast f into all four lanes
+        inline simd4i(int32 r0, int32 r1, int32 r2, int32 r3)
+             : x{r0,r1,r2,r3} { }		
+        inline simd4i& operator=(const simd4i& v) { x = v.x; return *this; }
+        inline simd4i& operator=(const int32& v) { *this = simd4i(v); return *this; }  // assigning a scalar broadcasts it
+        inline vector signed int operator() () const { return x.v; }  // raw signed vector view, for vec_* intrinsics
+        inline int32 operator[](unsigned int idx) const { return x.x[idx]; }  // read one lane; idx is not range-checked here
+        inline vector bool int to_bool() const { return x.b; }  // same bits, typed as a boolean-mask vector
+        // the intrinsics below now seem to use xxpermdi automatically
+        inline void load_aligned(const int32* ptr)  { x.v = vec_ld(0, ptr); }  // NOTE(review): vec_ld presumably needs a 16-byte aligned ptr - confirm
+        inline void store_aligned(int32* ptr) const { vec_st(x.v, 0, ptr); }  // NOTE(review): same alignment assumption as load_aligned
+        inline void load(const int32* ptr) { x.v = vec_vsx_ld(0, ptr); }  // uses the VSX load intrinsic instead of vec_ld
+        inline void store(int32* ptr) const { vec_vsx_st(x.v, 0, ptr); }  // uses the VSX store intrinsic instead of vec_st
+        struct rawarray  // plain wrapper around the lane union; a simd4i can be built from it (constructor below)
+        {
+            v4i v;
+        };
+        inline simd4i(const rawarray& a) : x{a.v} { }  // adopt the lanes carried by a rawarray
+    };
 #else
    class simd4i
@@ -117,6 +163,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_add_epi32(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_add(lhs(), rhs());
 #else
        return simd4i(lhs[0]+rhs[0],
                      lhs[1]+rhs[1],
@@ -133,6 +181,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_sub_epi32(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_sub(lhs(), rhs());
 #else
        return simd4i(lhs[0]-rhs[0],
                      lhs[1]-rhs[1],
@@ -156,6 +206,10 @@ namespace dlib
                      _lhs[1]*_rhs[1],
                      _lhs[2]*_rhs[2],
                      _lhs[3]*_rhs[3]);
+#elif defined(DLIB_HAVE_VSX)
+        vector int a = lhs(), b = rhs();
+        asm("vmuluwm %0, %0, %1\n\t" : "+&v" (a) : "v" (b) );
+        return simd4i(a);
 #else
        return simd4i(lhs[0]*rhs[0],
                      lhs[1]*rhs[1],
@@ -172,6 +226,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_and_si128(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_and(lhs(), rhs());
 #else
        return simd4i(lhs[0]&rhs[0],
                      lhs[1]&rhs[1],
@@ -188,6 +244,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_or_si128(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_or(lhs(), rhs());
 #else
        return simd4i(lhs[0]|rhs[0],
                      lhs[1]|rhs[1],
@@ -204,6 +262,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_xor_si128(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_xor(lhs(), rhs());
 #else
        return simd4i(lhs[0]^rhs[0],
                      lhs[1]^rhs[1],
@@ -220,6 +280,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_xor_si128(lhs, _mm_set1_epi32(0xFFFFFFFF)); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_xor(lhs(), vec_splats(~0));
 #else
        return simd4i(~lhs[0],
                      ~lhs[1],
@@ -234,6 +296,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_sll_epi32(lhs,_mm_cvtsi32_si128(rhs));
+#elif defined(DLIB_HAVE_VSX)
+        return vec_sl(lhs(), vec_splats((uint32_t)rhs));         
 #else
        return simd4i(lhs[0]<<rhs,
                      lhs[1]<<rhs,
@@ -250,6 +314,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_sra_epi32(lhs,_mm_cvtsi32_si128(rhs));
+#elif defined(DLIB_HAVE_VSX)
+        return vec_sra(lhs(), vec_splats((uint32_t)rhs)); // arithmetic (sign-propagating) shift, matching _mm_sra_epi32 and the scalar >> fallback; vec_sr would zero-fill negative lanes
 #else
        return simd4i(lhs[0]>>rhs,
                      lhs[1]>>rhs,
@@ -266,6 +332,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_cmpeq_epi32(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_cmpeq(lhs(), rhs());
 #else
        return simd4i(lhs[0]==rhs[0] ? 0xFFFFFFFF : 0,
                      lhs[1]==rhs[1] ? 0xFFFFFFFF : 0,
@@ -278,7 +346,7 @@ namespace dlib
    inline simd4i operator!= (const simd4i& lhs, const simd4i& rhs) 
    { 
-#ifdef DLIB_HAVE_SSE2
+#if defined(DLIB_HAVE_SSE2) || defined(DLIB_HAVE_VSX)
        return ~(lhs==rhs);
 #else
        return simd4i(lhs[0]!=rhs[0] ? 0xFFFFFFFF : 0,
@@ -294,6 +362,8 @@ namespace dlib
    { 
 #ifdef DLIB_HAVE_SSE2
        return _mm_cmplt_epi32(lhs, rhs); 
+#elif defined(DLIB_HAVE_VSX)
+        return vec_cmplt(lhs(), rhs());
 #else
        return simd4i(lhs[0]<rhs[0] ? 0xFFFFFFFF : 0,
                      lhs[1]<rhs[1] ? 0xFFFFFFFF : 0,
@@ -343,6 +413,8 @@ namespace dlib
                      std::min(_lhs[1],_rhs[1]),
                      std::min(_lhs[2],_rhs[2]),
                      std::min(_lhs[3],_rhs[3]));
+#elif defined(DLIB_HAVE_VSX)
+        return vec_min(lhs(), rhs());
 #else
        return simd4i(std::min(lhs[0],rhs[0]),
                      std::min(lhs[1],rhs[1]),
@@ -364,6 +436,8 @@ namespace dlib
                      std::max(_lhs[1],_rhs[1]),
                      std::max(_lhs[2],_rhs[2]),
                      std::max(_lhs[3],_rhs[3]));
+#elif defined(DLIB_HAVE_VSX)
+        return vec_max(lhs(), rhs());
 #else
        return simd4i(std::max(lhs[0],rhs[0]),
                      std::max(lhs[1],rhs[1]),
@@ -398,6 +472,8 @@ namespace dlib
        return _mm_blendv_epi8(b,a,cmp);
 #elif defined(DLIB_HAVE_SSE2)
        return ((cmp&a) | _mm_andnot_si128(cmp,b));
+#elif defined(DLIB_HAVE_VSX)
+        return vec_sel(b(), a(), cmp.to_bool());
 #else
        return ((cmp&a) | (~cmp&b));
 #endif

--- a/dlib/simd/simd8f.h
+++ b/dlib/simd/simd8f.h
@@ -7,7 +7,6 @@
 #include "simd4f.h"
 #include "simd8i.h"
 namespace dlib
 {
 #ifdef DLIB_HAVE_AVX

--- a/dlib/simd/simd_check.h
+++ b/dlib/simd/simd_check.h
@@ -51,12 +51,33 @@
                #define DLIB_HAVE_AVX2
            #endif
        #endif
+        #ifdef __ALTIVEC__
+            #ifndef DLIB_HAVE_ALTIVEC
+                #define DLIB_HAVE_ALTIVEC
+            #endif
+        #endif
+        #ifdef __VSX__
+            #ifndef DLIB_HAVE_VSX
+                #define DLIB_HAVE_VSX
+            #endif
+        #endif
+        #ifdef __VEC__ // __VEC__ = 10206
+            #ifndef DLIB_HAVE_POWER_VEC	// vector and vec_ intrinsics
+                #define DLIB_HAVE_POWER_VEC
+            #endif
+        #endif
    #endif
 #endif
 // ----------------------------------------------------------------------------------------
+#ifdef DLIB_HAVE_ALTIVEC
+#include <altivec.h>
+#endif
 #ifdef DLIB_HAVE_SSE2
    #include <xmmintrin.h>
    #include <emmintrin.h>
@@ -77,6 +98,8 @@
 //    #include <avx2intrin.h>
 #endif
 #endif // DLIB_SIMd_CHECK_Hh_