Commit 60092335, authored by David Miller and committed by Davis E. King

GCC/Clang compatible SIMD code ./dlib/simd/simd**_vec.h (#414)

* GCC/Clang compatible vector extension SIMD code

* Minimal modifications to dlib for the simd_vec code to work, a few include changes and ifdefs

* Changed tabbing to spaces

* Allow type inference to binary ops on different types of same size

* Added cmake option USE_AUTO_VECTOR, and fixed up preprocessor checks.
It is required to build with gcc/clang auto vectorization

* Changed to intrinsic version due to poor auto vectorization results.
The simd8*_vec are just copies of the C code right now.

* Removed _vec variants, added to existing defines. simd_check.h back in place and removed from dlib/simd.h
parent f6ece5d2
......@@ -32,9 +32,10 @@ endif()
set(gcc_like_compilers GNU Clang Intel)
set(intel_archs x86_64 i386 i686)
# Setup some options to allow a user to enable SSE and AVX instruction use.
if ((";${gcc_like_compilers};" MATCHES ";${CMAKE_CXX_COMPILER_ID};") AND
(";${intel_archs};" MATCHES ";${CMAKE_SYSTEM_PROCESSOR};"))
if ((";${gcc_like_compilers};" MATCHES ";${CMAKE_CXX_COMPILER_ID};") AND
(";${intel_archs};" MATCHES ";${CMAKE_SYSTEM_PROCESSOR};") AND NOT USE_AUTO_VECTOR)
option(USE_SSE2_INSTRUCTIONS "Compile your program with SSE2 instructions" OFF)
option(USE_SSE4_INSTRUCTIONS "Compile your program with SSE4 instructions" OFF)
option(USE_AVX_INSTRUCTIONS "Compile your program with AVX instructions" OFF)
......
......@@ -11,8 +11,7 @@
#include "assign_image.h"
#include "draw.h"
#include "interpolation.h"
#include "../simd/simd4i.h"
#include "../simd/simd4f.h"
#include "../simd.h"
namespace dlib
{
......
......@@ -83,6 +83,58 @@ namespace dlib
private:
__m128 x;
};
#elif defined(DLIB_HAVE_VSX)
class simd4f
{
typedef union {
vector float v;
float x[4];
} v4f;
v4f x;
public:
inline simd4f() : x{0,0,0,0} {}
inline simd4f(const simd4f& v) : x(v.x) { }
inline simd4f(const vector float& v) : x{v} { }
inline simd4f(const simd4i& v) {
x.x[0]=v[0]; x.x[1]=v[1]; x.x[2]=v[2]; x.x[3]=v[3];
}
inline simd4f(float f) : x{f,f,f,f} { }
inline simd4f(float r0, float r1, float r2, float r3)
: x{r0,r1,r2,r3} { }
inline simd4f& operator=(const simd4f& v) { x = v.x; return *this; }
inline simd4f& operator=(const float& v) { *this = simd4f(v); return *this; }
inline vector float operator() () const { return x.v; }
inline float operator[](unsigned int idx) const { return x.x[idx]; }
inline void load_aligned(const float* ptr) { x.v = vec_ld(0, ptr); }
inline void store_aligned(float* ptr) const { vec_st(x.v, 0, ptr); }
inline void load(const float* ptr) { x.v = vec_vsx_ld(0, ptr); }
inline void store(float* ptr) const { vec_vsx_st(x.v, 0, ptr); }
// truncate to 32bit integers
inline operator simd4i::rawarray() const
{
simd4i::rawarray temp;
temp.v.x[0] = x.x[0];
temp.v.x[1] = x.x[1];
temp.v.x[2] = x.x[2];
temp.v.x[3] = x.x[3];
return temp;
}
};
typedef simd4i simd4f_bool;
#else
class simd4f
{
......@@ -190,6 +242,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_add_ps(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_add(lhs(), rhs());
#else
return simd4f(lhs[0]+rhs[0],
lhs[1]+rhs[1],
......@@ -206,6 +260,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_sub_ps(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_sub(lhs(), rhs());
#else
return simd4f(lhs[0]-rhs[0],
lhs[1]-rhs[1],
......@@ -222,6 +278,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_mul_ps(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_mul(lhs(), rhs());
#else
return simd4f(lhs[0]*rhs[0],
lhs[1]*rhs[1],
......@@ -238,6 +296,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_div_ps(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_div(lhs(), rhs());
#else
return simd4f(lhs[0]/rhs[0],
lhs[1]/rhs[1],
......@@ -254,6 +314,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_cmpeq_ps(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_cmpeq(lhs(), rhs());
#else
return simd4f_bool(lhs[0]==rhs[0],
lhs[1]==rhs[1],
......@@ -268,6 +330,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_cmpneq_ps(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return ~(lhs==rhs); // simd4f_bool is simd4i typedef, can use ~
#else
return simd4f_bool(lhs[0]!=rhs[0],
lhs[1]!=rhs[1],
......@@ -282,6 +346,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_cmplt_ps(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_cmplt(lhs(), rhs());
#else
return simd4f_bool(lhs[0]<rhs[0],
lhs[1]<rhs[1],
......@@ -303,6 +369,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_cmple_ps(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_cmple(lhs(), rhs());
#else
return simd4f_bool(lhs[0]<=rhs[0],
lhs[1]<=rhs[1],
......@@ -324,6 +392,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_min_ps(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_min(lhs(), rhs());
#else
return simd4f(std::min(lhs[0],rhs[0]),
std::min(lhs[1],rhs[1]),
......@@ -338,6 +408,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_max_ps(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_max(lhs(), rhs());
#else
return simd4f(std::max(lhs[0],rhs[0]),
std::max(lhs[1],rhs[1]),
......@@ -352,6 +424,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_rcp_ps(item);
#elif defined(DLIB_HAVE_VSX)
return vec_re(item());
#else
return simd4f(1.0f/item[0],
1.0f/item[1],
......@@ -366,6 +440,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_rsqrt_ps(item);
#elif defined(DLIB_HAVE_VSX)
return vec_rsqrt(item());
#else
return simd4f(1.0f/std::sqrt(item[0]),
1.0f/std::sqrt(item[1]),
......@@ -410,6 +486,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_sqrt_ps(item);
#elif defined(DLIB_HAVE_VSX)
return vec_sqrt(item());
#else
return simd4f(std::sqrt(item[0]),
std::sqrt(item[1]),
......@@ -434,6 +512,8 @@ namespace dlib
simd4f temp2;
temp2.load(temp);
return temp2;
#elif defined(DLIB_HAVE_VSX)
return vec_ceil(item());
#else
return simd4f(std::ceil(item[0]),
std::ceil(item[1]),
......@@ -458,6 +538,8 @@ namespace dlib
simd4f temp2;
temp2.load(temp);
return temp2;
#elif defined(DLIB_HAVE_VSX)
return vec_floor(item());
#else
return simd4f(std::floor(item[0]),
std::floor(item[1]),
......
......@@ -44,6 +44,52 @@ namespace dlib
private:
__m128i x;
};
#elif defined(DLIB_HAVE_VSX)
class simd4i
{
    // Storage for four packed 32bit integers.  The same 16 bytes can be viewed
    // as a signed integer vector (v), as a boolean/mask vector (b), or lane by
    // lane (x[0]..x[3]).
    union v4i
    {
        vector signed int v;
        vector bool int b;
        signed int x[4];
    };

    v4i x;

public:
    // Thin public wrapper around the private storage union.  It is the type
    // accepted by the rawarray constructor below.
    struct rawarray
    {
        v4i v;
    };

    // ---- construction -------------------------------------------------------

    // All four lanes start at zero.
    simd4i() : x{0,0,0,0} { }

    simd4i(const simd4i& v) : x(v.x) { }

    // Wrap an existing raw integer vector.
    simd4i(const vector int& v) : x{v} { }

    // Wrap a boolean (mask) vector; it is stored through the b view.
    simd4i(const vector bool int& b) { x.b = b; }

    // Broadcast a single value into every lane.
    simd4i(int32 f) : x{f,f,f,f} { }

    // Lane-wise construction; r0 ends up in lane 0, r3 in lane 3.
    simd4i(int32 r0, int32 r1, int32 r2, int32 r3) : x{r0,r1,r2,r3} { }

    // Adopt the storage carried by a rawarray.
    simd4i(const rawarray& a) : x{a.v} { }

    // ---- assignment ---------------------------------------------------------

    simd4i& operator=(const simd4i& v)
    {
        x = v.x;
        return *this;
    }

    // Assigning a scalar broadcasts it, exactly like the int32 constructor.
    simd4i& operator=(const int32& v)
    {
        return *this = simd4i(v);
    }

    // ---- element / register access -------------------------------------------

    // Hands back the raw vector so it can be fed to vec_* intrinsics.
    vector signed int operator() () const { return x.v; }

    // Reads one lane; idx is not range checked.
    int32 operator[](unsigned int idx) const { return x.x[idx]; }

    // Same bits, typed as a boolean vector.
    vector bool int to_bool() const { return x.b; }

    // ---- memory transfer ------------------------------------------------------

    // The intrinsics now seem to use xxpermdi automatically.

    // vec_ld / vec_st based transfer of four integers.
    // NOTE(review): presumably ptr must be 16-byte aligned -- confirm against
    // the AltiVec documentation for vec_ld/vec_st.
    void load_aligned(const int32* ptr) { x.v = vec_ld(0, ptr); }
    void store_aligned(int32* ptr) const { vec_st(x.v, 0, ptr); }

    // vec_vsx_ld / vec_vsx_st based transfer of four integers.
    void load(const int32* ptr) { x.v = vec_vsx_ld(0, ptr); }
    void store(int32* ptr) const { vec_vsx_st(x.v, 0, ptr); }
};
#else
class simd4i
......@@ -117,6 +163,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_add_epi32(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_add(lhs(), rhs());
#else
return simd4i(lhs[0]+rhs[0],
lhs[1]+rhs[1],
......@@ -133,6 +181,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_sub_epi32(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_sub(lhs(), rhs());
#else
return simd4i(lhs[0]-rhs[0],
lhs[1]-rhs[1],
......@@ -156,6 +206,10 @@ namespace dlib
_lhs[1]*_rhs[1],
_lhs[2]*_rhs[2],
_lhs[3]*_rhs[3]);
#elif defined(DLIB_HAVE_VSX)
vector int a = lhs(), b = rhs();
asm("vmuluwm %0, %0, %1\n\t" : "+&v" (a) : "v" (b) );
return simd4i(a);
#else
return simd4i(lhs[0]*rhs[0],
lhs[1]*rhs[1],
......@@ -172,6 +226,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_and_si128(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_and(lhs(), rhs());
#else
return simd4i(lhs[0]&rhs[0],
lhs[1]&rhs[1],
......@@ -188,6 +244,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_or_si128(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_or(lhs(), rhs());
#else
return simd4i(lhs[0]|rhs[0],
lhs[1]|rhs[1],
......@@ -204,6 +262,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_xor_si128(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_xor(lhs(), rhs());
#else
return simd4i(lhs[0]^rhs[0],
lhs[1]^rhs[1],
......@@ -220,6 +280,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_xor_si128(lhs, _mm_set1_epi32(0xFFFFFFFF));
#elif defined(DLIB_HAVE_VSX)
return vec_xor(lhs(), vec_splats(~0));
#else
return simd4i(~lhs[0],
~lhs[1],
......@@ -234,6 +296,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_sll_epi32(lhs,_mm_cvtsi32_si128(rhs));
#elif defined(DLIB_HAVE_VSX)
return vec_sl(lhs(), vec_splats((uint32_t)rhs));
#else
return simd4i(lhs[0]<<rhs,
lhs[1]<<rhs,
......@@ -250,6 +314,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_sra_epi32(lhs,_mm_cvtsi32_si128(rhs));
#elif defined(DLIB_HAVE_VSX)
return vec_sr(lhs(), vec_splats((uint32_t)rhs));
#else
return simd4i(lhs[0]>>rhs,
lhs[1]>>rhs,
......@@ -266,6 +332,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_cmpeq_epi32(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_cmpeq(lhs(), rhs());
#else
return simd4i(lhs[0]==rhs[0] ? 0xFFFFFFFF : 0,
lhs[1]==rhs[1] ? 0xFFFFFFFF : 0,
......@@ -278,7 +346,7 @@ namespace dlib
inline simd4i operator!= (const simd4i& lhs, const simd4i& rhs)
{
#ifdef DLIB_HAVE_SSE2
#if defined(DLIB_HAVE_SSE2) || defined(DLIB_HAVE_VSX)
return ~(lhs==rhs);
#else
return simd4i(lhs[0]!=rhs[0] ? 0xFFFFFFFF : 0,
......@@ -294,6 +362,8 @@ namespace dlib
{
#ifdef DLIB_HAVE_SSE2
return _mm_cmplt_epi32(lhs, rhs);
#elif defined(DLIB_HAVE_VSX)
return vec_cmplt(lhs(), rhs());
#else
return simd4i(lhs[0]<rhs[0] ? 0xFFFFFFFF : 0,
lhs[1]<rhs[1] ? 0xFFFFFFFF : 0,
......@@ -343,6 +413,8 @@ namespace dlib
std::min(_lhs[1],_rhs[1]),
std::min(_lhs[2],_rhs[2]),
std::min(_lhs[3],_rhs[3]));
#elif defined(DLIB_HAVE_VSX)
return vec_min(lhs(), rhs());
#else
return simd4i(std::min(lhs[0],rhs[0]),
std::min(lhs[1],rhs[1]),
......@@ -364,6 +436,8 @@ namespace dlib
std::max(_lhs[1],_rhs[1]),
std::max(_lhs[2],_rhs[2]),
std::max(_lhs[3],_rhs[3]));
#elif defined(DLIB_HAVE_VSX)
return vec_max(lhs(), rhs());
#else
return simd4i(std::max(lhs[0],rhs[0]),
std::max(lhs[1],rhs[1]),
......@@ -398,6 +472,8 @@ namespace dlib
return _mm_blendv_epi8(b,a,cmp);
#elif defined(DLIB_HAVE_SSE2)
return ((cmp&a) | _mm_andnot_si128(cmp,b));
#elif defined(DLIB_HAVE_VSX)
return vec_sel(b(), a(), cmp.to_bool());
#else
return ((cmp&a) | (~cmp&b));
#endif
......
......@@ -7,7 +7,6 @@
#include "simd4f.h"
#include "simd8i.h"
namespace dlib
{
#ifdef DLIB_HAVE_AVX
......
......@@ -51,12 +51,33 @@
#define DLIB_HAVE_AVX2
#endif
#endif
// POWER vector support: each DLIB_HAVE_* flag is defined when the matching
// compiler macro is present, unless the flag has already been defined.
#if defined(__ALTIVEC__) && !defined(DLIB_HAVE_ALTIVEC)
#define DLIB_HAVE_ALTIVEC
#endif
#if defined(__VSX__) && !defined(DLIB_HAVE_VSX)
#define DLIB_HAVE_VSX
#endif
// __VEC__ = 10206.  DLIB_HAVE_POWER_VEC marks availability of the vector
// keyword and the vec_ intrinsics.
#if defined(__VEC__) && !defined(DLIB_HAVE_POWER_VEC)
#define DLIB_HAVE_POWER_VEC
#endif
#endif
#endif
// ----------------------------------------------------------------------------------------
#ifdef DLIB_HAVE_ALTIVEC
#include <altivec.h>
#endif
#ifdef DLIB_HAVE_SSE2
#include <xmmintrin.h>
#include <emmintrin.h>
......@@ -77,6 +98,8 @@
// #include <avx2intrin.h>
#endif
#endif // DLIB_SIMd_CHECK_Hh_
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment