Intel SVML support
Roberto Agostino Vitillo
[please enable javascript to see the address]
Thu Jan 31 22:28:48 CET 2013
Hi,
The following patch adds support for the Intel SVML library. Intel SVML has an accuracy of 4 ulp (typically 2), and in general it seems to outperform Vc by a factor of up to 2 (Ivy Bridge). The Intel library also provides higher accuracy for double precision, which is vital for the science experiment I am working on.
Support is enabled by passing the path of the Intel SVML library to cmake through the INTEL_SVML_PATH variable, e.g. cmake -DINTEL_SVML_PATH=/opt/intel/composerxe/compiler/lib/intel64/.
The following tests are failing on Linux when enabling SVML:
c++11_math_sse (Failed)
c++11_math_avx (Failed)
math_VC_LOG_ILP_sse (Failed)
math_VC_LOG_ILP_avx (Failed)
c++11_math_VC_LOG_ILP_sse (Failed)
c++11_math_VC_LOG_ILP_avx (Failed)
math_VC_LOG_ILP2_sse (Failed)
math_VC_LOG_ILP2_avx (Failed)
c++11_math_VC_LOG_ILP2_sse (Failed)
c++11_math_VC_LOG_ILP2_avx (Failed)
They all fail on exp() and log(). Vc allows a distance of 1 and 2 for single and double precision, respectively, while SVML has a distance of 3 in some cases.
I am sure the code can be organized better architecturally, but hopefully it provides everything you need to add support for the Intel library.
Roberto
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9895338..83c4a46 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -98,6 +98,11 @@ if(Vc_COMPILER_IS_INTEL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w1 -fp-model precise")
endif()
+# Optional Intel SVML backend, enabled with -DINTEL_SVML_PATH=<directory containing the SVML library>.
+if(INTEL_SVML_PATH)
+    add_definitions(-DUSE_INTEL_SVML)
+    # Append to the linker flags that are already configured instead of replacing
+    # them, and expand INTEL_SVML_PATH (the variable tested above) for the -L path.
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${INTEL_SVML_PATH} -lsvml")
+endif(INTEL_SVML_PATH)
+
if(CMAKE_BUILD_TYPE STREQUAL "" AND NOT CMAKE_CXX_FLAGS MATCHES "-O[123]")
message(STATUS "WARNING! It seems you are compiling without optimization. Please set CMAKE_BUILD_TYPE.")
endif(CMAKE_BUILD_TYPE STREQUAL "" AND NOT CMAKE_CXX_FLAGS MATCHES "-O[123]")
diff --git a/common/exponential.h b/common/exponential.h
index 9063172..1f14e20 100644
--- a/common/exponential.h
+++ b/common/exponential.h
@@ -49,6 +49,40 @@ namespace Common
template<typename T> struct TypenameForLdexp { typedef Vector<int> Type; };
template<> struct TypenameForLdexp<Vc::sfloat> { typedef Vector<short> Type; };
+#if defined(USE_INTEL_SVML)
+#if defined(VC_IMPL_SSE)
+// SVML-backed exp() for Vector<float> in the SSE build: x.data() is passed to
+// __svml_expf4 and the value it returns becomes the data() of the result.
+static inline Vector<float> exp(VC_ALIGNED_PARAMETER(Vector<float>) x) {
+    Vector<float> result;
+    result.data() = __svml_expf4(x.data());
+    return result;
+}
+
+// SVML-backed exp() for Vector<sfloat> in the SSE build. data() is addressed as
+// two parts (index 0 and 1); each part is run through __svml_expf4 on its own.
+static inline Vector<sfloat> exp(VC_ALIGNED_PARAMETER(Vector<sfloat>) x) {
+    Vector<sfloat> result;
+    for (int part = 0; part < 2; ++part) {
+        result.data()[part] = __svml_expf4(x.data()[part]);
+    }
+    return result;
+}
+
+// SVML-backed exp() for Vector<double> in the SSE build: forwards x.data() to
+// __svml_exp2 (the __m128d variant declared in svml.h, not a base-2 exponential).
+static inline Vector<double> exp(VC_ALIGNED_PARAMETER(Vector<double>) x) {
+    Vector<double> result;
+    result.data() = __svml_exp2(x.data());
+    return result;
+}
+#else
+// SVML-backed exp() for the non-SSE (AVX) build, generic over the entry type T:
+// x.data() is handed to __svml_expf8. Vector<double> is covered by the explicit
+// specialization defined right after this template.
+template<typename T> static inline Vector<T> exp(VC_ALIGNED_PARAMETER(Vector<T>) x) {
+    Vector<T> result;
+    result.data() = __svml_expf8(x.data());
+    return result;
+}
+
+// Vector<double> specialization of the SVML-backed exp() for the non-SSE (AVX)
+// build: forwards x.data() to __svml_exp4 (the __m256d variant in svml.h).
+template<> inline Vector<double> exp(VC_ALIGNED_PARAMETER(Vector<double>) x) {
+    Vector<double> result;
+    result.data() = __svml_exp4(x.data());
+    return result;
+}
+#endif
+#else
template<typename T> static inline Vector<T> exp(VC_ALIGNED_PARAMETER(Vector<T>) _x) {
typedef Vector<T> V;
typedef typename V::Mask M;
@@ -131,6 +165,7 @@ namespace Common
return x;
}
+#endif
} // namespace Common
namespace VC__USE_NAMESPACE
{
diff --git a/common/logarithm.h b/common/logarithm.h
index f5b8455..5247ce6 100644
--- a/common/logarithm.h
+++ b/common/logarithm.h
@@ -49,6 +49,8 @@
#define VC_COMMON_LOGARITHM_H
#include "macros.h"
+#include "svml.h"
+
namespace Vc
{
namespace Common
@@ -56,6 +58,9 @@ namespace Common
#ifdef VC__USE_NAMESPACE
using Vc::VC__USE_NAMESPACE::Const;
using Vc::VC__USE_NAMESPACE::Vector;
+using Vc::VC__USE_NAMESPACE::float_v;
+using Vc::VC__USE_NAMESPACE::sfloat_v;
+using Vc::VC__USE_NAMESPACE::double_v;
#endif
enum LogarithmBase {
BaseE, Base10, Base2
@@ -166,8 +171,8 @@ struct LogImpl
}
}
- static inline Vc_ALWAYS_INLINE void log_series(Vector<double> &VC_RESTRICT x, Vector<double>::AsArg exponent) {
- typedef Vector<double> V;
+ static inline Vc_ALWAYS_INLINE void log_series(double_v &VC_RESTRICT x, double_v::AsArg exponent) {
+ typedef double_v V;
typedef Const<double> C;
const V x2 = x * x;
V y = C::P(0);
@@ -246,6 +251,107 @@ struct LogImpl
}
};
+#if defined(USE_INTEL_SVML)
+#if defined(VC_IMPL_SSE)
+// log
+// SVML-backed log() for float_v (SSE build): x.data() goes to __svml_logf4 and
+// the returned value becomes the data() of the result.
+static inline float_v log(VC_ALIGNED_PARAMETER(float_v) x) {
+    float_v result;
+    result.data() = __svml_logf4(x.data());
+    return result;
+}
+
+// SVML-backed log() for sfloat_v (SSE build). data() is addressed as two parts
+// (index 0 and 1); each part is run through __svml_logf4 separately.
+static inline sfloat_v log(VC_ALIGNED_PARAMETER(sfloat_v) x) {
+    sfloat_v result;
+    for (int part = 0; part < 2; ++part) {
+        result.data()[part] = __svml_logf4(x.data()[part]);
+    }
+    return result;
+}
+
+// SVML-backed log() for double_v (SSE build): forwards x.data() to __svml_log2,
+// which svml.h declares on __m128d (the "2" is part of the variant suffix; the
+// base-2 logarithm on __m128d is __svml_log22).
+static inline double_v log(VC_ALIGNED_PARAMETER(double_v) x) {
+    double_v result;
+    result.data() = __svml_log2(x.data());
+    return result;
+}
+
+// log10
+// SVML-backed log10() for float_v (SSE build): forwards x.data() to
+// __svml_log10f4 and wraps the returned value.
+static inline float_v log10(VC_ALIGNED_PARAMETER(float_v) x) {
+    float_v result;
+    result.data() = __svml_log10f4(x.data());
+    return result;
+}
+
+// SVML-backed log10() for sfloat_v (SSE build). Both parts of data() (index 0
+// and 1) are passed through __svml_log10f4 one after the other.
+static inline sfloat_v log10(VC_ALIGNED_PARAMETER(sfloat_v) x) {
+    sfloat_v result;
+    for (int part = 0; part < 2; ++part) {
+        result.data()[part] = __svml_log10f4(x.data()[part]);
+    }
+    return result;
+}
+
+// SVML-backed log10() for double_v (SSE build): forwards x.data() to
+// __svml_log102 (the __m128d variant declared in svml.h).
+static inline double_v log10(VC_ALIGNED_PARAMETER(double_v) x) {
+    double_v result;
+    result.data() = __svml_log102(x.data());
+    return result;
+}
+// log2
+// SVML-backed log2() for float_v (SSE build): forwards x.data() to
+// __svml_log2f4 and wraps the returned value.
+static inline float_v log2(VC_ALIGNED_PARAMETER(float_v) x) {
+    float_v result;
+    result.data() = __svml_log2f4(x.data());
+    return result;
+}
+
+// SVML-backed log2() for sfloat_v (SSE build). Both parts of data() (index 0
+// and 1) are passed through __svml_log2f4 one after the other.
+static inline sfloat_v log2(VC_ALIGNED_PARAMETER(sfloat_v) x) {
+    sfloat_v result;
+    for (int part = 0; part < 2; ++part) {
+        result.data()[part] = __svml_log2f4(x.data()[part]);
+    }
+    return result;
+}
+
+// SVML-backed log2() for double_v (SSE build): forwards x.data() to
+// __svml_log22 (the __m128d variant declared in svml.h).
+static inline double_v log2(VC_ALIGNED_PARAMETER(double_v) x) {
+    double_v result;
+    result.data() = __svml_log22(x.data());
+    return result;
+}
+#else
+// log
+// SVML-backed log() for the non-SSE (AVX) build, generic over the entry type T:
+// x.data() is handed to __svml_logf8. double_v is covered by the explicit
+// specialization defined right after this template.
+template<typename T> static inline Vector<T> log(VC_ALIGNED_PARAMETER(Vector<T>) x) {
+    Vector<T> result;
+    result.data() = __svml_logf8(x.data());
+    return result;
+}
+
+// double_v specialization of the SVML-backed log() for the non-SSE (AVX) build:
+// forwards x.data() to __svml_log4 (the __m256d variant declared in svml.h).
+template<> inline double_v log(VC_ALIGNED_PARAMETER(double_v) x) {
+    double_v result;
+    result.data() = __svml_log4(x.data());
+    return result;
+}
+
+// log10
+// SVML-backed log10() for the non-SSE (AVX) build, generic over the entry type
+// T: x.data() is handed to __svml_log10f8. double_v is covered by the explicit
+// specialization defined right after this template.
+template<typename T> static inline Vector<T> log10(VC_ALIGNED_PARAMETER(Vector<T>) x) {
+    Vector<T> result;
+    result.data() = __svml_log10f8(x.data());
+    return result;
+}
+
+// double_v specialization of the SVML-backed log10() for the non-SSE (AVX)
+// build: forwards x.data() to __svml_log104 (the __m256d variant in svml.h).
+template<> inline double_v log10(VC_ALIGNED_PARAMETER(double_v) x) {
+    double_v result;
+    result.data() = __svml_log104(x.data());
+    return result;
+}
+
+// log2
+// SVML-backed log2() for the non-SSE (AVX) build, generic over the entry type
+// T: x.data() is handed to __svml_log2f8. double_v is covered by the explicit
+// specialization defined right after this template.
+template<typename T> static inline Vector<T> log2(VC_ALIGNED_PARAMETER(Vector<T>) x) {
+    Vector<T> result;
+    result.data() = __svml_log2f8(x.data());
+    return result;
+}
+
+// double_v specialization of the SVML-backed log2() for the non-SSE (AVX)
+// build: forwards x.data() to __svml_log24 (the __m256d variant in svml.h).
+template<> inline double_v log2(VC_ALIGNED_PARAMETER(double_v) x) {
+    double_v result;
+    result.data() = __svml_log24(x.data());
+    return result;
+}
+#endif
+#else
template<typename T> static inline Vector<T> log(VC_ALIGNED_PARAMETER(Vector<T>) x) {
typedef typename Vector<T>::Mask M;
typedef Const<T> C;
@@ -261,6 +367,8 @@ template<typename T> static inline Vector<T> log2(VC_ALIGNED_PARAMETER(Vector<T>
typedef Const<T> C;
return LogImpl<Base2>::calc(x);
}
+#endif
+
} // namespace Common
#ifdef VC__USE_NAMESPACE
namespace VC__USE_NAMESPACE
diff --git a/common/svml.h b/common/svml.h
new file mode 100644
index 0000000..8ecd782
--- /dev/null
+++ b/common/svml.h
@@ -0,0 +1,68 @@
+/* This file is part of the Vc library.
+
[please enable javascript to see the address]>
+
+ Vc is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as
+ published by the Free Software Foundation, either version 3 of
+ the License, or (at your option) any later version.
+
+ Vc is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with Vc. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef VC_COMMON_SVML_H
+#define VC_COMMON_SVML_H
+
+#if defined(USE_INTEL_SVML)
+// Prototypes for the Intel SVML entry points used by Vc.
+//
+// Naming as it appears below: an "f4"/"f8" suffix marks the __m128/__m256
+// variant, a bare "2"/"4" suffix marks the __m128d/__m256d variant. The suffix
+// is appended to the function name, so __svml_atan2 is atan on __m128d while
+// __svml_atan22 is the two-argument atan2 on __m128d; likewise __svml_log2 is
+// the natural log on __m128d and __svml_log22 is log2 on __m128d.
+//
+// The sincos prototypes declare a single return value. The callers in
+// src/trigonometric.cpp store that value as the sine and read the cosine out
+// of xmm1/ymm1 with inline assembly directly after the call.
+extern "C" {
+// ---- 128-bit variants (__m128 / __m128d) ----
+__m128  __svml_sinf4(__m128 x);
+__m128d __svml_sin2(__m128d x);
+__m128  __svml_cosf4(__m128 x);
+__m128d __svml_cos2(__m128d x);
+__m128  __svml_sincosf4(__m128 x);
+__m128d __svml_sincos2(__m128d x);
+__m128  __svml_asinf4(__m128 x);
+__m128d __svml_asin2(__m128d x);
+__m128  __svml_atanf4(__m128 x);
+__m128d __svml_atan2(__m128d x);
+__m128  __svml_atan2f4(__m128 first, __m128 second);
+__m128d __svml_atan22(__m128d first, __m128d second);
+__m128  __svml_logf4(__m128 x);
+__m128d __svml_log2(__m128d x);
+__m128  __svml_log2f4(__m128 x);
+__m128d __svml_log22(__m128d x);
+__m128  __svml_log10f4(__m128 x);
+__m128d __svml_log102(__m128d x);
+__m128  __svml_expf4(__m128 x);
+__m128d __svml_exp2(__m128d x);
+
+// ---- 256-bit variants (__m256 / __m256d) ----
+__m256  __svml_sinf8(__m256 x);
+__m256d __svml_sin4(__m256d x);
+__m256  __svml_cosf8(__m256 x);
+__m256d __svml_cos4(__m256d x);
+__m256  __svml_sincosf8(__m256 x);
+__m256d __svml_sincos4(__m256d x);
+__m256  __svml_asinf8(__m256 x);
+__m256d __svml_asin4(__m256d x);
+__m256  __svml_atanf8(__m256 x);
+__m256d __svml_atan4(__m256d x);
+__m256  __svml_atan2f8(__m256 first, __m256 second);
+__m256d __svml_atan24(__m256d first, __m256d second);
+__m256  __svml_logf8(__m256 x);
+__m256d __svml_log4(__m256d x);
+__m256  __svml_log2f8(__m256 x);
+__m256d __svml_log24(__m256d x);
+__m256  __svml_log10f8(__m256 x);
+__m256d __svml_log104(__m256d x);
+__m256  __svml_expf8(__m256 x);
+__m256d __svml_exp4(__m256d x);
+}
+#endif
+
+#endif
diff --git a/src/trigonometric.cpp b/src/trigonometric.cpp
index e24bc93..2e41059 100644
--- a/src/trigonometric.cpp
+++ b/src/trigonometric.cpp
@@ -20,6 +20,7 @@
#include <Vc/Vc>
#if defined(VC_IMPL_SSE) || defined(VC_IMPL_AVX)
#include <common/macros.h>
+#include <common/svml.h>
namespace Vc
{
@@ -74,6 +75,229 @@ namespace
}
} // anonymous namespace
+#if defined(USE_INTEL_SVML)
+#if defined(VC_IMPL_SSE)
+// sin
+// SVML-backed sin() for float_v (SSE build): _x.data() is passed to
+// __svml_sinf4 and the returned value becomes the data() of the result.
+template<> template<> float_v Trigonometric<VC_IMPL>::sin(const float_v &_x){
+    float_v result;
+    result.data() = __svml_sinf4(_x.data());
+    return result;
+}
+
+// SVML-backed sin() for sfloat_v (SSE build). data() is addressed as two parts
+// (index 0 and 1); each part is run through __svml_sinf4 separately.
+template<> template<> sfloat_v Trigonometric<VC_IMPL>::sin(const sfloat_v &_x){
+    sfloat_v result;
+    for (int part = 0; part < 2; ++part) {
+        result.data()[part] = __svml_sinf4(_x.data()[part]);
+    }
+    return result;
+}
+
+// SVML-backed sin() for double_v (SSE build): forwards _x.data() to
+// __svml_sin2 (the __m128d variant declared in svml.h).
+template<> template<> double_v Trigonometric<VC_IMPL>::sin(const double_v &_x){
+    double_v result;
+    result.data() = __svml_sin2(_x.data());
+    return result;
+}
+
+// cos
+// SVML-backed cos() for float_v (SSE build): _x.data() is passed to
+// __svml_cosf4 and the returned value becomes the data() of the result.
+template<> template<> float_v Trigonometric<VC_IMPL>::cos(const float_v &_x){
+    float_v result;
+    result.data() = __svml_cosf4(_x.data());
+    return result;
+}
+
+// SVML-backed cos() for sfloat_v (SSE build). data() is addressed as two parts
+// (index 0 and 1); each part is run through __svml_cosf4 separately.
+template<> template<> sfloat_v Trigonometric<VC_IMPL>::cos(const sfloat_v &_x){
+    sfloat_v result;
+    for (int part = 0; part < 2; ++part) {
+        result.data()[part] = __svml_cosf4(_x.data()[part]);
+    }
+    return result;
+}
+
+// SVML-backed cos() for double_v (SSE build): forwards _x.data() to
+// __svml_cos2 (the __m128d variant declared in svml.h).
+template<> template<> double_v Trigonometric<VC_IMPL>::cos(const double_v &_x){
+    double_v result;
+    result.data() = __svml_cos2(_x.data());
+    return result;
+}
+
+// sincos
+// SVML-backed sincos() for float_v (SSE build). The declared return value of
+// __svml_sincosf4 is stored as the sine; the cosine is then copied out of
+// register xmm1 into _cos->data().
+// NOTE(review): this relies on SVML leaving the cosine in xmm1 and on the
+// compiler not touching xmm1 between the call and the asm statement - confirm
+// against the SVML calling convention.
+template<> template<> void Trigonometric<VC_IMPL>::sincos(const float_v &_x, float_v *_sin, float_v *_cos) {
+    _sin->data() = __svml_sincosf4(_x.data());
+#if defined(__unix__) || defined(__GNUC__)
+    __asm__ __volatile__ ( "movaps %%xmm1, %0":"=m"(_cos->data()));
+#else // Windows
+    // 128-bit store from xmm1, matching the GNU branch. A ymm1 store is the
+    // AVX-width form and does not fit the 128-bit destination used in this build.
+    _asm movaps _cos->data(), xmm1;
+#endif
+}
+
+// SVML-backed sincos() for sfloat_v (SSE build). data() is addressed as two
+// parts (index 0 and 1). For each part the declared return value of
+// __svml_sincosf4 is stored as the sine and the cosine is copied out of xmm1
+// immediately afterwards, before the next call can overwrite it.
+// NOTE(review): this relies on SVML leaving the cosine in xmm1 and on the
+// compiler not touching xmm1 between each call and its asm statement - confirm
+// against the SVML calling convention.
+template<> template<> void Trigonometric<VC_IMPL>::sincos(const sfloat_v &_x, sfloat_v *_sin, sfloat_v *_cos) {
+    _sin->data()[0] = __svml_sincosf4(_x.data()[0]);
+#if defined(__unix__) || defined(__GNUC__)
+    __asm__ __volatile__ ( "movaps %%xmm1, %0":"=m"(_cos->data()[0]));
+#else // Windows
+    // 128-bit store from xmm1, matching the GNU branch (not the AVX-width ymm1 form).
+    _asm movaps _cos->data()[0], xmm1;
+#endif
+
+    _sin->data()[1] = __svml_sincosf4(_x.data()[1]);
+#if defined(__unix__) || defined(__GNUC__)
+    __asm__ __volatile__ ( "movaps %%xmm1, %0":"=m"(_cos->data()[1]));
+#else // Windows
+    // 128-bit store from xmm1, matching the GNU branch (not the AVX-width ymm1 form).
+    _asm movaps _cos->data()[1], xmm1;
+#endif
+}
+
+// SVML-backed sincos() for double_v (SSE build). The declared return value of
+// __svml_sincos2 is stored as the sine; the cosine is then copied out of
+// register xmm1 into _cos->data().
+// NOTE(review): this relies on SVML leaving the cosine in xmm1 and on the
+// compiler not touching xmm1 between the call and the asm statement - confirm
+// against the SVML calling convention.
+template<> template<> void Trigonometric<VC_IMPL>::sincos(const double_v &_x, double_v *_sin, double_v *_cos) {
+    _sin->data() = __svml_sincos2(_x.data());
+#if defined(__unix__) || defined(__GNUC__)
+    __asm__ __volatile__ ( "movaps %%xmm1, %0":"=m"(_cos->data()));
+#else // Windows
+    // 128-bit store from xmm1, matching the GNU branch. A ymm1 store is the
+    // AVX-width form and does not fit the 128-bit destination used in this build.
+    _asm movaps _cos->data(), xmm1;
+#endif
+}
+
+// asin
+// SVML-backed asin() for float_v (SSE build): _x.data() is passed to
+// __svml_asinf4 and the returned value becomes the data() of the result.
+template<> template<> float_v Trigonometric<VC_IMPL>::asin(const float_v &_x){
+    float_v result;
+    result.data() = __svml_asinf4(_x.data());
+    return result;
+}
+
+// SVML-backed asin() for sfloat_v (SSE build). data() is addressed as two parts
+// (index 0 and 1); each part is run through __svml_asinf4 separately.
+template<> template<> sfloat_v Trigonometric<VC_IMPL>::asin(const sfloat_v &_x){
+    sfloat_v result;
+    for (int part = 0; part < 2; ++part) {
+        result.data()[part] = __svml_asinf4(_x.data()[part]);
+    }
+    return result;
+}
+
+// SVML-backed asin() for double_v (SSE build): forwards _x.data() to
+// __svml_asin2 (the __m128d variant declared in svml.h).
+template<> template<> double_v Trigonometric<VC_IMPL>::asin(const double_v &_x){
+    double_v result;
+    result.data() = __svml_asin2(_x.data());
+    return result;
+}
+
+// atan
+// SVML-backed atan() for float_v (SSE build): _x.data() is passed to
+// __svml_atanf4 and the returned value becomes the data() of the result.
+template<> template<> float_v Trigonometric<VC_IMPL>::atan(const float_v &_x){
+    float_v result;
+    result.data() = __svml_atanf4(_x.data());
+    return result;
+}
+
+// SVML-backed atan() for sfloat_v (SSE build). data() is addressed as two parts
+// (index 0 and 1); each part is run through __svml_atanf4 separately.
+template<> template<> sfloat_v Trigonometric<VC_IMPL>::atan(const sfloat_v &_x){
+    sfloat_v result;
+    for (int part = 0; part < 2; ++part) {
+        result.data()[part] = __svml_atanf4(_x.data()[part]);
+    }
+    return result;
+}
+
+// SVML-backed atan() for double_v (SSE build): forwards _x.data() to
+// __svml_atan2, which svml.h declares as the one-argument __m128d variant (the
+// two-argument atan2 on __m128d is __svml_atan22).
+template<> template<> double_v Trigonometric<VC_IMPL>::atan(const double_v &_x){
+    double_v result;
+    result.data() = __svml_atan2(_x.data());
+    return result;
+}
+
+// atan2
+// SVML-backed atan2() for float_v (SSE build): both arguments are forwarded to
+// __svml_atan2f4 in declaration order (_x first, _y second).
+// NOTE(review): the non-SVML double_v definition names its first parameter
+// `y`; presumably the first argument here is also the y coordinate despite
+// being called _x - confirm.
+template<> template<> float_v Trigonometric<VC_IMPL>::atan2(const float_v &_x, const float_v &_y){
+    float_v result;
+    result.data() = __svml_atan2f4(_x.data(), _y.data());
+    return result;
+}
+
+// SVML-backed atan2() for sfloat_v (SSE build). data() is addressed as two
+// parts (index 0 and 1); for each part the matching pieces of both arguments
+// are forwarded to __svml_atan2f4 in declaration order (_x first, _y second).
+// NOTE(review): the non-SVML double_v definition names its first parameter
+// `y`; presumably the first argument here is also the y coordinate despite
+// being called _x - confirm.
+template<> template<> sfloat_v Trigonometric<VC_IMPL>::atan2(const sfloat_v &_x, const sfloat_v &_y){
+    sfloat_v result;
+    for (int part = 0; part < 2; ++part) {
+        result.data()[part] = __svml_atan2f4(_x.data()[part], _y.data()[part]);
+    }
+    return result;
+}
+
+// SVML-backed atan2() for double_v (SSE build): both arguments are forwarded to
+// __svml_atan22 in declaration order (_x first, _y second).
+// NOTE(review): the non-SVML definition of this overload names its first
+// parameter `y`; the first argument is called _x here but is still passed as
+// SVML's first operand - confirm the operand order matches.
+template<> template<> double_v Trigonometric<VC_IMPL>::atan2(const double_v &_x, const double_v &_y){
+    double_v result;
+    result.data() = __svml_atan22(_x.data(), _y.data());
+    return result;
+}
+#else
+// sin
+// SVML-backed sin() for the non-SSE (AVX) build, generic over the entry type
+// _T: _x.data() is handed to __svml_sinf8. double_v is covered by the explicit
+// specialization defined right after this template.
+template<> template<typename _T> Vector<_T> Trigonometric<VC_IMPL>::sin(const Vector<_T> &_x){
+    Vector<_T> result;
+    result.data() = __svml_sinf8(_x.data());
+    return result;
+}
+
+// double_v specialization of the SVML-backed sin() for the non-SSE (AVX) build:
+// forwards _x.data() to __svml_sin4 (the __m256d variant declared in svml.h).
+template<> template<> double_v Trigonometric<VC_IMPL>::sin(const double_v &_x){
+    double_v result;
+    result.data() = __svml_sin4(_x.data());
+    return result;
+}
+
+// cos
+// SVML-backed cos() for the non-SSE (AVX) build, generic over the entry type
+// _T: _x.data() is handed to __svml_cosf8. double_v is covered by the explicit
+// specialization defined right after this template.
+template<> template<typename _T> Vector<_T> Trigonometric<VC_IMPL>::cos(const Vector<_T> &_x){
+    Vector<_T> result;
+    result.data() = __svml_cosf8(_x.data());
+    return result;
+}
+
+// double_v specialization of the SVML-backed cos() for the non-SSE (AVX) build:
+// forwards _x.data() to __svml_cos4 (the __m256d variant declared in svml.h).
+template<> template<> double_v Trigonometric<VC_IMPL>::cos(const double_v &_x){
+    double_v result;
+    result.data() = __svml_cos4(_x.data());
+    return result;
+}
+
+// sincos
+// SVML-backed sincos() for the non-SSE (AVX) build, generic over the entry type
+// _T. The declared return value of __svml_sincosf8 is stored as the sine; the
+// cosine is then copied out of register ymm1 into _cos->data().
+// NOTE(review): this relies on SVML leaving the cosine in ymm1 and on the
+// compiler not touching ymm1 between the call and the asm statement - confirm
+// against the SVML calling convention.
+template<> template<typename _T> void Trigonometric<VC_IMPL>::sincos(const Vector<_T> &_x, Vector<_T> *_sin, Vector<_T> *_cos) {
+    _sin->data() = __svml_sincosf8(_x.data());
+#if !defined(__unix__) && !defined(__GNUC__)
+    // Windows
+    _asm vmovaps _cos->data(), ymm1;
+#else
+    __asm__ __volatile__ ( "vmovaps %%ymm1, %0":"=m"(_cos->data()));
+#endif
+}
+
+// double_v specialization of the SVML-backed sincos() for the non-SSE (AVX)
+// build. The declared return value of __svml_sincos4 is stored as the sine; the
+// cosine is then copied out of register ymm1 into _cos->data().
+// NOTE(review): this relies on SVML leaving the cosine in ymm1 and on the
+// compiler not touching ymm1 between the call and the asm statement - confirm
+// against the SVML calling convention.
+template<> template<> void Trigonometric<VC_IMPL>::sincos(const double_v &_x, double_v *_sin, double_v *_cos) {
+    _sin->data() = __svml_sincos4(_x.data());
+#if !defined(__unix__) && !defined(__GNUC__)
+    // Windows
+    _asm vmovaps _cos->data(), ymm1;
+#else
+    __asm__ __volatile__ ( "vmovaps %%ymm1, %0":"=m"(_cos->data()));
+#endif
+}
+
+// asin
+// SVML-backed asin() for the non-SSE (AVX) build, generic over the entry type
+// _T: _x.data() is handed to __svml_asinf8. double_v is covered by the explicit
+// specialization defined right after this template.
+template<> template<typename _T> Vector<_T> Trigonometric<VC_IMPL>::asin(const Vector<_T> &_x){
+    Vector<_T> result;
+    result.data() = __svml_asinf8(_x.data());
+    return result;
+}
+
+// double_v specialization of the SVML-backed asin() for the non-SSE (AVX)
+// build: forwards _x.data() to __svml_asin4 (the __m256d variant in svml.h).
+template<> template<> double_v Trigonometric<VC_IMPL>::asin(const double_v &_x){
+    double_v result;
+    result.data() = __svml_asin4(_x.data());
+    return result;
+}
+
+// atan
+// SVML-backed atan() for the non-SSE (AVX) build, generic over the entry type
+// _T: _x.data() is handed to __svml_atanf8. double_v is covered by the explicit
+// specialization defined right after this template.
+template<> template<typename _T> Vector<_T> Trigonometric<VC_IMPL>::atan(const Vector<_T> &_x){
+    Vector<_T> result;
+    result.data() = __svml_atanf8(_x.data());
+    return result;
+}
+
+// double_v specialization of the SVML-backed atan() for the non-SSE (AVX)
+// build: forwards _x.data() to __svml_atan4 (the __m256d variant in svml.h).
+template<> template<> double_v Trigonometric<VC_IMPL>::atan(const double_v &_x){
+    double_v result;
+    result.data() = __svml_atan4(_x.data());
+    return result;
+}
+
+// atan2
+// SVML-backed atan2() for the non-SSE (AVX) build, generic over the entry type
+// _T: both arguments are forwarded to __svml_atan2f8 in declaration order (_x
+// first, _y second). double_v is covered by the explicit specialization defined
+// right after this template.
+// NOTE(review): the non-SVML double_v definition names its first parameter
+// `y`; presumably the first argument here is also the y coordinate despite
+// being called _x - confirm.
+template<> template<typename _T> Vector<_T> Trigonometric<VC_IMPL>::atan2(const Vector<_T> &_x, const Vector<_T> &_y){
+    Vector<_T> result;
+    result.data() = __svml_atan2f8(_x.data(), _y.data());
+    return result;
+}
+
+// double_v specialization of the SVML-backed atan2() for the non-SSE (AVX)
+// build: both arguments are forwarded to __svml_atan24 in declaration order
+// (_x first, _y second).
+// NOTE(review): the non-SVML definition of this overload names its first
+// parameter `y`; the first argument is called _x here but is still passed as
+// SVML's first operand - confirm the operand order matches.
+template<> template<> double_v Trigonometric<VC_IMPL>::atan2(const double_v &_x, const double_v &_y){
+    double_v result;
+    result.data() = __svml_atan24(_x.data(), _y.data());
+    return result;
+}
+#endif
+#else
+
/*
* algorithm for sine and cosine:
*
@@ -472,6 +696,8 @@ template<> template<> double_v Trigonometric<VC_IMPL>::atan2 (const double_v &y,
return a;
}
+#endif
+
} // namespace Vc
#include <common/undomacros.h>
More information about the Vc-devel
mailing list