## 利用宏完成编译器设置
// Compiler-configuration macros: portable "force inline" and host/device
// annotations so the same SIMD code builds under GCC, Clang, NVCC and HIPCC.

// Force aggressive inlining where the GNU attribute is known to work.
// Clang only honors [[gnu::always_inline]] reliably from version 12 on.
#ifndef SIMD_ALWAYS_INLINE
#if (defined(__clang__) && (__clang_major__ >= 12)) || \
(defined(__GNUC__) && !defined(__clang__))
#define SIMD_ALWAYS_INLINE [[gnu::always_inline]]
#else
#define SIMD_ALWAYS_INLINE
#endif
#endif
// CUDA/HIP force-inline spellings.  Defined to empty on other compilers so
// code that uses them unconditionally still builds for host-only targets
// (previously they were simply undefined outside CUDA/HIP compiles).
#if defined( __CUDACC__ )
#define SIMD_CUDA_ALWAYS_INLINE __forceinline__
#else
#define SIMD_CUDA_ALWAYS_INLINE
#endif
#if defined( __HIPCC__ )
#define SIMD_HIP_ALWAYS_INLINE __forceinline__
#else
#define SIMD_HIP_ALWAYS_INLINE
#endif
// Mark functions callable from both host and device under CUDA/HIP;
// expands to nothing for plain host compilers.
#if defined( __CUDACC__ ) || defined( __HIPCC__ )
#define SIMD_HOST_DEVICE __host__ __device__
#else
#define SIMD_HOST_DEVICE
#endif
// Device-only annotation, empty on pure host compilers.
#if defined( __CUDACC__ ) || defined( __HIPCC__ )
#define SIMD_DEVICE __device__
#else
#define SIMD_DEVICE
#endif
## 为矢量化计算进行编译器优化
// Loop-vectorization hint: pick the strongest pragma the toolchain supports.
// The #elif order is deliberate: an OpenMP build uses "omp simd" even when
// the compiler is Clang or GCC.
#ifndef SIMD_PRAGMA
#if defined(_OPENMP)
#define SIMD_PRAGMA _Pragma("omp simd")
#elif defined(__clang__)
// Clang loop hint: requests (does not force) vectorization of the next loop.
#define SIMD_PRAGMA _Pragma("clang loop vectorize(enable)")
#elif defined(__GNUC__) && !defined(__FUJITSU)
// GCC: assert no loop-carried dependencies so the vectorizer may proceed.
// NOTE(review): presumably the Fujitsu compiler defines __GNUC__ but does
// not accept this pragma, hence the exclusion — confirm on that toolchain.
#define SIMD_PRAGMA _Pragma("GCC ivdep")
#else
// Unknown compiler: expand to nothing; loops remain legal scalar code.
#define SIMD_PRAGMA
#endif
#endif
## 重载运算符
// Compound addition for simd values, expressed via the binary operator+ so
// each backend only needs to implement the non-compound form.  Returns the
// updated left operand by reference, matching built-in += semantics.
template <class T, class Abi>
SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd<T, Abi>& operator+=(simd<T, Abi>& a, simd<T, Abi> const& b) {
  return a = a + b;
}
## 定义数据存储结构
// Plain-array backing store holding one simd register's worth of T.
// Lets simd values be kept inside ordinary structs/containers, converting
// to and from simd<T, Abi> only at the boundaries.
template <class T, class Abi>
class simd_storage {
  T m_value[simd<T, Abi>::size()];
public:
  using value_type = T;
  using simd_type = simd<T, Abi>;
  // Number of stored lanes; mirrors simd<T, Abi>::size().
  SIMD_ALWAYS_INLINE inline static constexpr
  int size() { return simd<T, Abi>::size(); }
  // Default construction leaves the array indeterminate, like a raw T[].
  SIMD_ALWAYS_INLINE inline simd_storage() = default;
  // Spill a simd register into the array.
  SIMD_ALWAYS_INLINE explicit inline
  simd_storage(simd<T, Abi> const& value) {
    value.copy_to(m_value, element_aligned_tag());
  }
  // Broadcast a scalar across all lanes, then spill (delegates to the
  // simd-taking constructor).
  SIMD_ALWAYS_INLINE explicit inline
  simd_storage(T value)
    :simd_storage(simd<T, Abi>(value))
  {}
  // Overwrite the stored lanes from a simd register.
  SIMD_ALWAYS_INLINE inline
  simd_storage& operator=(simd<T, Abi> const& value) {
    value.copy_to(m_value, element_aligned_tag());
    return *this;
  }
};
## 利用 using 完成类型/类别名，以提供统一的接口
// Nested aliases that give every simd-related class a uniform interface, so
// generic code can query value_type / simd_type / abi_type the same way.
// NOTE(review): shown out of context — these lines presumably sit inside a
// mask-like class of the AVX float specialization; confirm against the file.
using value_type = bool;
using simd_type = simd<float, simd_abi::avx>;
using abi_type = simd_abi::avx;
## 利用 constexpr 优化，尽量将计算完成在编译期
SIMD_ALWAYS_INLINE inline static constexpr int size() { return 4; }
## volatile：需要更多的思考
// Opt-in (STK_VOLATILE_SIMD) construction from a volatile simd.
// NOTE(review): reading whole registers through volatile defeats most
// optimization, and requires m_value's type to support a volatile read —
// confirm the backends built with this flag actually provide that.
#ifdef STK_VOLATILE_SIMD
SIMD_ALWAYS_INLINE inline
simd(simd const volatile& value)
:m_value(value.m_value)
{}
#endif
## 思考
// AVX double backend: wraps a raw __m256d register.  constexpr so a simd
// built from a constant register value can be formed at compile time.
SIMD_ALWAYS_INLINE inline constexpr simd(__m256d const& value_in)
:m_value(value_in)
{}
...
// Lane-wise multiply, lowered directly to the AVX multiply intrinsic.
SIMD_ALWAYS_INLINE inline simd operator*(simd const& other) const {
return simd(_mm256_mul_pd(m_value, other.m_value));
}
## 思考：是否引入右值引用/移动语义会更好？
// NOTE(review): the next four methods are excerpts from two different
// specializations pasted together — a scalar (pack-of-one) backend and the
// AVX double backend; confirm their enclosing classes in the full file.
// Scalar backend: an element-aligned load/store is a plain dereference.
SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE void copy_from(T const* ptr, element_aligned_tag) {
m_value = *ptr;
}
SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE void copy_to(T* ptr, element_aligned_tag) const {
*ptr = m_value;
}
// AVX double backend: element_aligned_tag only promises alignof(double),
// hence the unaligned loadu/storeu intrinsics rather than the 32-byte
// aligned _mm256_load_pd/_mm256_store_pd forms.
SIMD_ALWAYS_INLINE inline void copy_from(double const* ptr, element_aligned_tag) {
m_value = _mm256_loadu_pd(ptr);
}
SIMD_ALWAYS_INLINE inline void copy_to(double* ptr, element_aligned_tag) const {
_mm256_storeu_pd(ptr, m_value);
}
## 思考：引入内存分配器是否会更快速？