这篇文章的延续(OpenMP MPI CUDA 并行计算 PI 的值)
继上篇文章简单利用 OpenMP 和 CUDA 计算了 PI 的值之后,现在利用 oneAPI 再试一次,大体思路保持一致。
oneAPI 部分代码源于 Intel 官方文章《Intel® oneAPI Math Kernel Library Data Parallel C++ Usage Models (on the Example of Monte Carlo Simulation)》,特此说明。
# 环境配置
- oneapi 2021.1-beta08、gcc 10
- manjaro 20
- CPU: i7-6700HQ
- 显卡:gtx965m
- 内存:2133 16G
# 串行
写了两个版本,主要是想对比"额外使用数组(序列)存储随机值"对速度的影响:版本 0 边生成边判断,不保存随机值;版本 1 先把全部随机值存入数组,再统一判断。
# 版本 0
/// Estimates Pi with a serial Monte Carlo simulation that stores no samples.
///
/// Each iteration draws one point (x, y) from the unit square and tests it
/// against the quarter unit circle x^2 + y^2 <= 1; the fraction of hits
/// approximates Pi / 4.
///
/// @param n_points Number of random points to draw.
/// @return 4 * hits / n_points. For n_points == 0 the expression evaluates
///         0.0 / 0.0, which is NaN on IEEE-754 platforms.
double estimate_pi_0(size_t n_points) {
    size_t n_under_curve = 0;  // Number of points fallen under the curve

    // Step 1. Generate n_points random points
    //         & count the number of points fallen under the curve
    // 1.1. Generator initialization (engine seeded from std::random_device)
    std::random_device rd;
    std::default_random_engine engine(rd());
    std::uniform_real_distribution<double> distr(0, 1);

    // 1.2. Random number generation
    // The index has the same type as n_points: an int index would overflow
    // (undefined behaviour) as soon as n_points exceeds INT_MAX.
    for (size_t i = 0; i < n_points; ++i) {
        const double x = distr(engine);
        const double y = distr(engine);
        if (x * x + y * y <= 1.0)
            ++n_under_curve;
    }

    // Step 2. Calculate approximated value of Pi
    return n_under_curve * 4.0 / static_cast<double>(n_points);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# 版本 1
/// Estimates Pi with a serial Monte Carlo simulation that first stores every
/// sample and only then counts the hits.
///
/// This variant exists to measure the cost of keeping all random values in
/// memory: it allocates two vectors of n_points doubles, fills them, and then
/// makes a second pass to test each point against x^2 + y^2 <= 1.
///
/// @param n_points Number of random points to draw (also the length of the
///                 two sample vectors).
/// @return 4 * hits / n_points. For n_points == 0 the expression evaluates
///         0.0 / 0.0, which is NaN on IEEE-754 platforms.
double estimate_pi_1(size_t n_points) {
    size_t n_under_curve = 0;  // Number of points fallen under the curve

    // Allocate storage for random numbers
    std::vector<double> x(n_points);
    std::vector<double> y(n_points);

    // Step 1. Generate n_points random numbers
    // 1.1. Generator initialization (engine seeded from std::random_device)
    std::random_device rd;
    std::default_random_engine engine(rd());
    std::uniform_real_distribution<double> distr(0, 1);

    // 1.2. Random number generation
    // size_t indices match n_points; an int index would overflow (undefined
    // behaviour) as soon as n_points exceeds INT_MAX.
    for (size_t i = 0; i < n_points; ++i) {
        x[i] = distr(engine);
        y[i] = distr(engine);
    }

    // Step 2. Count the number of points fallen under the curve
    for (size_t i = 0; i < n_points; ++i) {
        if (x[i] * x[i] + y[i] * y[i] <= 1.0)
            ++n_under_curve;
    }

    // Step 3. Calculate approximated value of Pi
    return n_under_curve * 4.0 / static_cast<double>(n_points);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# 并行
# OpenMP
/// Estimates Pi with a Monte Carlo simulation parallelised with OpenMP.
///
/// A parallel region with 4 threads is opened; the generator objects are
/// declared inside that region, so every thread constructs its own engine
/// seeded from its own std::random_device. The loop iterations are shared
/// between the threads and the per-thread hit counts are combined with a
/// `+` reduction. When the code is built without OpenMP support the pragmas
/// are ignored and the loop simply runs serially.
///
/// @param n_points Total number of random points to draw.
/// @return 4 * hits / n_points. For n_points == 0 the expression evaluates
///         0.0 / 0.0, which is NaN on IEEE-754 platforms.
double estimate_pi_openmp(size_t n_points) {
    uint64_t n_under_curve = 0;  // Number of points fallen under the curve

#pragma omp parallel num_threads(4)
    {
        // Random number generation (one generator per thread)
        std::random_device rd;
        std::mt19937_64 gen(rd());
        std::uniform_real_distribution<double> dis(0.0, 1.0);

#pragma omp for reduction(+ : n_under_curve)
        for (size_t j = 0; j < n_points; j++) {
            // Declared inside the loop body, so each thread has its own copy
            // without needing a private() clause.
            const double x = dis(gen);
            const double y = dis(gen);
            // Same squared-distance test as the serial versions; comparing
            // squares avoids the square root that hypot() would compute.
            if (x * x + y * y <= 1.0)
                n_under_curve++;
        }
    }

    return 4.0 * n_under_curve / static_cast<double>(n_points);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# with MKL
/// Estimates Pi with a Monte Carlo simulation whose random numbers are
/// produced by the oneMKL RNG interface on a SYCL device.
///
/// Two SYCL buffers of n_points doubles are filled through
/// mkl::rng::generate, then read back through accessors and counted in a
/// plain loop. The name of the selected device is printed to std::cout.
///
/// NOTE(review): the queue is built from cl::sycl::gpu_selector; presumably
/// this fails (throws) on machines without a usable GPU device - confirm
/// against the SYCL runtime in use.
///
/// @param n_points Number of random points to draw (also the length of each
///                 buffer).
/// @return hits / n_points * 4.
double estimate_pi_mkl(size_t n_points) {
    size_t n_under_curve = 0;  // Number of points fallen under the curve

    // Allocate storage for random numbers
    cl::sycl::buffer<double, 1> x_buf(cl::sycl::range<1>{n_points});
    cl::sycl::buffer<double, 1> y_buf(cl::sycl::range<1>{n_points});

    // Choose device to run on and create queue
    cl::sycl::gpu_selector selector;
    cl::sycl::queue queue(selector);
    std::cout << "Running on: " <<
        queue.get_device().get_info<cl::sycl::info::device::name>() << " - ";

    // Step 1. Generate n_points random numbers
    // 1.1. Generator initialization (engine seeded from std::random_device)
    std::random_device SEED;
    mkl::rng::philox4x32x10 engine(queue, SEED());
    // Bounds are written as double literals to match the distribution's
    // value type instead of relying on a float -> double conversion.
    mkl::rng::uniform<double, mkl::rng::uniform_method::standard> distr(0.0, 1.0);

    // 1.2. Random number generation
    mkl::rng::generate(distr, engine, n_points, x_buf);
    mkl::rng::generate(distr, engine, n_points, y_buf);

    // Step 2. Count the number of points fallen under the curve
    // The accessors are requested without a command-group handler and are
    // indexed from this ordinary loop, i.e. the counting is not submitted to
    // the queue as a kernel.
    auto x_acc = x_buf.template get_access<cl::sycl::access::mode::read>();
    auto y_acc = y_buf.template get_access<cl::sycl::access::mode::read>();
    // size_t index matches n_points; an int index would overflow (undefined
    // behaviour) as soon as n_points exceeds INT_MAX.
    for (size_t i = 0; i < n_points; ++i) {
        if (x_acc[i] * x_acc[i] + y_acc[i] * y_acc[i] <= 1.0)
            ++n_under_curve;
    }

    // Step 3. Calculate approximated value of Pi
    return n_under_curve / static_cast<double>(n_points) * 4.0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# 编译及运行
# 编译
dpcpp -fiopenmp -fsycl -DMKL_ILP64 -lmkl_intel_ilp64 \
-lmkl_sequential -lmkl_core -lmkl_sycl -O3 -std=c++14 \
Pi_OpenMP_OneApi.cpp -o Pi_OpenMP_OneApi.out
1
2
3
2
3
- -fiopenmp : 开启 OpenMP
- -O3 : 启用优化
- -fsycl : 启用 DPC++
- -DMKL_ILP64 : 使用 64 位整数类型(可选)
- 链接库 :
- -lmkl_intel_ilp64
- -lmkl_sequential
- -lmkl_core -lmkl_sycl
# 运行
./Pi_OpenMP_OneApi.out <Number>
1
`<Number>` 为模拟点数因子:实际模拟的随机点数为 `Number * 4096 * 128`,其中 4096 和 128 分别为 [此文](Pi_OpenMP_CUDA_OpenACC.md) 中 CUDA 线程块数目和线程数目的预设参数,保持一致便于对比。
# 结果展示
$ ./a.out 1
随机点数:524288
Serial_1 : The simulated value of pi: 3.1423873901 Relative error: 0.025297% Takes 90.841303 ms
Serial_2 : The simulated value of pi: 3.1393356323 Relative error: 0.071843% Takes 94.079305 ms
OpenMP : The simulated value of pi: 3.1412429810 Relative error: 0.011130% Takes 32.634521 ms
Intel Mkl : The simulated value of pi: 3.1422576904 Relative error: 0.021169% Takes 239.586461 ms
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
$ ./a.out 10
随机点数:5242880
Serial_1 : The simulated value of pi: 3.1406837463 Relative error: 0.028931% Takes 909.770236 ms
Serial_2 : The simulated value of pi: 3.1408027649 Relative error: 0.025143% Takes 949.799343 ms
OpenMP : The simulated value of pi: 3.1410621643 Relative error: 0.016886% Takes 246.051302 ms
Intel Mkl : The simulated value of pi: 3.1413108826 Relative error: 0.008969% Takes 276.707375 ms
(base)
1
2
3
4
5
6
7
8
9
2
3
4
5
6
7
8
9
$ ./a.out 100
随机点数:52428800
Serial_1 : The simulated value of pi: 3.1415371704 Relative error: 0.001766% Takes 9159.288490 ms
Serial_2 : The simulated value of pi: 3.1415546417 Relative error: 0.001210% Takes 9863.436424 ms
OpenMP : The simulated value of pi: 3.1416116333 Relative error: 0.000604% Takes 2459.353222 ms
Intel Mkl : The simulated value of pi: 3.1411964417 Relative error: 0.012612% Takes 684.947111 ms
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
# 完整代码
# TODO
- 完善代码
- 添加 MKL 优化 示例
- 添加 USM 示例
- 添加队列示例