使用 oneAPI 并行计算 PI 的值

jokervTv 2020/8/31 OpenMP CUDA MPI

环境配置
串行
- 版本 0
- 版本 1
并行
- OpenMP
- with MKL
编译及运行
- 编译
- 运行
结果展示
完整代码
TODO

继上篇文章简单利用 OpenMP 和 CUDA 计算了 PI 的值之后，现在利用 oneAPI 再试一次，大体思路保持一致。

oneAPI 部分的代码源于 Intel® 官方文章《Intel® oneAPI Math Kernel Library Data Parallel C++ Usage Models (on the Example of Monte Carlo Simulation)》，请知悉。

# 环境配置

oneapi 2021.1-beta08、gcc 10
manjaro 20
CPU: i7-6700HQ
显卡：gtx965m
内存：2133 16G

# 串行

写了两个版本，主要是想对比：额外使用数组先把随机数全部存储下来（版本 1），与边生成边判断（版本 0）相比，对速度有多大影响。

# 版本 0

/**
 * Serial Monte Carlo estimate of Pi (version 0: no intermediate storage).
 *
 * Draws n_points pairs (x, y) from a uniform distribution over [0, 1) and
 * counts how many satisfy x^2 + y^2 <= 1. The hit ratio multiplied by 4 is
 * returned as the estimate.
 *
 * @param n_points Number of random points to sample.
 * @return Estimated value of Pi. For n_points == 0 the division is 0/0 and
 *         the result is NaN.
 */
double estimate_pi_0(size_t n_points) {
    size_t n_under_curve = 0; // Number of points fallen under the curve

    // Step 1. Generate n_points random points
    //  & count the number of points fallen under the curve
    // 1.1. Generator initialization
    std::random_device rd;
    std::default_random_engine engine(rd());
    std::uniform_real_distribution<double> distr(0, 1);
    // 1.2. Random number generation
    // The index is size_t to match n_points: an int index is compared
    // signed-vs-unsigned and overflows once n_points exceeds INT_MAX.
    for (size_t i = 0; i < n_points; i++)
    {
        const double x = distr(engine);
        const double y = distr(engine);
        if (x * x + y * y <= 1.0)
            n_under_curve++;
    }

    // Step 2. Calculate approximated value of Pi
    return n_under_curve * 4.0 / static_cast<double>(n_points);
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

# 版本 1

/**
 * Serial Monte Carlo estimate of Pi (version 1: samples stored first).
 *
 * Same estimator as the storage-free version, but all coordinates are first
 * written into two vectors of n_points doubles each and only then counted in
 * a second pass, so the cost of the extra storage can be measured.
 *
 * @param n_points Number of random points to sample; two vectors of this
 *                 length are allocated.
 * @return Estimated value of Pi. For n_points == 0 the division is 0/0 and
 *         the result is NaN.
 */
double estimate_pi_1(size_t n_points) {
    size_t n_under_curve = 0; // Number of points fallen under the curve

    // Allocate storage for random numbers
    std::vector<double> x(n_points);
    std::vector<double> y(n_points);

    // Step 1. Generate n_points random numbers
    // 1.1. Generator initialization
    std::random_device rd;
    std::default_random_engine engine(rd());
    std::uniform_real_distribution<double> distr(0, 1);
    // 1.2. Random number generation
    // Indices are size_t to match n_points: an int index is compared
    // signed-vs-unsigned and overflows once n_points exceeds INT_MAX.
    for (size_t i = 0; i < n_points; i++) {
        x[i] = distr(engine);
        y[i] = distr(engine);
    }

    // Step 2. Count the number of points fallen under the curve
    for (size_t i = 0; i < n_points; i++) {
        if (x[i] * x[i] + y[i] * y[i] <= 1.0)
            n_under_curve++;
    }

    // Step 3. Calculate approximated value of Pi
    return n_under_curve * 4.0 / static_cast<double>(n_points);
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30

# 并行

# OpenMP

/**
 * OpenMP Monte Carlo estimate of Pi.
 *
 * The sampling loop is shared among the threads of a parallel region that
 * requests 4 threads; the per-thread hit counts are combined with a
 * `+` reduction.
 *
 * @param n_points Number of random points to sample in total.
 * @return 4 * hits / n_points (NaN when n_points == 0, since that is 0/0).
 */
double estimate_pi_openmp(size_t n_points) {

	uint64_t hits = 0; // Points whose distance from the origin is below 1

#pragma omp parallel num_threads(4)
	{
		// Random number generation: declared inside the parallel region,
		// so every thread constructs and seeds its own engine.
		std::random_device seed_source;
		std::mt19937_64 rng(seed_source());
		std::uniform_real_distribution<double> unit(0.0, 1.0);

#pragma omp for reduction(+:hits)
		for (size_t k = 0; k < n_points; k++)
		{
			// Loop-local coordinates are private to each iteration.
			const double px = unit(rng);
			const double py = unit(rng);
			if (!(hypot(px, py) < 1.0))
				continue;
			hits++;
		}
	}

	const double total = (double)n_points;
	return 4.0 * hits / total;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

# with MKL

/**
 * Monte Carlo estimate of Pi using the oneMKL random number generator.
 *
 * Two SYCL buffers of n_points doubles are filled by mkl::rng::generate on a
 * queue created with a GPU selector; the points are then counted in an
 * ordinary loop that reads the buffers through read-only accessors.
 *
 * Side effect: prints "Running on: <device name> - " to std::cout.
 *
 * @param n_points Number of random points to sample; also the length of
 *                 each of the two buffers.
 * @return Estimated value of Pi.
 *
 * NOTE(review): presumably queue construction fails when no GPU device is
 * available, and n_points == 0 may not be a valid buffer range — confirm
 * against the SYCL implementation in use.
 */
double estimate_pi_mkl(size_t n_points) {
    size_t n_under_curve = 0;    // Number of points fallen under the curve

    // Allocate storage for random numbers
    cl::sycl::buffer<double, 1> x_buf(cl::sycl::range<1>{n_points});
    cl::sycl::buffer<double, 1> y_buf(cl::sycl::range<1>{n_points});

    // Choose device to run on and create queue
    cl::sycl::gpu_selector selector;
    cl::sycl::queue queue(selector);

    std::cout << "Running on: " <<
        queue.get_device().get_info<cl::sycl::info::device::name>()  << " - ";
    // Step 1. Generate n_points random numbers
    // 1.1. Generator initialization
    std::random_device SEED;
    mkl::rng::philox4x32x10 engine(queue, SEED());
    // Bounds are written as double literals to match the distribution's
    // value type (same values as the former 0.0f / 1.0f).
    mkl::rng::uniform<double, mkl::rng::uniform_method::standard> distr(0.0, 1.0);

    // 1.2. Random number generation
    mkl::rng::generate(distr, engine, n_points, x_buf);
    mkl::rng::generate(distr, engine, n_points, y_buf);

    // Step 2. Count the number of points fallen under the curve
    // NOTE(review): presumably creating these accessors waits for the
    // generate calls above to finish — confirm against the SYCL spec.
    auto x_acc = x_buf.template get_access<cl::sycl::access::mode::read>();
    auto y_acc = y_buf.template get_access<cl::sycl::access::mode::read>();
    // The index is size_t to match n_points: an int index is compared
    // signed-vs-unsigned and overflows once n_points exceeds INT_MAX.
    for (size_t i = 0; i < n_points; i++) {
        if (x_acc[i] * x_acc[i] + y_acc[i] * y_acc[i] <= 1.0)
            n_under_curve++;
    }

    // Step 3. Calculate approximated value of Pi
    return n_under_curve / static_cast<double>(n_points) * 4.0;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

# 编译及运行

# 编译

dpcpp -fiopenmp -fsycl -DMKL_ILP64 -lmkl_intel_ilp64 \
      -lmkl_sequential -lmkl_core -lmkl_sycl -O3 -std=c++14 \
      Pi_OpenMP_OneApi.cpp -o Pi_OpenMP_OneApi.out

1
2
3

-fiopenmp : 开启 OpenMP
-O3 : 启用优化
-fsycl : 启用 DPC++
-DMKL_ILP64 : 使用 64 位整数类型（可选）
链接库 :
- -lmkl_intel_ilp64
- -lmkl_sequential
- -lmkl_core -lmkl_sycl

# 运行

./Pi_OpenMP_OneApi.out <Number>

`<Number>` 为模拟点数因子：实际模拟的随机点数为 `Number * 4096 * 128`，其中 4096 和 128 分别为 [此文](Pi_OpenMP_CUDA_OpenACC.md) 中 CUDA 线程块数目和每块线程数目的预设参数，这里保持一致以便于对比。

# 结果展示

$ ./a.out 1
           
随机点数：524288

Serial_1        : The simulated value of pi: 3.1423873901  Relative error: 0.025297%  Takes 90.841303 ms
Serial_2        : The simulated value of pi: 3.1393356323  Relative error: 0.071843%  Takes 94.079305 ms
OpenMP          : The simulated value of pi: 3.1412429810  Relative error: 0.011130%  Takes 32.634521 ms
Intel Mkl       : The simulated value of pi: 3.1422576904  Relative error: 0.021169%  Takes 239.586461 ms

1
2
3
4
5
6
7
8

$ ./a.out 10 

随机点数：5242880

Serial_1        : The simulated value of pi: 3.1406837463  Relative error: 0.028931%  Takes 909.770236 ms
Serial_2        : The simulated value of pi: 3.1408027649  Relative error: 0.025143%  Takes 949.799343 ms
OpenMP          : The simulated value of pi: 3.1410621643  Relative error: 0.016886%  Takes 246.051302 ms
Intel Mkl       : The simulated value of pi: 3.1413108826  Relative error: 0.008969%  Takes 276.707375 ms
(base)

1
2
3
4
5
6
7
8
9

$ ./a.out 100

随机点数：52428800

Serial_1        : The simulated value of pi: 3.1415371704  Relative error: 0.001766%  Takes 9159.288490 ms
Serial_2        : The simulated value of pi: 3.1415546417  Relative error: 0.001210%  Takes 9863.436424 ms
OpenMP          : The simulated value of pi: 3.1416116333  Relative error: 0.000604%  Takes 2459.353222 ms
Intel Mkl       : The simulated value of pi: 3.1411964417  Relative error: 0.012612%  Takes 684.947111 ms

1
2
3
4
5
6
7
8

# 完整代码

点此下载

# TODO

完善代码
添加 MKL 优化示例
添加 USM 示例
添加队列示例

jokervTv 爬坑历程

Choose mode