赵隐剑老师_并行编程入门实践课程 03¶
- 该笔记对应以下 b 站视频,主要讲在粒子-粒子方法中应用 openmp + mpi
粒子-粒子方法¶
推导以及无量纲化¶
- 库仑定律 (Coulomb's Law)
\[
\vec{F}_{12} = \frac{1}{4 \pi \epsilon_0} \frac{q_1 q_2}{|\vec{r}_{12}|^2} \hat{r}_{12}
\]
其中:
\[
\vec{r}_{12} = \vec{r}_1 - \vec{r}_2, \quad \hat{r}_{12} = \frac{\vec{r}_{12}}{|\vec{r}_{12}|}
\]
\[
\vec{r}_1 = (x_1, y_1, z_1), \quad \vec{r}_2 = (x_2, y_2, z_2)
\]
- 电场与力的关系
\[
F = \vec{E} q \quad \Rightarrow \quad \vec{E}_{12} = \frac{1}{4 \pi \epsilon_0} \frac{q_2}{|\vec{r}_{12}|^2} \hat{r}_{12}
\]
- 多粒子系统 (Many Particles)
对于粒子 \(j\) 对粒子\(i\)的作用:
\[
\vec{E}_{ij} = \frac{1}{4 \pi \epsilon_0} \frac{q_j}{|\vec{r}_{ij}|^2} \hat{r}_{ij}
\]
总场为:
\[
\vec{E}_i = \sum_{\substack{j=1 \\ j \neq i}}^N \vec{E}_{ij} = \frac{1}{4 \pi \epsilon_0} \sum_{\substack{j=1 \\ j \neq i}}^N \frac{q_j}{|\vec{r}_{ij}|^2} \hat{r}_{ij}
\]
- 宏粒子
对于一个模拟粒子系统,假定体积 \(V\),粒子密度为\(n\)
设权重:
\[
w = \frac{nV}{N}
\]
则:
\[
\vec{E}_i = \frac{1}{4 \pi \epsilon_0} \sum_{\substack{j=1 \\ j \neq i}}^N \frac{w q_j}{|\vec{r}_{ij}|^2} \hat{r}_{ij}
\]
令 \(q_j = q\),并定义:
\[
k_e = \frac{w q}{4 \pi \epsilon_0}
\]
因此:
\[
\vec{E}_i = k_e \sum_{\substack{j=1 \\ j \neq i}}^N \frac{\hat{r}_{ij}}{|\vec{r}_{ij}|^2}
\]
伪代码如下:
do \(i = 1, N\)
do \(j = 1, N\)
更新电场:\(\vec{E}_i \;=\; \vec{E}_i \;+\; k_e \,\dfrac{\hat{r}_{ij}}{\lvert\vec{r}_{ij}\rvert^2}\)
end do
end do
\[
\vec{v}_i^{\,n+1}
= \vec{v}_i^{\,n} + \Delta t \cdot \vec{a}_i
\quad,\quad
\vec{a}_i
= \frac{\vec{E}_i \, e w}{m w}
\]
\[
\vec{x}_i^{\,n+1}
= \vec{x}_i^{\,n} + \Delta t \cdot \vec{v}_i^{\,n+1}
\]
P.S.
\[
\frac{d\vec{x}}{dt} = \vec{v},
\quad
\frac{d\vec{v}}{dt} = \vec{a}
\]
-
无量纲化 (Normalization)
-
使数量级 \(\sim 1\),避免过大或过小的数。
- 使一些量 \(=1\),从计算中省去。
SI Base Units:
- Mass = kg
- Charge = C
- Time = s
- Length = m
\(M' = 1,\quad q' = 1,\quad \Delta t' = 1,\quad k_e' = 1\)
- 长度无量纲化推导
\(k_e = \frac{w q}{4 \pi \varepsilon_0}\)
\(k_e' = \frac{w q'}{4 \pi \varepsilon_0} = \frac{w \frac{q}{\varepsilon_0}}{4 \pi \varepsilon_0 / (C^2 T^2 M^{-1} L^{-3})}\)
因此,
\[
1 = k_e' = \frac{w}{4 \pi \varepsilon_0} C^2 T^2 M^{-1} L^{-3}
\]
令:
\[
L^3 = \frac{w C^2 T^2 M^{-1}}{4 \pi \varepsilon_0}
\]
解得:
\[
L = \sqrt[3]{\frac{w C^2 T^2}{4 \pi \varepsilon_0 M}}
\]
- python 代码
import numpy as np
import scipy.constants as sc
ep0 = sc.epsilon_0
pi = sc.pi
e = sc.e
print("ep0, pi, e:")
print(ep0, pi, e)
dt = 1.5e-11 # s
n0 = 5.0e16 # m^-3
dd = 0.1e-3 # m
# Mass in kg
me = sc.m_e
# Domain size
LL = 64 * dd
# Domain volume
VV = LL**3
# Number of particles
N = 50000
# Particle weight
w0 = VV * n0 / N
print("w0:", w0)
# Charge
qe = -e
# 这里把电子温度设为 10 eV,
# 若要表达成“10 eV 对应多少 J”,
# 则 1 eV = sc.e (J),
# 所以 Te = 10 eV => Te (in J) = 10 * sc.e
Te = 10.0 * sc.e # 10 eV in Joules
# 热速度 (thermal velocity)
vte = np.sqrt(Te * sc.Boltzmann / me) # m/s
# 等离子体频率
wpe = np.sqrt(n0 * sc.e**2 / me / sc.epsilon_0)
# 可能是德拜长度 (lambda_D)
lmd = np.sqrt(sc.epsilon_0 * Te * sc.Boltzmann / n0 / sc.e / sc.e)
# 正进行“无量纲化”的定义 (Normalization)
# Charge
C = e
# Mass
M = me
# Time
T = dt
# Length
#(如果想直接取 dd 为长度单位,可改)
#L = dd
# 这里根据截图似乎用了某种立方根关系
L = np.cbrt(w0 * C**2 * T**2 / (4 * pi * ep0 * M))
# Velocity
V = L / T
print("C, M, T, L, V")
print(C, M, T, L, V)
# 归一化后的 eps0
ep0_ = ep0 / (C**2 * T * M / (L**3))
print("ep0_ =", ep0_)
# A^2·s^4·kg^-1·m^-3
# = C^2·s^2·kg^-2·m^-3
# 归一化后的电荷
e_ = e / C
print("e_ =", e_)
# 归一化后的 k_e
ke_ = w0 * e_ / (4 * pi * ep0_)
print("ke_ =", ke_)
print("vte =", vte / V)
print("n0 =", n0 * (L**3))
print("lmd =", lmd / L)
print("LL =", LL / L)
fortran code¶
- 子程序
subroutine sub_ppl(n, p1, p2, p, c, mpi_n)
use mpi
use omp_lib
implicit none
integer :: n, p1, p2
real, dimension(1:9, 1:n) :: p ! 9 x n array
real :: c
integer :: mpi_n
integer :: i, j
real :: r(1:3), rm
! Begin OpenMP parallel region
!$omp parallel default(firstprivate) shared(p)
!$omp do
do i = p1, p2
do j = 1, n
if (j == i) cycle
! Compute the 3D difference vector r = p(1:3,i) - p(1:3,j)
r(1:3) = p(1:3, i) - p(1:3, j)
! Calculate the distance rm = sqrt(r(1)^2 + r(2)^2 + r(3)^2)
rm = sqrt(r(1)*r(1) + r(2)*r(2) + r(3)*r(3))
! If distance <= c, skip
if (rm <= c) cycle
! Update p(7:9, i) by adding r/rm^3
p(7:9, i) = p(7:9, i) + r(1:3) / (rm*rm*rm)
end do
! Update p(4:6,i) by adding p(7:9,i)
p(4:6, i) = p(4:6, i) + p(7:9, i)
! Update p(1:3,i) by adding p(4:6,i)
p(1:3, i) = p(1:3, i) + p(4:6, i)
end do
!$omp end do
! If running in MPI with more than 1 process, zero out the edges
if (mpi_n > 1) then
!$omp workshare
p(1:9, 1:p1-1) = 0.0
p(1:9, p2+1:n) = 0.0
!$omp end workshare
end if
!$omp end parallel
end subroutine sub_ppl
- 主程序
! File: main.f90
#include "mod_pp.f90"
program main
use mpi
use omp_lib
use mod_pp ! Module that presumably contains sub_ppl or related definitions
implicit none
integer :: n, np
real, dimension(:,:), allocatable :: p, pb
real :: t, c, l, vt
integer :: ierr, mpi_i, mpi_n
integer :: p1, p2, i, j
! Initialize MPI
call mpi_init(ierr)
call mpi_comm_rank(mpi_comm_world, mpi_i, ierr)
call mpi_comm_size(mpi_comm_world, mpi_n, ierr)
! Number of particles
n = 50000
! Cutoff length
c = 4.2687653380048194
! Initial domain length
l = 259.8651820816571
! Thermal velocity
vt = 0.8077371891833447
! Allocate and initialize arrays
allocate(p(1:9, 1:n)); p = 0.0
allocate(pb(1:9, 1:n)); pb = 0.0
! On rank 0 only, generate some random initial conditions
if (mpi_i == 0) then
call random_number(p(1:6, :)) ! Fill rows 1..6 of p with random values
p(1:3, :) = p(1:3, :)*l ! Scale positions by domain length
p(4:6, :) = p(4:6, :)-0.5*2.0*vt ! Shift velocities by ±(0.5 * 2.0 * vt)
end if
! Broadcast p from rank 0 to all other ranks
call mpi_bcast(p, size(p), mpi_double_precision, 0, mpi_comm_world, ierr)
! Split the range of particles among MPI ranks
np = int(real(n) / real(mpi_n))
p1 = mpi_i*np + 1
p2 = p1 + np - 1
if (mpi_i == mpi_n - 1) then
p2 = n
end if
! Synchronize all ranks, then start timing
call mpi_barrier(mpi_comm_world, ierr)
t = omp_get_wtime()
! Zero out the 7..9 rows of the p array
p(7:9, :) = 0.0
! Call the subroutine that does the main work
call sub_ppl(n, p1, p2, p, c, mpi_n)
! If there is more than one MPI rank, do an Allreduce to combine results into pb
if (mpi_n > 1) then
call mpi_allreduce(p, pb, size(p), mpi_double_precision, mpi_sum, mpi_comm_world, ierr)
p = pb
end if
! Stop timing
call mpi_barrier(mpi_comm_world, ierr)
t = omp_get_wtime() - t
! Print time from rank 0
if (mpi_i == 0) write(*,*) t
! Finalize MPI
call mpi_finalize(ierr)
end program main
cpp code¶
#include <iostream>
#include <vector>
#include <array>
#include <cmath>
#ifdef _OPENMP
#include <omp.h>
#endif
/**
* @brief Modern C++ version of the Fortran subroutine sub_ppl.
*
* @param n 粒子数。The size of the second dimension of p (number of columns).
* @param p1 Starting index (1-based in Fortran).
* @param p2 Ending index (1-based in Fortran).
* @param p 9 x n data array (in C++, stored as a vector of 9-element arrays).
* 1~3: x,y,z 4~6: vx,vy,vz, 7~9: ax, ay, az
* @param c The cutoff distance. 粒子之间太接近就忽略作用力。
* @param mpi_n The number of MPI processes (used to conditionally reset certain data).
*/
void sub_ppl(int n,
int p1,
int p2,
std::vector<std::array<double, 9>>& p,
double c,
int mpi_n)
{
// Fortran uses 1-based indexing. In C++, it's 0-based. So we map:
// Fortran i in [p1, p2] -> C++ i in [p1-1, p2-1]
// Fortran j in [1, n] -> C++ j in [0, n-1]
//
// p[k][0..8] in C++ corresponds to p(1..9,k+1) in Fortran (just reversed dimension order).
// So p[i][0] = p(1, i+1), p[i][1] = p(2, i+1), ... p[i][8] = p(9, i+1).
// OpenMP parallel region (equivalent to !$omp parallel ...)
#pragma omp parallel default(none) \
shared(p, n, p1, p2, c, mpi_n) \
firstprivate(/* no large copies, just scalar captures here */)
{
// Parallel for (equivalent to !$omp do in Fortran)
#pragma omp for
for (int i = p1 - 1; i < p2; ++i)
{
// For each i, loop over j from 0 to n-1
for (int j = 0; j < n; ++j)
{
// Skip if j == i (matching "if (j == i) cycle")
if (j == i)
continue;
// r(1:3) = p(1:3, i) - p(1:3, j)
// We'll store this difference in a small local array r[0..2].
std::array<double, 3> r {
p[i][0] - p[j][0], // x-component
p[i][1] - p[j][1], // y-component
p[i][2] - p[j][2] // z-component
};
// rm = sqrt(r(1)^2 + r(2)^2 + r(3)^2)
double rm = std::sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);
// if (rm <= c) cycle
if (rm <= c)
continue;
// p(7:9, i) = p(7:9, i) + r(1:3) / (rm^3)
// In C++: p[i][6..8] += r[0..2]/(rm^3)
double inv_rm3 = 1.0 / (rm * rm * rm);
p[i][6] += r[0] * inv_rm3;
p[i][7] += r[1] * inv_rm3;
p[i][8] += r[2] * inv_rm3;
}
// p(4:6, i) = p(4:6, i) + p(7:9, i)
// So p[i][3..5] += p[i][6..8]
p[i][3] += p[i][6];
p[i][4] += p[i][7];
p[i][5] += p[i][8];
// p(1:3, i) = p(1:3, i) + p(4:6, i)
// So p[i][0..2] += p[i][3..5]
p[i][0] += p[i][3];
p[i][1] += p[i][4];
p[i][2] += p[i][5];
}
// if (mpi_n > 1) then
// p(1:9, 1:p1-1) = 0.0
// p(1:9, p2+1:n) = 0.0
// end if
if (mpi_n > 1)
{
// We can do two separate parallel loops if desired:
#pragma omp for
for (int idx = 0; idx < p1 - 1; ++idx)
{
for (int row = 0; row < 9; ++row)
{
p[idx][row] = 0.0;
}
}
#pragma omp for
for (int idx = p2; idx < n; ++idx)
{
for (int row = 0; row < 9; ++row)
{
p[idx][row] = 0.0;
}
}
}
} // end of parallel region
}
- 主程序
#include <mpi.h> // For MPI
#include <omp.h> // For omp_get_wtime (optional)
#include <cmath>
#include <vector>
#include <array>
#include <random>
#include <iostream>
#include <algorithm>
// Forward declaration of our modern C++ sub_ppl function (logic equivalent to Fortran).
// For demonstration, let's assume it takes the same parameters as the Fortran subroutine.
void sub_ppl(int n, int p1, int p2,
std::vector<std::array<double, 9>>& p,
double c, int mpi_n);
// Helper function for convenience: number of elements in 2D layout [9 x n].
static inline int totalSize2D(int rows, int cols) {
return rows * cols;
}
int main(int argc, char* argv[])
{
// -------------------------------------------------------------------------
// MPI initialization
// -------------------------------------------------------------------------
MPI_Init(&argc, &argv);
int mpi_i = 0; // rank (MPI_i in Fortran)
int mpi_n = 1; // size (MPI_n in Fortran)
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_i);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_n);
// -------------------------------------------------------------------------
// Problem parameters
// -------------------------------------------------------------------------
int n = 50000; // Number of particles
double c = 4.2687653380048194; // Cutoff distance
double l = 259.8651820816571; // Domain length
double vt = 0.8077371891833447; // Thermal velocity
// We'll store p and pb as vectors of length n, each containing 9 doubles:
// p[i][0..8] corresponds to Fortran p(1..9, i+1)
// Indices: 0-based in C++ vs. 1-based in Fortran.
std::vector<std::array<double, 9>> p(n), pb(n);
// Initialize them all to zero
for (int i = 0; i < n; ++i) {
std::fill(p[i].begin(), p[i].end(), 0.0);
std::fill(pb[i].begin(), pb[i].end(), 0.0);
}
// -------------------------------------------------------------------------
// Random init on rank 0 only
// (Equivalent to "call random_number(p(1:6,:))" and subsequent scaling)
// -------------------------------------------------------------------------
if (mpi_i == 0)
{
// We only fill rows 1..6 in Fortran => indices 0..5 in C++
// We'll fill p[i][0..5] with random values
std::mt19937 rng(12345); // Fixed seed for reproducibility
std::uniform_real_distribution<double> dist(0.0, 1.0);
for (int i = 0; i < n; ++i) {
// p(1:3,:) = random * l
// -> p[i][0..2] = random * l
for (int row = 0; row < 3; ++row) {
p[i][row] = dist(rng) * l;
}
// p(4:6,:) = random - 0.5*2.0*vt => random => p[i][3..5]
// Then subtract 0.5 * 2 * vt
for (int row = 3; row < 6; ++row) {
p[i][row] = dist(rng);
// Subtract (0.5 * 2.0 * vt) = vt
p[i][row] -= vt;
}
// p[i][6..8] remain 0.0 at this point
}
}
// -------------------------------------------------------------------------
// Broadcast p from rank 0 to all others
// (Equivalent to call mpi_bcast(p, size(p), ...))
// We must pass the raw pointer & size in doubles. Each row has 9 doubles, total n*9.
// -------------------------------------------------------------------------
MPI_Bcast(p.data(),
totalSize2D(9, n), // 9 * n
MPI_DOUBLE,
0,
MPI_COMM_WORLD);
// -------------------------------------------------------------------------
// Determine local range [p1..p2] for each rank
// (Equivalent to np = int(real(n)/real(mpi_n)), etc.)
// Fortran uses 1-based indexing, so adjust carefully in C++.
// -------------------------------------------------------------------------
int np = static_cast<int>( static_cast<double>(n) / static_cast<double>(mpi_n) );
int p1 = mpi_i * np; // C++ 0-based start
int p2 = p1 + np - 1; // inclusive end
if (mpi_i == mpi_n - 1) {
p2 = n - 1; // last rank covers up to n-1
}
// -------------------------------------------------------------------------
// Synchronize ranks, measure time
// -------------------------------------------------------------------------
MPI_Barrier(MPI_COMM_WORLD);
double t_start = omp_get_wtime(); // or use std::chrono if desired
// -------------------------------------------------------------------------
// Zero out columns 7..9 => p(*, 6..8) in C++
// (Equivalent to p(7:9,:) = 0.0 in Fortran)
// -------------------------------------------------------------------------
for (int i = 0; i < n; ++i) {
p[i][6] = 0.0;
p[i][7] = 0.0;
p[i][8] = 0.0;
}
// -------------------------------------------------------------------------
// Call our "sub_ppl" routine
// (Equivalent to call sub_ppl(n, p1, p2, p, c, mpi_n))
// -------------------------------------------------------------------------
sub_ppl(n, p1, p2, p, c, mpi_n);
// -------------------------------------------------------------------------
// If we have multiple ranks, do an MPI_Allreduce to combine partial results
// (Equivalent to if(mpi_n>1) then ... call mpi_allreduce ...)
// -------------------------------------------------------------------------
if (mpi_n > 1)
{
// pb = sum of p across all ranks
// We'll do an Allreduce of (n*9) doubles from p into pb
MPI_Allreduce(p.data(),
pb.data(),
totalSize2D(9, n),
MPI_DOUBLE,
MPI_SUM,
MPI_COMM_WORLD);
// Then p = pb
p = pb;
}
// -------------------------------------------------------------------------
// Stop timing
// -------------------------------------------------------------------------
MPI_Barrier(MPI_COMM_WORLD);
double t_elapsed = omp_get_wtime() - t_start;
// Print time from rank 0
if (mpi_i == 0) {
std::cout << t_elapsed << std::endl;
}
// Finalize MPI
MPI_Finalize();
return 0;
}