赵隐剑老师_并行编程入门实践课程 03¶

该笔记对应以下 b 站视频，主要讲在粒子-粒子方法中应用 openmp + mpi
- 并行编程入门与实践5

赵隐剑老师_并行编程入门实践课程 03
- 粒子-粒子方法

粒子-粒子方法¶

推导以及无量纲化¶

库仑定律 (Coulomb's Law)

\[ \vec{F}_{12} = \frac{1}{4 \pi \epsilon_0} \frac{q_1 q_2}{|\vec{r}_{12}|^2} \hat{r}_{12} \]

其中：

\[ \vec{r}_{12} = \vec{r}_1 - \vec{r}_2, \quad \hat{r}_{12} = \frac{\vec{r}_{12}}{|\vec{r}_{12}|} \]

\[ \vec{r}_1 = (x_1, y_1, z_1), \quad \vec{r}_2 = (x_2, y_2, z_2) \]

电场与力的关系

\[ F = \vec{E} q \quad \Rightarrow \quad \vec{E}_{12} = \frac{1}{4 \pi \epsilon_0} \frac{q_2}{|\vec{r}_{12}|^2} \hat{r}_{12} \]

多粒子系统 (Many Particles)

对于粒子 \(j\) 对粒子\(i\)的作用：

\[ \vec{E}_{ij} = \frac{1}{4 \pi \epsilon_0} \frac{q_j}{|\vec{r}_{ij}|^2} \hat{r}_{ij} \]

总场为：

\[ \vec{E}_i = \sum_{\substack{j=1 \\ j \neq i}}^N \vec{E}_{ij} = \frac{1}{4 \pi \epsilon_0} \sum_{\substack{j=1 \\ j \neq i}}^N \frac{q_j}{|\vec{r}_{ij}|^2} \hat{r}_{ij} \]

宏粒子

对于一个模拟粒子系统，假定体积 \(V\)，粒子密度为\(n\)

设权重：

\[ w = \frac{nV}{N} \]

则：

\[ \vec{E}_i = \frac{1}{4 \pi \epsilon_0} \sum_{\substack{j=1 \\ j \neq i}}^N \frac{w q_j}{|\vec{r}_{ij}|^2} \hat{r}_{ij} \]

令 \(q_j = q\)，并定义：

\[ k_e = \frac{w q}{4 \pi \epsilon_0} \]

因此：

\[ \vec{E}_i = k_e \sum_{\substack{j=1 \\ j \neq i}}^N \frac{\hat{r}_{ij}}{|\vec{r}_{ij}|^2} \]

伪代码如下：

do \(i = 1, N\)
do \(j = 1, N\)
更新电场：\(\vec{E}_i \;=\; \vec{E}_i \;+\; k_e \,\dfrac{\hat{r}_{ij}}{\lvert\vec{r}_{ij}\rvert^2}\)
end do
end do

\[ \vec{v}_i^{\,n+1} = \vec{v}_i^{\,n} + \Delta t \cdot \vec{a}_i \quad,\quad \vec{a}_i = \frac{\vec{E}_i \, e w}{m w} \]

\[ \vec{x}_i^{\,n+1} = \vec{x}_i^{\,n} + \Delta t \cdot \vec{v}_i^{\,n+1} \]

P.S.

\[ \frac{d\vec{x}}{dt} = \vec{v}, \quad \frac{d\vec{v}}{dt} = \vec{a} \]

无量纲化 (Normalization)
使数量级 \(\sim 1\)，避免过大或过小的数。
使一些量 \(=1\)，从计算中省去。

SI Base Units:
- Mass = kg
- Charge = C
- Time = s
- Length = m

\(M' = 1,\quad q' = 1,\quad \Delta t' = 1,\quad k_e' = 1\)

长度无量纲化推导

\(k_e = \frac{w q}{4 \pi \varepsilon_0}\)

\(k_e' = \frac{w q'}{4 \pi \varepsilon_0} = \frac{w \frac{q}{\varepsilon_0}}{4 \pi \varepsilon_0 / (C^2 T^2 M^{-1} L^{-3})}\)

因此，

\[ 1 = k_e' = \frac{w}{4 \pi \varepsilon_0} C^2 T^2 M^{-1} L^{-3} \]

令：

\[ L^3 = \frac{w C^2 T^2 M^{-1}}{4 \pi \varepsilon_0} \]

解得：

\[ L = \sqrt[3]{\frac{w C^2 T^2}{4 \pi \varepsilon_0 M}} \]

python 代码

import numpy as np
import scipy.constants as sc

ep0 = sc.epsilon_0
pi  = sc.pi
e   = sc.e

print("ep0, pi, e:")
print(ep0, pi, e)

dt = 1.5e-11   # s
n0 = 5.0e16    # m^-3
dd = 0.1e-3    # m

# Mass in kg
me = sc.m_e

# Domain size
LL = 64 * dd

# Domain volume
VV = LL**3

# Number of particles
N = 50000

# Particle weight
w0 = VV * n0 / N
print("w0:", w0)

# Charge
qe = -e

# 这里把电子温度设为 10 eV，
# 若要表达成“10 eV 对应多少 J”，
# 则 1 eV = sc.e (J)，
# 所以 Te = 10 eV => Te (in J) = 10 * sc.e
Te = 10.0 * sc.e   # 10 eV in Joules

# 热速度 (thermal velocity)
vte = np.sqrt(Te * sc.Boltzmann / me)  # m/s
# 等离子体频率
wpe = np.sqrt(n0 * sc.e**2 / me / sc.epsilon_0)

# 可能是德拜长度 (lambda_D)
lmd = np.sqrt(sc.epsilon_0 * Te * sc.Boltzmann / n0 / sc.e / sc.e)

# 正进行“无量纲化”的定义 (Normalization)
# Charge
C = e
# Mass
M = me
# Time
T = dt
# Length
#(如果想直接取 dd 为长度单位，可改)
#L = dd
# 这里根据截图似乎用了某种立方根关系
L = np.cbrt(w0 * C**2 * T**2 / (4 * pi * ep0 * M))
# Velocity
V = L / T

print("C, M, T, L, V")
print(C, M, T, L, V)

# 归一化后的 eps0
ep0_ = ep0 / (C**2 * T * M / (L**3))
print("ep0_ =", ep0_)
#   A^2·s^4·kg^-1·m^-3
# = C^2·s^2·kg^-2·m^-3

# 归一化后的电荷
e_ = e / C
print("e_ =", e_)

# 归一化后的 k_e
ke_ = w0 * e_ / (4 * pi * ep0_)
print("ke_ =", ke_)

print("vte =", vte / V)
print("n0  =", n0 * (L**3))
print("lmd =", lmd / L)
print("LL  =", LL / L)

fortran code¶

子程序

subroutine sub_ppl(n, p1, p2, p, c, mpi_n)
  use mpi
  use omp_lib

  implicit none

  integer          :: n, p1, p2
  real, dimension(1:9, 1:n) :: p  ! 9 x n array
  real             :: c
  integer          :: mpi_n
  integer          :: i, j
  real             :: r(1:3), rm

  ! Begin OpenMP parallel region
  !$omp parallel default(firstprivate) shared(p)
  !$omp do
  do i = p1, p2
     do j = 1, n
        if (j == i) cycle

        ! Compute the 3D difference vector r = p(1:3,i) - p(1:3,j)
        r(1:3) = p(1:3, i) - p(1:3, j)

        ! Calculate the distance rm = sqrt(r(1)^2 + r(2)^2 + r(3)^2)
        rm = sqrt(r(1)*r(1) + r(2)*r(2) + r(3)*r(3))

        ! If distance <= c, skip
        if (rm <= c) cycle

        ! Update p(7:9, i) by adding r/rm^3
        p(7:9, i) = p(7:9, i) + r(1:3) / (rm*rm*rm)
     end do

     ! Update p(4:6,i) by adding p(7:9,i)
     p(4:6, i) = p(4:6, i) + p(7:9, i)

     ! Update p(1:3,i) by adding p(4:6,i)
     p(1:3, i) = p(1:3, i) + p(4:6, i)
  end do
  !$omp end do

  ! If running in MPI with more than 1 process, zero out the edges
  if (mpi_n > 1) then
    !$omp workshare
    p(1:9, 1:p1-1) = 0.0
    p(1:9, p2+1:n) = 0.0
    !$omp end workshare
  end if

  !$omp end parallel
end subroutine sub_ppl

主程序

! File: main.f90

#include "mod_pp.f90"

program main
  use mpi
  use omp_lib
  use mod_pp  ! Module that presumably contains sub_ppl or related definitions

  implicit none

  integer :: n, np
  real, dimension(:,:), allocatable :: p, pb
  real :: t, c, l, vt
  integer :: ierr, mpi_i, mpi_n
  integer :: p1, p2, i, j

  ! Initialize MPI
  call mpi_init(ierr)
  call mpi_comm_rank(mpi_comm_world, mpi_i, ierr)
  call mpi_comm_size(mpi_comm_world, mpi_n, ierr)

  ! Number of particles
  n = 50000

  ! Cutoff length
  c = 4.2687653380048194

  ! Initial domain length
  l = 259.8651820816571

  ! Thermal velocity
  vt = 0.8077371891833447

  ! Allocate and initialize arrays
  allocate(p(1:9, 1:n));  p = 0.0
  allocate(pb(1:9, 1:n)); pb = 0.0

  ! On rank 0 only, generate some random initial conditions
  if (mpi_i == 0) then
    call random_number(p(1:6, :))        ! Fill rows 1..6 of p with random values
    p(1:3, :) = p(1:3, :)*l             ! Scale positions by domain length
    p(4:6, :) = p(4:6, :)-0.5*2.0*vt    ! Shift velocities by ±(0.5 * 2.0 * vt)
  end if

  ! Broadcast p from rank 0 to all other ranks
  call mpi_bcast(p, size(p), mpi_double_precision, 0, mpi_comm_world, ierr)

  ! Split the range of particles among MPI ranks
  np = int(real(n) / real(mpi_n))
  p1 = mpi_i*np + 1
  p2 = p1 + np - 1
  if (mpi_i == mpi_n - 1) then
    p2 = n
  end if

  ! Synchronize all ranks, then start timing
  call mpi_barrier(mpi_comm_world, ierr)
  t = omp_get_wtime()

  ! Zero out the 7..9 rows of the p array
  p(7:9, :) = 0.0

  ! Call the subroutine that does the main work
  call sub_ppl(n, p1, p2, p, c, mpi_n)

  ! If there is more than one MPI rank, do an Allreduce to combine results into pb
  if (mpi_n > 1) then
    call mpi_allreduce(p, pb, size(p), mpi_double_precision, mpi_sum, mpi_comm_world, ierr)
    p = pb
  end if

  ! Stop timing
  call mpi_barrier(mpi_comm_world, ierr)
  t = omp_get_wtime() - t

  ! Print time from rank 0
  if (mpi_i == 0) write(*,*) t

  ! Finalize MPI
  call mpi_finalize(ierr)

end program main

cpp code¶

#include <iostream>
#include <vector>
#include <array>
#include <cmath>
#ifdef _OPENMP
  #include <omp.h>
#endif

/**
 * @brief Modern C++ version of the Fortran subroutine sub_ppl.
 *
 * @param n      粒子数。The size of the second dimension of p (number of columns).
 * @param p1     Starting index (1-based in Fortran).
 * @param p2     Ending index (1-based in Fortran).
 * @param p      9 x n data array (in C++, stored as a vector of 9-element arrays).
         * 1~3: x,y,z 4~6: vx,vy,vz, 7~9: ax, ay, az
 * @param c      The cutoff distance. 粒子之间太接近就忽略作用力。
 * @param mpi_n  The number of MPI processes (used to conditionally reset certain data).
 */
void sub_ppl(int n,
             int p1,
             int p2,
             std::vector<std::array<double, 9>>& p,
             double c,
             int mpi_n)
{
    // Fortran uses 1-based indexing. In C++, it's 0-based. So we map:
    //   Fortran i in [p1, p2]  -> C++ i in [p1-1, p2-1]
    //   Fortran j in [1, n]    -> C++ j in [0, n-1]
    //
    // p[k][0..8] in C++ corresponds to p(1..9,k+1) in Fortran (just reversed dimension order).
    // So p[i][0] = p(1, i+1), p[i][1] = p(2, i+1), ... p[i][8] = p(9, i+1).

    // OpenMP parallel region (equivalent to !$omp parallel ...)
    #pragma omp parallel default(none) \
                         shared(p, n, p1, p2, c, mpi_n) \
                         firstprivate(/* no large copies, just scalar captures here */)
    {
        // Parallel for (equivalent to !$omp do in Fortran)
        #pragma omp for
        for (int i = p1 - 1; i < p2; ++i)
        {
            // For each i, loop over j from 0 to n-1
            for (int j = 0; j < n; ++j)
            {
                // Skip if j == i (matching "if (j == i) cycle")
                if (j == i) 
                    continue;

                // r(1:3) = p(1:3, i) - p(1:3, j)
                // We'll store this difference in a small local array r[0..2].
                std::array<double, 3> r {
                    p[i][0] - p[j][0],  // x-component
                    p[i][1] - p[j][1],  // y-component
                    p[i][2] - p[j][2]   // z-component
                };

                // rm = sqrt(r(1)^2 + r(2)^2 + r(3)^2)
                double rm = std::sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);

                // if (rm <= c) cycle
                if (rm <= c)
                    continue;

                // p(7:9, i) = p(7:9, i) + r(1:3) / (rm^3)
                // In C++: p[i][6..8] += r[0..2]/(rm^3)
                double inv_rm3 = 1.0 / (rm * rm * rm);
                p[i][6] += r[0] * inv_rm3;
                p[i][7] += r[1] * inv_rm3;
                p[i][8] += r[2] * inv_rm3;
            }

            // p(4:6, i) = p(4:6, i) + p(7:9, i)
            // So p[i][3..5] += p[i][6..8]
            p[i][3] += p[i][6];
            p[i][4] += p[i][7];
            p[i][5] += p[i][8];

            // p(1:3, i) = p(1:3, i) + p(4:6, i)
            // So p[i][0..2] += p[i][3..5]
            p[i][0] += p[i][3];
            p[i][1] += p[i][4];
            p[i][2] += p[i][5];
        }

        // if (mpi_n > 1) then
        //   p(1:9, 1:p1-1) = 0.0
        //   p(1:9, p2+1:n) = 0.0
        // end if
        if (mpi_n > 1)
        {
            // We can do two separate parallel loops if desired:
            #pragma omp for
            for (int idx = 0; idx < p1 - 1; ++idx)
            {
                for (int row = 0; row < 9; ++row)
                {
                    p[idx][row] = 0.0;
                }
            }

            #pragma omp for
            for (int idx = p2; idx < n; ++idx)
            {
                for (int row = 0; row < 9; ++row)
                {
                    p[idx][row] = 0.0;
                }
            }
        }
    } // end of parallel region
}

主程序

#include <mpi.h>       // For MPI
#include <omp.h>       // For omp_get_wtime (optional)
#include <cmath>
#include <vector>
#include <array>
#include <random>
#include <iostream>
#include <algorithm>

// Forward declaration of our modern C++ sub_ppl function (logic equivalent to Fortran).
// For demonstration, let's assume it takes the same parameters as the Fortran subroutine.
void sub_ppl(int n, int p1, int p2,
             std::vector<std::array<double, 9>>& p,
             double c, int mpi_n);

// Helper function for convenience: number of elements in 2D layout [9 x n].
static inline int totalSize2D(int rows, int cols) {
    return rows * cols;
}

int main(int argc, char* argv[])
{
    // -------------------------------------------------------------------------
    // MPI initialization
    // -------------------------------------------------------------------------
    MPI_Init(&argc, &argv);

    int mpi_i = 0;   // rank (MPI_i in Fortran)
    int mpi_n = 1;   // size (MPI_n in Fortran)
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_i);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_n);

    // -------------------------------------------------------------------------
    // Problem parameters
    // -------------------------------------------------------------------------
    int n = 50000;                   // Number of particles
    double c  = 4.2687653380048194;  // Cutoff distance
    double l  = 259.8651820816571;   // Domain length
    double vt = 0.8077371891833447;  // Thermal velocity

    // We'll store p and pb as vectors of length n, each containing 9 doubles:
    //   p[i][0..8] corresponds to Fortran p(1..9, i+1)
    // Indices: 0-based in C++ vs. 1-based in Fortran.
    std::vector<std::array<double, 9>> p(n), pb(n);

    // Initialize them all to zero
    for (int i = 0; i < n; ++i) {
        std::fill(p[i].begin(), p[i].end(), 0.0);
        std::fill(pb[i].begin(), pb[i].end(), 0.0);
    }

    // -------------------------------------------------------------------------
    // Random init on rank 0 only
    // (Equivalent to "call random_number(p(1:6,:))" and subsequent scaling)
    // -------------------------------------------------------------------------
    if (mpi_i == 0)
    {
        // We only fill rows 1..6 in Fortran => indices 0..5 in C++
        // We'll fill p[i][0..5] with random values
        std::mt19937 rng(12345);  // Fixed seed for reproducibility
        std::uniform_real_distribution<double> dist(0.0, 1.0);

        for (int i = 0; i < n; ++i) {
            // p(1:3,:) = random * l
            // -> p[i][0..2] = random * l
            for (int row = 0; row < 3; ++row) {
                p[i][row] = dist(rng) * l;
            }
            // p(4:6,:) = random - 0.5*2.0*vt => random => p[i][3..5]
            // Then subtract 0.5 * 2 * vt
            for (int row = 3; row < 6; ++row) {
                p[i][row] = dist(rng);
                // Subtract (0.5 * 2.0 * vt) = vt
                p[i][row] -= vt;
            }
            // p[i][6..8] remain 0.0 at this point
        }
    }

    // -------------------------------------------------------------------------
    // Broadcast p from rank 0 to all others
    // (Equivalent to call mpi_bcast(p, size(p), ...))
    // We must pass the raw pointer & size in doubles. Each row has 9 doubles, total n*9.
    // -------------------------------------------------------------------------
    MPI_Bcast(p.data(), 
              totalSize2D(9, n),  // 9 * n
              MPI_DOUBLE,
              0,
              MPI_COMM_WORLD);

    // -------------------------------------------------------------------------
    // Determine local range [p1..p2] for each rank
    // (Equivalent to np = int(real(n)/real(mpi_n)), etc.)
    // Fortran uses 1-based indexing, so adjust carefully in C++.
    // -------------------------------------------------------------------------
    int np = static_cast<int>( static_cast<double>(n) / static_cast<double>(mpi_n) );
    int p1 = mpi_i * np;          // C++ 0-based start
    int p2 = p1 + np - 1;         // inclusive end
    if (mpi_i == mpi_n - 1) {
        p2 = n - 1;               // last rank covers up to n-1
    }

    // -------------------------------------------------------------------------
    // Synchronize ranks, measure time
    // -------------------------------------------------------------------------
    MPI_Barrier(MPI_COMM_WORLD);
    double t_start = omp_get_wtime();  // or use std::chrono if desired

    // -------------------------------------------------------------------------
    // Zero out columns 7..9 => p(*, 6..8) in C++
    // (Equivalent to p(7:9,:) = 0.0 in Fortran)
    // -------------------------------------------------------------------------
    for (int i = 0; i < n; ++i) {
        p[i][6] = 0.0;
        p[i][7] = 0.0;
        p[i][8] = 0.0;
    }

    // -------------------------------------------------------------------------
    // Call our "sub_ppl" routine
    // (Equivalent to call sub_ppl(n, p1, p2, p, c, mpi_n))
    // -------------------------------------------------------------------------
    sub_ppl(n, p1, p2, p, c, mpi_n);

    // -------------------------------------------------------------------------
    // If we have multiple ranks, do an MPI_Allreduce to combine partial results
    // (Equivalent to if(mpi_n>1) then ... call mpi_allreduce ...)
    // -------------------------------------------------------------------------
    if (mpi_n > 1)
    {
        // pb = sum of p across all ranks
        // We'll do an Allreduce of (n*9) doubles from p into pb
        MPI_Allreduce(p.data(),
                      pb.data(),
                      totalSize2D(9, n),
                      MPI_DOUBLE,
                      MPI_SUM,
                      MPI_COMM_WORLD);

        // Then p = pb
        p = pb;
    }

    // -------------------------------------------------------------------------
    // Stop timing
    // -------------------------------------------------------------------------
    MPI_Barrier(MPI_COMM_WORLD);
    double t_elapsed = omp_get_wtime() - t_start;

    // Print time from rank 0
    if (mpi_i == 0) {
        std::cout << t_elapsed << std::endl;
    }

    // Finalize MPI
    MPI_Finalize();

    return 0;
}