Configuration
Geant4 Version: 11.2.1
Operating System: CentOS 7
Compiler/Version: Open MPI 3.1.0 with GCC 10.2.1
CMake Version: 3.25.3
Problem
- One node: it works.
- Two nodes: the run fails with the following error:
[cu02][[7230,1],52][btl_tcp_endpoint.c:625:mca_btl_tcp_endpoint_recv_connect_ack] received unexpected process identifier [[7230,1],100]
Scripts
#!/bin/bash
#PBS -N g4mpi_parallel_test
#PBS -l nodes=cu01:ppn=104+cu02:ppn=104
#PBS -l walltime=04:00:00
#PBS -o geant4_mpi_batch_scaling.out
#PBS -e geant4_mpi_batch_scaling.err
#PBS -q batch
#PBS -V
#
# Job setup: enter the submission directory, load the toolchain modules,
# build the project into ./build and snapshot the PBS node list into
# ./hostfile for the mpirun calls further down.

# Abort on the first failing setup command (cd, module load, cmake, ...).
set -e

# Both variables are provided by PBS inside a job. Fail early with a clear
# message when they are missing: with an empty unquoted expansion `cd` would
# silently switch to $HOME and `cat` would block reading stdin.
: "${PBS_O_WORKDIR:?PBS_O_WORKDIR is not set; submit this script with qsub}"
: "${PBS_NODEFILE:?PBS_NODEFILE is not set; submit this script with qsub}"

cd -- "$PBS_O_WORKDIR"

# Start from a clean module environment so only the versions below are used.
module purge
module load cmake/3.25.3
module load geant4/11.2.1/mpi-3.1.0

# Configure and build out-of-tree in ./build.
cmake -S . -B build
cmake --build build

# Copy the node list assigned to this job; it is passed to mpirun --hostfile.
cat -- "$PBS_NODEFILE" > hostfile
# ------------------------------------------------------------------
# Scaling matrix for the parallel test.
# Each entry is "<node count> <MPI ranks per node>"; the run loop
# splits an entry on whitespace into those two numbers.
# ------------------------------------------------------------------
COMBINATIONS=()
# Single-node runs
COMBINATIONS+=("1 52")
COMBINATIONS+=("1 104")
# Two-node runs
COMBINATIONS+=("2 52")
COMBINATIONS+=("2 104")
# =========================
# Run every "<nodes> <ranks per node>" entry of COMBINATIONS
# =========================
# Each configuration launches build/exMPI01 with build/run.mac through
# mpirun, wrapped in /usr/bin/time -v, and writes stdout+stderr to
# run_<nodes>n_<ranks>c.log.
#
# A failing configuration no longer aborts the sweep (the script runs under
# `set -e`): its exit status is reported on stderr, the remaining
# configurations still run, and the script exits non-zero at the end if any
# run failed.
#
# NOTE(review): the reported two-node failure ("received unexpected process
# identifier" from btl_tcp_endpoint.c) is commonly associated with nodes that
# expose several or virtual network interfaces. Restricting Open MPI's TCP
# BTL with the btl_tcp_if_include MCA parameter (for example by exporting
# OMPI_MCA_btl_tcp_if_include=<interface> before qsub) may help -- confirm
# against this cluster's network setup.
failed_runs=0

for combo in "${COMBINATIONS[@]}"; do
  # Split "<nodes> <ranks per node>" into its two fields.
  read -r NODE CORE <<< "$combo"
  TOTAL_CORE=$(( NODE * CORE ))

  # 52 ranks per node are bound to cores, any other count to hardware
  # threads (presumably 52 cores / 104 hardware threads per node -- TODO
  # confirm against the node hardware).
  if [[ "$CORE" -eq 52 ]]; then
    bind_target=core
  else
    bind_target=hwthread
  fi

  echo "======== Testing: ${NODE} nodes × ${CORE} cores ========"
  LOGFILE="run_${NODE}n_${CORE}c.log"

  # Build the command as an array so every argument stays a single word.
  mpirun_cmd=(
    mpirun -np "$TOTAL_CORE"
    --map-by "ppr:${CORE}:node:PE=1"
    --bind-to "$bind_target"
    --hostfile hostfile
    --rank-by core
    build/exMPI01 build/run.mac
  )

  # `|| run_status=$?` keeps `set -e` from terminating the sweep here.
  run_status=0
  /usr/bin/time -v "${mpirun_cmd[@]}" > "$LOGFILE" 2>&1 || run_status=$?
  if [[ "$run_status" -ne 0 ]]; then
    echo "run with ${NODE} nodes x ${CORE} ranks per node failed (exit ${run_status}); see ${LOGFILE}" >&2
    failed_runs=$(( failed_runs + 1 ))
  fi

  echo "======== ENDING: ${NODE} nodes × ${CORE} core ========="
done

# Propagate failure to the batch system once every configuration was tried.
if [[ "$failed_runs" -gt 0 ]]; then
  echo "${failed_runs} configuration(s) failed" >&2
  exit 1
fi