Here is a Slurm job script for submitting a job with the checkpoint feature:
#!/bin/bash
# Put your SLURM options here
#SBATCH --partition=defq # change to proper partition name or remove
#SBATCH --time=00:15:00 # put proper time of reservation here
#SBATCH --nodes=1 # number of nodes
##SBATCH --ntasks-per-node=4 # processes per node
##SBATCH --mem=24000 # memory resource
#SBATCH --job-name="dmtcp_job" # change to your job name
#SBATCH --output=dmtcp.out # change to proper file name or remove for defaults
# ? Any other batch options ?
# Disable DMTCP's dlopen/dlsym plugin.
# NOTE(review): presumably a workaround for interposition problems with some
# applications -- confirm against your DMTCP version's docs before removing.
export DMTCP_DL_PLUGIN=0
#----------------------------- Set up DMTCP environment for a job ------------#
start_coordinator()
{
#######################################
# Start a DMTCP coordinator daemon on a free TCP port, wait until it has
# published its port, and create a dmtcp_command.$SLURM_JOBID wrapper script
# for talking to this exact coordinator.
# Globals:   SLURM_JOBID, PATH (read);
#            DMTCP_COORD_HOST, DMTCP_COORD_PORT (written & exported)
# Arguments: extra options passed through to dmtcp_coordinator (e.g. -i 600)
#######################################

  # Fail early -- and loudly, on stderr -- if DMTCP is not on PATH.
  if ! command -v dmtcp_coordinator >/dev/null 2>&1; then
    echo "No dmtcp_coordinator found. Check your DMTCP installation and PATH settings." >&2
    exit 1  # was 'exit 0': a missing coordinator must not look like success to Slurm
  fi

  local fname="dmtcp_command.$SLURM_JOBID"
  local h
  h=$(hostname)

  # The coordinator daemonizes itself; '-p 0' means "pick any free port",
  # and the chosen port is written into "$fname".
  dmtcp_coordinator --daemon --exit-on-last -p 0 --port-file "$fname" "$@" 1>/dev/null 2>&1

  # Wait until the port file exists and is non-empty.
  local p
  while true; do
    if [ -f "$fname" ]; then
      p=$(cat "$fname")
      if [ -n "$p" ]; then
        break
      fi
    fi
    sleep 1  # was a tight busy-loop pegging one CPU core
  done

  # Replace the port file with a dmtcp_command wrapper for easy
  # communication with the coordinator.
  {
    echo "#!/bin/bash"
    echo
    echo "export PATH=$PATH"
    echo "export DMTCP_COORD_HOST=$h"
    echo "export DMTCP_COORD_PORT=$p"
    echo "dmtcp_command \"\$@\""  # quoted so wrapper arguments survive spaces
  } > "$fname"
  chmod +x "$fname"

  # Set up local environment for DMTCP
  export DMTCP_COORD_HOST=$h
  export DMTCP_COORD_PORT=$p
}
# changedir to workdir
cd $SLURM_SUBMIT_DIR
#----------------------------------- Set up job environment ------------------#
. /etc/profile.d/modules.sh
module load dmtcp/2.6.0
#load your other modules here
#------------------------------------- Launch application ---------------------#
################################################################################
# 1. Start DMTCP coordinator
################################################################################
start_coordinator -i 600 # checkpoint every 600 seconds
dmtcp_launch --ckpt-signal 10 ./<app-binary>.
Here is a Slurm job script for restarting a checkpointed job:
#!/bin/bash
# Put your SLURM options here
# NOTE(review): partition 'gpu' differs from the submit script's 'defq' --
# the restart job must normally run on the same kind of nodes it was
# checkpointed on; confirm this is intended.
#SBATCH --partition=gpu # change to proper partition name or remove
#SBATCH --time=00:15:00 # put proper time of reservation here
#SBATCH --nodes=1 # number of nodes
##SBATCH --ntasks-per-node=4 # processes per node
##SBATCH --mem=24000 # memory resource
#SBATCH --job-name="dmtcp_job" # change to your job name
#SBATCH --output=dmtcp.out # change to proper file name or remove for defaults
# ? Any other batch options ?
# NOTE(review): --ntasks=5 was not present in the submit script -- verify it
# matches the task count of the checkpointed job.
#SBATCH --ntasks=5
#----------------------------- Set up DMTCP environment for a job ------------#
###############################################################################
# Start DMTCP coordinator on the launching node. Free TCP port is automatically
# allocated. This function creates a dmtcp_command.$JOBID script, which serves
# as a wrapper around dmtcp_command. The script tunes dmtcp_command for the
# exact dmtcp_coordinator (its hostname and port). Instead of typing
# "dmtcp_command -h <coordinator hostname> -p <coordinator port> <command>",
# you just type "dmtcp_command.$JOBID <command>" and talk to the coordinator
# for JOBID job.
###############################################################################
start_coordinator()
{
#######################################
# Start a DMTCP coordinator daemon on a free TCP port, wait until it has
# published its port, and create a dmtcp_command.$SLURM_JOBID wrapper script
# for talking to this exact coordinator.
# Globals:   SLURM_JOBID, PATH (read);
#            DMTCP_COORD_HOST, DMTCP_COORD_PORT (written & exported)
# Arguments: extra options passed through to dmtcp_coordinator (e.g. -i 600)
#######################################

  ############################################################
  # For debugging when launching a custom coordinator, uncomment
  # the following lines and provide the proper host and port for
  # the coordinator.
  ############################################################
  # export DMTCP_COORD_HOST=$h
  # export DMTCP_COORD_PORT=$p
  # return

  # Fail early -- and loudly, on stderr -- if DMTCP is not on PATH.
  if ! command -v dmtcp_coordinator >/dev/null 2>&1; then
    echo "No dmtcp_coordinator found. Check your DMTCP installation and PATH settings" >&2
    exit 1  # was 'exit 0': a missing coordinator must not look like success to Slurm
  fi

  local fname="dmtcp_command.$SLURM_JOBID"
  local h
  h=$(hostname)

  # The coordinator daemonizes itself; '-p 0' means "pick any free port",
  # and the chosen port is written into "$fname".
  dmtcp_coordinator --daemon --exit-on-last -p 0 --port-file "$fname" "$@" 1>/dev/null 2>&1

  # Wait until the port file exists and is non-empty.
  local p
  while true; do
    if [ -f "$fname" ]; then
      p=$(cat "$fname")
      if [ -n "$p" ]; then
        break
      fi
    fi
    sleep 1  # was a tight busy-loop pegging one CPU core
  done

  # Replace the port file with a dmtcp_command wrapper for easy
  # communication with the coordinator.
  {
    echo "#!/bin/bash"
    echo
    echo "export PATH=$PATH"
    echo "export DMTCP_COORD_HOST=$h"
    echo "export DMTCP_COORD_PORT=$p"
    echo "dmtcp_command \"\$@\""  # quoted so wrapper arguments survive spaces
  } > "$fname"
  chmod +x "$fname"

  # Set up local environment for DMTCP
  export DMTCP_COORD_HOST=$h
  export DMTCP_COORD_PORT=$p
}
#----------------------- Some routine steps and information output -----------#
###################################################################################
# Print out the SLURM job information. Remove this if you don't need it.
###################################################################################
echo "SLURM_JOBID=$SLURM_JOBID"
echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
echo "SLURM_NNODES=$SLURM_NNODES"
echo "SLURMTMPDIR=$SLURMTMPDIR"
echo "working directory = $SLURM_SUBMIT_DIR"
# changedir to workdir
cd "$SLURM_SUBMIT_DIR" || exit 1  # abort rather than run in the wrong directory
#----------------------------------- Set up job environment ------------------#
. /etc/profile.d/modules.sh
module load dmtcp/2.6.0
#load your other modules here
#------------------------------------- Launch application ---------------------#
################################################################################
# 1. Start DMTCP coordinator
################################################################################
start_coordinator -i 600 # 600 means checkpoint every 600 seconds
################################################################################
# 2. Restart application
################################################################################
# dmtcp_restart_script.sh is generated by DMTCP in the checkpoint directory;
# point it at the freshly started coordinator (quoted in case of odd values).
/bin/bash ./dmtcp_restart_script.sh -h "$DMTCP_COORD_HOST" -p "$DMTCP_COORD_PORT"
-- Zhiwei - 12 Jul 2020