Here is a Slurm job script for submitting a job with checkpointing enabled:

#!/bin/bash
# Put your SLURM options here
#SBATCH --partition=defq # change to the proper partition name, or remove
#SBATCH --time=00:15:00 # set the proper walltime for the job
#SBATCH --nodes=1 # number of nodes
##SBATCH --ntasks-per-node=4 # processes per node
##SBATCH --mem=24000 # memory resource
#SBATCH --job-name="dmtcp_job" # change to your job name
#SBATCH --output=dmtcp.out # change to proper file name or remove for defaults
# Add any other batch options here


export DMTCP_DL_PLUGIN=0 # disable DMTCP's dl plugin (a workaround needed by some applications)


#----------------------------- Set up DMTCP environment for a job ------------#


start_coordinator()
{
    fname=dmtcp_command.$SLURM_JOBID
    h=`hostname`

    check_coordinator=`which dmtcp_coordinator`
    if [ -z "$check_coordinator" ]; then
        echo "No dmtcp_coordinator found. Check your DMTCP installation and PATH settings."
        exit 1
    fi

    # Start the coordinator as a daemon; -p 0 lets it choose a free TCP port,
    # which it writes to the port file $fname.
    dmtcp_coordinator --daemon --exit-on-last -p 0 --port-file $fname "$@" 1>/dev/null 2>&1

    # Wait until the coordinator has written its port to the port file.
    while true; do
        if [ -f "$fname" ]; then
            p=`cat $fname`
            if [ -n "$p" ]; then
                break
            fi
        fi
        sleep 1
    done

    # Reuse the port file as a dmtcp_command wrapper for easy communication
    # with the coordinator.
    p=`cat $fname`
    chmod +x $fname
    echo "#!/bin/bash" > $fname
    echo >> $fname
    echo "export PATH=$PATH" >> $fname
    echo "export DMTCP_COORD_HOST=$h" >> $fname
    echo "export DMTCP_COORD_PORT=$p" >> $fname
    echo "dmtcp_command \$@" >> $fname

    # Set up the local environment for DMTCP.
    export DMTCP_COORD_HOST=$h
    export DMTCP_COORD_PORT=$p
}


# change to the working directory
cd "$SLURM_SUBMIT_DIR"


#----------------------------------- Set up job environment ------------------#


. /etc/profile.d/modules.sh
module load dmtcp/2.6.0
# load your other modules here




#------------------------------------- Launch application ---------------------#

################################################################################
# 1. Start DMTCP coordinator
################################################################################

start_coordinator -i 600 # take a checkpoint every 600 seconds

################################################################################
# 2. Launch application
################################################################################

dmtcp_launch --ckpt-signal 10 ./<app-binary> # signal 10 (SIGUSR1) is the checkpoint signal
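
Once the job is running, you can talk to its coordinator through the generated wrapper. A minimal sketch, assuming the script above was saved as dmtcp_launch.job and Slurm assigned job ID 12345 (both names are placeholders):

sbatch dmtcp_launch.job # submit the checkpointing job
./dmtcp_command.12345 --status # show coordinator status, including the checkpoint interval
./dmtcp_command.12345 --checkpoint # force an immediate checkpoint, independent of the 600 s interval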

Here is a Slurm job script for restarting a checkpointed job:

#!/bin/bash
# Put your SLURM options here
#SBATCH --partition=gpu # change to the proper partition name, or remove
#SBATCH --time=00:15:00 # set the proper walltime for the job
#SBATCH --nodes=1 # number of nodes
##SBATCH --ntasks-per-node=4 # processes per node
##SBATCH --mem=24000 # memory resource
#SBATCH --job-name="dmtcp_job" # change to your job name
#SBATCH --output=dmtcp.out # change to proper file name or remove for defaults
# Add any other batch options here

#SBATCH --ntasks=5 # number of tasks

#----------------------------- Set up DMTCP environment for a job ------------#

###############################################################################
# Start DMTCP coordinator on the launching node. Free TCP port is automatically
# allocated. This function creates a dmtcp_command.$JOBID script, which serves
# as a wrapper around dmtcp_command. The script tunes dmtcp_command for the
# exact dmtcp_coordinator (its hostname and port). Instead of typing
# "dmtcp_command -h <coordinator hostname> -p <coordinator port> <command>",
# you just type "dmtcp_command.$JOBID <command>" to talk to the coordinator
# of job JOBID.
###############################################################################
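###############################################################################
# For example, with a hypothetical job ID of 12345, the two commands below are
# equivalent; the wrapper bakes in the coordinator's hostname and port:
#   dmtcp_command -h <coordinator hostname> -p <coordinator port> --status
#   ./dmtcp_command.12345 --status
###############################################################################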

start_coordinator()
{
    ############################################################
    # To debug with a custom, externally started coordinator,
    # uncomment the following lines and provide its proper host
    # and port.
    ############################################################
    # export DMTCP_COORD_HOST=$h
    # export DMTCP_COORD_PORT=$p
    # return

    fname=dmtcp_command.$SLURM_JOBID
    h=`hostname`

    check_coordinator=`which dmtcp_coordinator`
    if [ -z "$check_coordinator" ]; then
        echo "No dmtcp_coordinator found. Check your DMTCP installation and PATH settings."
        exit 1
    fi

    # Start the coordinator as a daemon; -p 0 lets it choose a free TCP port,
    # which it writes to the port file $fname.
    dmtcp_coordinator --daemon --exit-on-last -p 0 --port-file $fname "$@" 1>/dev/null 2>&1

    # Wait until the coordinator has written its port to the port file.
    while true; do
        if [ -f "$fname" ]; then
            p=`cat $fname`
            if [ -n "$p" ]; then
                break
            fi
        fi
        sleep 1
    done

    # Reuse the port file as a dmtcp_command wrapper for easy communication
    # with the coordinator.
    p=`cat $fname`
    chmod +x $fname
    echo "#!/bin/bash" > $fname
    echo >> $fname
    echo "export PATH=$PATH" >> $fname
    echo "export DMTCP_COORD_HOST=$h" >> $fname
    echo "export DMTCP_COORD_PORT=$p" >> $fname
    echo "dmtcp_command \$@" >> $fname

    # Set up the local environment for DMTCP.
    export DMTCP_COORD_HOST=$h
    export DMTCP_COORD_PORT=$p
}

#----------------------- Some routine steps and information output ------------#

###################################################################################
# Print out the SLURM job information. Remove this if you don't need it.
###################################################################################
echo "SLURM_JOBID="$SLURM_JOBID
echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
echo "SLURM_NNODES"=$SLURM_NNODES
echo "SLURMTMPDIR="$SLURMTMPDIR
echo "working directory = "$SLURM_SUBMIT_DIR

# change to the working directory
cd "$SLURM_SUBMIT_DIR"

#----------------------------------- Set up job environment ------------------#


. /etc/profile.d/modules.sh
module load dmtcp/2.6.0
# load your other modules here


#------------------------------------- Launch application ---------------------#

################################################################################
# 1. Start DMTCP coordinator
################################################################################

start_coordinator -i 600 # take a checkpoint every 600 seconds

################################################################################
# 2. Restart application
################################################################################

/bin/bash ./dmtcp_restart_script.sh -h $DMTCP_COORD_HOST -p $DMTCP_COORD_PORT
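
For reference, a sketch of the full cycle, assuming the two scripts above were saved as dmtcp_launch.job and dmtcp_restart.job (placeholder names). Each checkpoint leaves ckpt_*.dmtcp image files and a dmtcp_restart_script.sh in the working directory, and the restart job resumes from the latest of these:

sbatch dmtcp_launch.job # first run: checkpoints every 600 seconds until the walltime expires
sbatch dmtcp_restart.job # later run: resumes the application from the latest checkpoint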



-- Zhiwei - 12 Jul 2020
