Checkpoint-and-restart with DMTCP or similar tools can create large disk images, and the checkpoint details are controlled by the third-party system. To avoid these problems, programmers can define their own checkpoint-and-restart strategy that better suits the application.
Here is an example of a C++ program to which we would like to add self-defined checkpoint-and-restart, so that the program periodically saves its checkpoint information to a file and can restart execution from the last checkpoint if an interruption occurs.
#include <iostream>
#include <stdio.h>
#include <unistd.h>
// Forward declarations: b() and d() are defined after main().
// Both take their argument by reference and modify it in place.
void b (int &);
void d (int &);
// Entry point of the plain (non-checkpointed) sample.
// Runs d() on i1, then b() on i2 and on i1, and prints both final values.
// Returns 0 so that shells and batch schedulers see a successful run.
int main() {
    int i1 = 10;
    int i2 = 20;
    d(i1);
    b(i2);
    b(i1);
    printf("Final i1 = %d\n", i1);
    printf("Final i2 = %d\n", i2);
    return 0;
}
// Adds 0, 1, ..., 9 to v in turn, printing the running value after each
// addition and pausing one second between steps.
void b (int & v) {
    int step = 0;
    while (step < 10) {
        v = v + step;
        printf("Value changed to %d\n", v );
        sleep(1);
        ++step;
    }
}
// Applies b() to v ten times in a row.
void d (int & v) {
    int remaining = 10;
    while (remaining-- > 0) {
        b(v);
    }
}
Here is the sample code with the self-defined checkpoint-and-restart solution:
#include <iostream>
#include <stdio.h>
#include <string>
#include <fstream>
#include <stdlib.h>
#include <unistd.h>
using namespace std;
// Forward declarations: b() and d() are defined after main().
void b (int &);
void d (int &);
// Restart flags. main() sets them to 1 when the program is started in restart
// mode; b() and d() each clear their own flag after reading their checkpoint file.
int restart_b=0;
int restart_d=0;
// Call counters. They are incremented before each call to b() / d() and are
// compared against the counter stored in the checkpoint file to decide which
// call should resume from the saved state.
static int rose_count_b=0;
static int rose_count_d=0;
// Entry point of the checkpointed sample.
//
// When the first command-line argument starts with 'r', the program runs in
// restart mode: b() and d() will load their checkpoint files on their first
// call. Otherwise the computation starts from scratch.
//
// The call counters must be incremented immediately before every call to
// b() / d() so that each call has a unique number that can be matched against
// the number stored in a checkpoint file.
//
// Returns 0 so that shells and batch schedulers see a successful run.
int main(int argc, char *argv[]) {
    if (argc > 1 && *argv[1] == 'r') {
        restart_b = 1;
        restart_d = 1;
        printf("\nThe file will be opened in the restart mode.\n");
    }

    int i1 = 10;
    int i2 = 20;

    // Reset both call counters (the original reset rose_count_d twice and
    // never reset rose_count_b).
    rose_count_b = 0;
    rose_count_d = 0;

    rose_count_d++; // needs to be increased by one every time d() gets called
    d(i1);
    rose_count_b++; // needs to be increased by one every time b() gets called
    b(i2);
    rose_count_b++;
    b(i1);

    printf("Final i1 = %d\n", i1);
    printf("Final i2 = %d\n", i2);
    return 0;
}
// Adds 0, 1, ..., 9 to v, one step per second, with checkpoint/restart support.
//
// Checkpoint file "chkpt_b.txt" holds three lines: the call number
// (rose_count_b), the loop index, and the value of v at that index. It is
// rewritten every fifth iteration (i % 5 == 0), before v is updated.
//
// On the first call after a restart (restart_b == 1) the file is read once
// into function-local statics. Any call whose number equals the stored call
// number resumes from the stored loop index with the stored v; all other
// calls run the full loop on the v they were given.
//
// A missing or unreadable checkpoint file is reported with perror() and the
// call proceeds without restoring anything; a truncated file is ignored.
void b (int & v) {
    int rose_start = 0;
    // Loaded once from the checkpoint file; static so later calls can still
    // compare their call number against it.
    static int rose_count_in_file = 0;
    static int rose_start_from_file = 0;
    static int rose_v_from_file = 0;

    if (restart_b == 1) {
        FILE *fp2 = fopen("chkpt_b.txt", "r");
        if (fp2 == NULL) {
            // No checkpoint to restore from: continue as a fresh run.
            perror("chkpt_b.txt");
        } else {
            char line[80];
            int count = 0;
            int start = 0;
            int value = 0;
            // Accept the checkpoint only if all three fields are present, so a
            // file truncated by an interruption never restores a partial state.
            const bool complete =
                fgets(line, sizeof line, fp2) != NULL && sscanf(line, "%d", &count) == 1 &&
                fgets(line, sizeof line, fp2) != NULL && sscanf(line, "%d", &start) == 1 &&
                fgets(line, sizeof line, fp2) != NULL && sscanf(line, "%d", &value) == 1;
            if (complete) {
                rose_count_in_file = count;
                rose_start_from_file = start;
                rose_v_from_file = value;
            }
            fclose(fp2);
        }
        restart_b = 0;
    }

    printf ("\nrose_count_in_file: %d, rose_count: %d\n",rose_count_in_file, rose_count_b);

    // This is the call that was interrupted: resume from the saved state.
    if (rose_count_in_file == rose_count_b) {
        rose_start = rose_start_from_file;
        v = rose_v_from_file;
    }

    for (int i = rose_start; i < 10; i++) {
        if (i == rose_start) {
            printf("\nstarting out from i= %d\n", i);
        }
        if (i % 5 == 0) {
            // Save the state before this iteration modifies v.
            FILE *fp = fopen("chkpt_b.txt", "w");
            if (fp == NULL) {
                // Keep computing even if the checkpoint cannot be written.
                perror("chkpt_b.txt");
            } else {
                fprintf(fp, "%d\n", rose_count_b);
                fprintf(fp, "%d\n", i);
                fprintf(fp, "%d\n", v);
                fclose(fp);
            }
        }
        v += i;
        printf("Value changed to %d\n", v );
        sleep(1);
    }
}
// Calls b(v) ten times, with checkpoint/restart support.
//
// Checkpoint file "chkpt_d.txt" is rewritten before every call to b() and
// holds: the call number of d() (rose_count_d), the loop index, the value of
// v, and the value of rose_count_b at that point. The checkpoint must be
// written before b() is called; otherwise the saved v and the saved loop
// index would be inconsistent.
//
// On the first call after a restart (restart_d == 1) the file is read once.
// If the stored call number matches rose_count_d, the loop resumes at the
// stored index with the stored v, and rose_count_b is restored so that the
// numbering of b() calls continues where the interrupted run left off.
// Without that, a later, unrelated b() call could match the stale record in
// chkpt_b.txt and overwrite v with an old value.
//
// The fourth line is optional when reading, so three-line files written by
// an earlier version are still accepted. A missing file is reported with
// perror() and the call proceeds as a fresh run; a truncated file is ignored.
void d (int & v) {
    // Must start at 0 on a normal run (it was previously left uninitialized).
    int rose_start = 0;
    // Loaded once from the checkpoint file.
    static int rose_count_in_file = 0;
    static int rose_start_from_file = 0;
    static int rose_v_from_file = 0;
    static int rose_count_b_from_file = -1; // -1: not present in the file

    if (restart_d == 1) {
        FILE *fp2 = fopen("chkpt_d.txt", "r");
        if (fp2 == NULL) {
            // No checkpoint to restore from: continue as a fresh run.
            perror("chkpt_d.txt");
        } else {
            char line[80];
            int count = 0;
            int start = 0;
            int value = 0;
            // Accept the checkpoint only if the three mandatory fields are
            // all present.
            const bool complete =
                fgets(line, sizeof line, fp2) != NULL && sscanf(line, "%d", &count) == 1 &&
                fgets(line, sizeof line, fp2) != NULL && sscanf(line, "%d", &start) == 1 &&
                fgets(line, sizeof line, fp2) != NULL && sscanf(line, "%d", &value) == 1;
            if (complete) {
                rose_count_in_file = count;
                printf ("\nrose_count_in_file: %d\n",rose_count_in_file);
                rose_start_from_file = start;
                printf ("\nrose_start_from_file: %d\n",rose_start_from_file );
                rose_v_from_file = value;
                printf ("\nv_of_d_function: %d\n",rose_v_from_file );

                // Optional fourth field: number of b() calls made so far.
                int b_calls = 0;
                if (fgets(line, sizeof line, fp2) != NULL && sscanf(line, "%d", &b_calls) == 1) {
                    rose_count_b_from_file = b_calls;
                }
            }
            fclose(fp2);
        }
        restart_d = 0;
    }

    // This is the call that was interrupted: resume from the saved state.
    if (rose_count_in_file == rose_count_d) {
        rose_start = rose_start_from_file;
        v = rose_v_from_file;
        if (rose_count_b_from_file >= 0) {
            rose_count_b = rose_count_b_from_file;
        }
    }

    printf("\nv_D is: %d\n", v);

    for (int i = rose_start; i < 10; i++) {
        if (i == rose_start) {
            printf("\nstarting out from i= %d\n", i);
        }
        {
            // Checkpoint must be done before b() is called. Otherwise,
            // inconsistency occurs.
            FILE *fp = fopen("chkpt_d.txt", "w");
            if (fp == NULL) {
                // Keep computing even if the checkpoint cannot be written.
                perror("chkpt_d.txt");
            } else {
                fprintf(fp, "%d\n", rose_count_d);
                fprintf(fp, "%d\n", i);
                fprintf(fp, "%d\n", v);
                fprintf(fp, "%d\n", rose_count_b);
                fclose(fp);
            }
        }
        rose_count_b++; // needs to be increased by one before the function gets called, even inside another routine
        b(v);
    }
}
Assuming we name the two files sample.cc and sample_checkpoint.cc, we can compile the sample code as below:
module load gcc/7.3.0
g++ sample.cc -o sample
g++ sample_checkpoint.cc -o sample_checkpoint
To run the sample:
$ ./sample
...........
Value changed to 481
Value changed to 488
Value changed to 496
Value changed to 505
Final i1 = 505
Final i2 = 65
Run ./sample_checkpoint to execute the code with the checkpoint solution. Press Ctrl-C to interrupt the execution at some point.
$ ./sample_checkpoint
...........
Value changed to 56
Value changed to 58
Value changed to 61
Value changed to 65
^C
Run ./sample_checkpoint r to restart the execution from the last checkpoint.
$ ./sample_checkpoint r
..........
Value changed to 56
Value changed to 58
Value changed to 61
Value changed to 65
Value changed to 70
Value changed to 76
----------
Final i1 = 505
Final i2 = 65