openmpi  3.1.6
About: Open MPI is a high performance Message Passing Interface (MPI) library project combining technologies and resources from several other projects (FT-MPI, LA-MPI, LAM/MPI, and PACX-MPI) in order to build the best MPI library available. 3.x series.
  Fossies Dox: openmpi-3.1.6.tar.bz2  ("unofficial" and yet experimental doxygen-generated source code documentation)  

opal_cr.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
3  * University Research and Technology
4  * Corporation. All rights reserved.
5  * Copyright (c) 2004-2005 The University of Tennessee and The University
6  * of Tennessee Research Foundation. All rights
7  * reserved.
8  * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
9  * University of Stuttgart. All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  * All rights reserved.
12  * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
13  * Copyright (c) 2017 IBM Corporation. All rights reserved.
14  * $COPYRIGHT$
15  *
16  * Additional copyrights may follow
17  *
18  * $HEADER$
19  */
20 
27 #include "opal_config.h"
28 #include "opal/mca/crs/crs.h"
29 #include "opal/mca/event/event.h"
30 #include "opal/util/output.h"
31 #include "opal/prefetch.h"
32 
33 #ifndef OPAL_CR_H
34 #define OPAL_CR_H
35 
36 
38 
39 /*
40  * Some defines shared with opal-[checkpoint|restart] commands
41  */
42 #define OPAL_CR_DONE ((char) 0)
43 #define OPAL_CR_ACK ((char) 1)
44 #define OPAL_CR_CHECKPOINT ((char) 2)
45 #define OPAL_CR_NAMED_PROG_R ("opal_cr_prog_read")
46 #define OPAL_CR_NAMED_PROG_W ("opal_cr_prog_write")
47 #define OPAL_CR_BASE_ENV_NAME ("opal_cr_restart-env")
48 
49 /*
50  * Possible responses to a checkpoint request from opal-checkpoint
51  */
53  OPAL_CHECKPOINT_CMD_START, /* Checkpoint is starting on this request */
54  OPAL_CHECKPOINT_CMD_IN_PROGRESS, /* Checkpoint is currently running */
55  OPAL_CHECKPOINT_CMD_NULL, /* Checkpoint cannot be started because it is not supported */
56  OPAL_CHECKPOINT_CMD_ERROR, /* An error occurred such that the checkpoint cannot be completed */
57  /* State of the checkpoint operation */
58  OPAL_CR_STATUS_NONE, /* No checkpoint in progress */
59  OPAL_CR_STATUS_REQUESTED, /* Checkpoint has been requested */
60  OPAL_CR_STATUS_RUNNING, /* Checkpoint is currently running */
61  OPAL_CR_STATUS_TERM, /* Checkpoint is running and will terminate process upon completion */
62  /* State of the continue operation */
64  /* State of the restart operation */
67 };
69 
70  /* An output handle to be used by the cr runtime
71  * functionality as an argument to opal_output() */
72  OPAL_DECLSPEC extern int opal_cr_output;
73 
74  /* Directory containing the named pipes for communication
75  * with the opal-checkpoint tool */
76  OPAL_DECLSPEC extern char * opal_cr_pipe_dir;
77 
78  /* Signal that opal-checkpoint uses to contact the
79  * application process */
81 
82  /* If Checkpointing is enabled in this application */
84 
85  /* If the application running is a tool
86  * (e.g., opal-checkpoint, orted, ...) */
87  OPAL_DECLSPEC extern bool opal_cr_is_tool;
88 
89  /* If a checkpoint has been requested */
91 
92  /* The current state of a checkpoint operation */
94 
95  /*
96  * If one of the BTLs that shut down requires a full, clean rebuild of the
97  * point-to-point stack on 'continue' as well as 'restart'.
98  */
100 
101 #if OPAL_ENABLE_CRDEBUG == 1
102  /* Whether or not C/R Debugging is enabled for this process */
103  OPAL_DECLSPEC extern int MPIR_debug_with_checkpoint;
104 
105  /*
106  * Set/clear the current thread id for the checkpointing thread
107  */
108  OPAL_DECLSPEC int opal_cr_debug_set_current_ckpt_thread_self(void);
109  OPAL_DECLSPEC int opal_cr_debug_clear_current_ckpt_thread(void);
110 
111  /*
112  * This MPI Debugger function needs to be accessed here and have a specific
113  * name. Thus we are breaking the traditional naming conventions to provide this functionality.
114  */
115  OPAL_DECLSPEC int MPIR_checkpoint_debugger_detach(void);
116 
121  OPAL_DECLSPEC void *MPIR_checkpoint_debugger_breakpoint(void);
122 
126  OPAL_DECLSPEC void *MPIR_checkpoint_debugger_waitpoint(void);
127 
131  OPAL_DECLSPEC void MPIR_checkpoint_debugger_signal_handler(int signo);
132 #endif
133 
134  /*
135  * Refresh environment variables after a restart
136  */
137  OPAL_DECLSPEC int opal_cr_refresh_environ(int prev_pid);
138 
139  /*
140  * If this is an application that doesn't want to have
141  * a notification callback installed, set this to false.
142  * To see the effect, this must be called before opal_cr_init().
143  * Default: Enabled
144  */
146 
151  OPAL_DECLSPEC int opal_cr_init(void);
152 
158 
159 
177 
178  /* If the checkpoint operation should be stalled to
179  * wait for another service to complete before
180  * continuing with the checkpoint */
183 
184 #if OPAL_ENABLE_FT_THREAD == 1
185  /* Some thread functions */
186  OPAL_DECLSPEC void opal_cr_thread_init_library(void);
187  OPAL_DECLSPEC void opal_cr_thread_finalize_library(void);
188  OPAL_DECLSPEC void opal_cr_thread_abort_library(void);
189  OPAL_DECLSPEC void opal_cr_thread_enter_library(void);
190  OPAL_DECLSPEC void opal_cr_thread_exit_library(void);
191  OPAL_DECLSPEC void opal_cr_thread_noop_progress(void);
192 #endif /* OPAL_ENABLE_FT_THREAD == 1 */
193 
194  /*
195  * If not using FT then make the #defines noops
196  */
197 #if OPAL_ENABLE_FT == 0 || OPAL_ENABLE_FT_CR == 0
198 #define OPAL_CR_TEST_CHECKPOINT_READY() ;
199 #define OPAL_CR_TEST_CHECKPOINT_READY_STALL() ;
200 #define OPAL_CR_INIT_LIBRARY() ;
201 #define OPAL_CR_FINALIZE_LIBRARY() ;
202 #define OPAL_CR_ABORT_LIBRARY() ;
203 #define OPAL_CR_ENTER_LIBRARY() ;
204 #define OPAL_CR_EXIT_LIBRARY() ;
205 #define OPAL_CR_NOOP_PROGRESS() ;
206 #endif /* #if OPAL_ENABLE_FT == 0 || OPAL_ENABLE_FT_CR == 0 */
207 
208  /*
209  * If using FT
210  */
211 #if OPAL_ENABLE_FT_CR == 1
212 #define OPAL_CR_TEST_CHECKPOINT_READY() \
213  { \
214  if(OPAL_UNLIKELY(opal_cr_is_enabled) ) { \
215  opal_cr_test_if_checkpoint_ready(); \
216  } \
217  }
218 
219 #define OPAL_CR_TEST_CHECKPOINT_READY_STALL() \
220  { \
221  if(OPAL_UNLIKELY(opal_cr_is_enabled && !opal_cr_stall_check)) { \
222  opal_cr_test_if_checkpoint_ready(); \
223  } \
224  }
225 
226 /* If *not* using FT thread */
227 #if OPAL_ENABLE_FT_THREAD == 0
228 #define OPAL_CR_INIT_LIBRARY() OPAL_CR_TEST_CHECKPOINT_READY();
229 #define OPAL_CR_FINALIZE_LIBRARY() OPAL_CR_TEST_CHECKPOINT_READY();
230 #define OPAL_CR_ABORT_LIBRARY() OPAL_CR_TEST_CHECKPOINT_READY();
231 #define OPAL_CR_ENTER_LIBRARY() OPAL_CR_TEST_CHECKPOINT_READY();
232 #define OPAL_CR_EXIT_LIBRARY() OPAL_CR_TEST_CHECKPOINT_READY();
233 #define OPAL_CR_NOOP_PROGRESS() OPAL_CR_TEST_CHECKPOINT_READY();
234 #endif /* OPAL_ENABLE_FT_THREAD == 0 */
235 
236 /* If using FT thread */
237 #if OPAL_ENABLE_FT_THREAD == 1
238 #define OPAL_CR_INIT_LIBRARY() \
239  { \
240  opal_cr_thread_init_library(); \
241  }
242 #define OPAL_CR_FINALIZE_LIBRARY() \
243  { \
244  opal_cr_thread_finalize_library(); \
245  }
246 #define OPAL_CR_ABORT_LIBRARY() \
247  { \
248  opal_cr_thread_abort_library(); \
249  }
250 #define OPAL_CR_ENTER_LIBRARY() \
251  { \
252  opal_cr_thread_enter_library(); \
253  }
254 #define OPAL_CR_EXIT_LIBRARY() \
255  { \
256  opal_cr_thread_exit_library(); \
257  }
258 #define OPAL_CR_NOOP_PROGRESS() \
259  { \
260  opal_cr_thread_noop_progress(); \
261  }
262 #endif /* OPAL_ENABLE_FT_THREAD == 1 */
263 
264 #endif /* OPAL_ENABLE_FT_CR == 1 */
265 
266 
269 
278 
282 
290  opal_crs_base_snapshot_t *snapshot,
292  int *state);
293 
296  opal_crs_base_snapshot_t *snapshot,
298  int *state);
300 
301 
302 
305  typedef enum {
312  OPAL_CR_INC_MAX = 6
314 
315  typedef enum {
321 
327 
331  opal_cr_user_inc_callback_fn_t *prev_function);
332 
335 
336 
337 
343  typedef int (*opal_cr_coord_callback_fn_t) (int);
344 
350  (opal_cr_coord_callback_fn_t new_func,
352 
356  OPAL_DECLSPEC int opal_cr_coord(int state);
357 
361  OPAL_DECLSPEC void opal_cr_set_time(int idx);
364 
369 
370 
371 #define OPAL_CR_TIMER_ENTRY0 0
372 #define OPAL_CR_TIMER_ENTRY1 1
373 #define OPAL_CR_TIMER_ENTRY2 2
374 #define OPAL_CR_TIMER_CRCPBR0 3
375 #define OPAL_CR_TIMER_CRCP0 4
376 #define OPAL_CR_TIMER_CRCPBR1 5
377 #define OPAL_CR_TIMER_P2P0 6
378 #define OPAL_CR_TIMER_P2P1 7
379 #define OPAL_CR_TIMER_P2PBR0 8
380 #define OPAL_CR_TIMER_CORE0 9
381 #define OPAL_CR_TIMER_CORE1 10
382 #define OPAL_CR_TIMER_COREBR0 11
383 #define OPAL_CR_TIMER_P2P2 12
384 #define OPAL_CR_TIMER_P2PBR1 13
385 #define OPAL_CR_TIMER_P2P3 14
386 #define OPAL_CR_TIMER_P2PBR2 15
387 #define OPAL_CR_TIMER_CRCP1 16
388 #define OPAL_CR_TIMER_COREBR1 17
389 #define OPAL_CR_TIMER_CORE2 18
390 #define OPAL_CR_TIMER_ENTRY3 19
391 #define OPAL_CR_TIMER_ENTRY4 20
392 #define OPAL_CR_TIMER_MAX 21
393 
394 
395 #define OPAL_CR_CLEAR_TIMERS() \
396  { \
397  if(OPAL_UNLIKELY(opal_cr_timing_enabled > 0)) { \
398  opal_cr_clear_timers(); \
399  } \
400  }
401 
402 #define OPAL_CR_SET_TIMER(idx) \
403  { \
404  if(OPAL_UNLIKELY(opal_cr_timing_enabled > 0)) { \
405  opal_cr_set_time(idx); \
406  } \
407  }
408 
409 #define OPAL_CR_DISPLAY_ALL_TIMERS() \
410  { \
411  if(OPAL_UNLIKELY(opal_cr_timing_enabled > 0)) { \
412  opal_cr_display_all_timers(); \
413  } \
414  }
415 
417 
418 #endif /* OPAL_CR_H */
419 
OPAL_CR_INC_PRE_CRS_POST_MPI
@ OPAL_CR_INC_PRE_CRS_POST_MPI
Definition: opal_cr.h:307
opal_cr_output
OPAL_DECLSPEC int opal_cr_output
Definition: opal_cr.c:90
opal_cr_test_if_checkpoint_ready
OPAL_DECLSPEC void opal_cr_test_if_checkpoint_ready(void)
Check to see if a checkpoint has been requested.
Definition: opal_cr.c:552
opal_cr_continue_like_restart
OPAL_DECLSPEC bool opal_cr_continue_like_restart
Definition: opal_cr.c:131
opal_cr_ckpt_cmd_state_t
opal_cr_ckpt_cmd_state_t
Definition: opal_cr.h:52
OPAL_CHECKPOINT_CMD_IN_PROGRESS
@ OPAL_CHECKPOINT_CMD_IN_PROGRESS
Definition: opal_cr.h:54
opal_crs_base_snapshot_1_0_0_t
Structure for Single process snapshot. Each component is assumed to have extended this definition in th...
Definition: crs.h:110
opal_cr_inc_core_recover
OPAL_DECLSPEC int opal_cr_inc_core_recover(int state)
Definition: opal_cr.c:696
OPAL_CR_INC_STATE_RESTART
@ OPAL_CR_INC_STATE_RESTART
Definition: opal_cr.h:318
opal_cr_notify_callback_fn_t
int(* opal_cr_notify_callback_fn_t)(opal_cr_ckpt_cmd_state_t)
Notification Routines.
Definition: opal_cr.h:277
OPAL_CR_INC_POST_CRS_PRE_MPI
@ OPAL_CR_INC_POST_CRS_PRE_MPI
Definition: opal_cr.h:310
OPAL_CR_INC_STATE_ERROR
@ OPAL_CR_INC_STATE_ERROR
Definition: opal_cr.h:319
opal_cr_user_inc_callback_state_t
opal_cr_user_inc_callback_state_t
Definition: opal_cr.h:315
opal_cr_user_inc_callback_event_t
opal_cr_user_inc_callback_event_t
User Coordination Routines.
Definition: opal_cr.h:305
opal_cr_checkpoint_request
OPAL_DECLSPEC int opal_cr_checkpoint_request
Definition: opal_cr.c:127
opal_cr_is_enabled
OPAL_DECLSPEC bool opal_cr_is_enabled
Definition: opal_cr.c:120
prefetch.h
opal_cr_coord_callback_fn_t
int(* opal_cr_coord_callback_fn_t)(int)
Coordination Routines.
Definition: opal_cr.h:343
opal_cr_display_all_timers
OPAL_DECLSPEC void opal_cr_display_all_timers(void)
Definition: opal_cr.c:1252
opal_cr_is_tool
OPAL_DECLSPEC bool opal_cr_is_tool
Definition: opal_cr.c:121
OPAL_CR_INC_CRS_POST_CKPT
@ OPAL_CR_INC_CRS_POST_CKPT
Definition: opal_cr.h:309
OPAL_CR_INC_CRS_PRE_CKPT
@ OPAL_CR_INC_CRS_PRE_CKPT
Definition: opal_cr.h:308
opal_cr_set_time
OPAL_DECLSPEC void opal_cr_set_time(int idx)
Checkpoint life-cycle timing.
Definition: opal_cr.c:1219
OPAL_CR_STATUS_RESTART_PRE
@ OPAL_CR_STATUS_RESTART_PRE
Definition: opal_cr.h:65
event
Definition: event_struct.h:87
BEGIN_C_DECLS
#define BEGIN_C_DECLS
code that should be in ompi_config_bottom.h regardless of build status
Definition: opal_config_bottom.h:85
OPAL_CR_STATUS_NONE
@ OPAL_CR_STATUS_NONE
Definition: opal_cr.h:58
OPAL_CR_STATUS_RUNNING
@ OPAL_CR_STATUS_RUNNING
Definition: opal_cr.h:60
opal_crs_base_ckpt_options_1_0_0_t
Definition: crs.h:79
opal_cr_refresh_environ
OPAL_DECLSPEC int opal_cr_refresh_environ(int prev_pid)
Definition: opal_cr.c:920
OPAL_CR_INC_POST_CRS_POST_MPI
@ OPAL_CR_INC_POST_CRS_POST_MPI
Definition: opal_cr.h:311
opal_cr_reg_coord_callback
OPAL_DECLSPEC int opal_cr_reg_coord_callback(opal_cr_coord_callback_fn_t new_func, opal_cr_coord_callback_fn_t *prev_func)
Register a checkpoint coordination routine for a higher level.
Definition: opal_cr.c:899
opal_cr_inc_core_ckpt
OPAL_DECLSPEC int opal_cr_inc_core_ckpt(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_base_ckpt_options_t *options, int *state)
Definition: opal_cr.c:654
opal_cr_finalize
OPAL_DECLSPEC int opal_cr_finalize(void)
Finalize the notification and coordination elements.
Definition: opal_cr.c:493
output.h
opal_cr_currently_stalled
OPAL_DECLSPEC bool opal_cr_currently_stalled
Definition: opal_cr.c:89
OPAL_CR_INC_MAX
@ OPAL_CR_INC_MAX
Definition: opal_cr.h:312
opal_cr_init
OPAL_DECLSPEC int opal_cr_init(void)
Initialize the notification and coordination elements.
Definition: opal_cr.c:326
prev_func
int * prev_func
Definition: dict_private.h:30
OPAL_CHECKPOINT_CMD_NULL
@ OPAL_CHECKPOINT_CMD_NULL
Definition: opal_cr.h:55
ompi_trigger_user_inc_callback
OPAL_DECLSPEC int ompi_trigger_user_inc_callback(opal_cr_user_inc_callback_event_t event, opal_cr_user_inc_callback_state_t state)
Definition: opal_cr.c:885
OPAL_CR_INC_STATE_PREPARE
@ OPAL_CR_INC_STATE_PREPARE
Definition: opal_cr.h:316
opal_cr_clear_timers
OPAL_DECLSPEC void opal_cr_clear_timers(void)
Definition: opal_cr.c:1228
OPAL_DECLSPEC
#define OPAL_DECLSPEC
Definition: opal_config_bottom.h:253
OPAL_CR_STATUS_RESTART_POST
@ OPAL_CR_STATUS_RESTART_POST
Definition: opal_cr.h:66
opal_cr_timing_enabled
OPAL_DECLSPEC bool opal_cr_timing_enabled
Definition: opal_cr.c:98
event.h
OPAL_CR_INC_PRE_CRS_PRE_MPI
@ OPAL_CR_INC_PRE_CRS_PRE_MPI
Definition: opal_cr.h:306
OPAL_CHECKPOINT_CMD_START
@ OPAL_CHECKPOINT_CMD_START
Definition: opal_cr.h:53
opal_cr_inc_core_prep
OPAL_DECLSPEC int opal_cr_inc_core_prep(void)
Notification Routines.
Definition: opal_cr.c:617
OPAL_CR_INC_STATE_CONTINUE
@ OPAL_CR_INC_STATE_CONTINUE
Definition: opal_cr.h:317
opal_cr_stall_check
OPAL_DECLSPEC bool opal_cr_stall_check
Global Var Decls.
Definition: opal_cr.c:88
OPAL_CR_STATUS_TERM
@ OPAL_CR_STATUS_TERM
Definition: opal_cr.h:61
opal_cr_set_enabled
OPAL_DECLSPEC int opal_cr_set_enabled(bool)
Definition: opal_cr.c:192
opal_cr_coord
OPAL_DECLSPEC int opal_cr_coord(int state)
OPAL Checkpoint Coordination Routine.
Definition: opal_cr.c:799
opal_cr_checkpointing_state
OPAL_DECLSPEC int opal_cr_checkpointing_state
Definition: opal_cr.c:124
opal_cr_timing_my_rank
OPAL_DECLSPEC int opal_cr_timing_my_rank
Definition: opal_cr.c:99
pid
int pid
Definition: prun.c:96
OPAL_CR_STATUS_CONTINUE
@ OPAL_CR_STATUS_CONTINUE
Definition: opal_cr.h:63
crs.h
opal_cr_inc_core
OPAL_DECLSPEC int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_base_ckpt_options_t *options, int *state)
Function to go through the INC.
Definition: opal_cr.c:761
opal_cr_entry_point_signal
OPAL_DECLSPEC int opal_cr_entry_point_signal
Definition: opal_cr.c:119
opal_cr_reg_notify_callback
OPAL_DECLSPEC int opal_cr_reg_notify_callback(opal_cr_notify_callback_fn_t new_func, opal_cr_notify_callback_fn_t *prev_func)
Definition: opal_cr.c:845
opal_cr_user_inc_register_callback
OPAL_DECLSPEC int opal_cr_user_inc_register_callback(opal_cr_user_inc_callback_event_t event, opal_cr_user_inc_callback_fn_t function, opal_cr_user_inc_callback_fn_t *prev_function)
Definition: opal_cr.c:866
opal_cr_timing_barrier_enabled
OPAL_DECLSPEC bool opal_cr_timing_barrier_enabled
Definition: opal_cr.c:97
opal_cr_pipe_dir
OPAL_DECLSPEC char * opal_cr_pipe_dir
Interface Functions & Vars.
Definition: opal_cr.c:118
END_C_DECLS
#define END_C_DECLS
Definition: opal_config_bottom.h:86
opal_cr_user_inc_callback_fn_t
int(* opal_cr_user_inc_callback_fn_t)(opal_cr_user_inc_callback_event_t event, opal_cr_user_inc_callback_state_t state)
User coordination callback routine.
Definition: opal_cr.h:325
OPAL_CHECKPOINT_CMD_ERROR
@ OPAL_CHECKPOINT_CMD_ERROR
Definition: opal_cr.h:56
opal_cr_timing_target_rank
OPAL_DECLSPEC int opal_cr_timing_target_rank
Definition: opal_cr.c:100
OPAL_CR_STATUS_REQUESTED
@ OPAL_CR_STATUS_REQUESTED
Definition: opal_cr.h:59