Skip to content

Commit 20b5e0e

Browse files
btl/ofi: fault tolerance
Signed-off-by: Matthew Whitlock <mwhitlo@sandia.gov>
1 parent 333d8ad commit 20b5e0e

File tree

5 files changed

+91
-55
lines changed

5 files changed

+91
-55
lines changed

opal/mca/btl/ofi/btl_ofi.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,8 @@ struct mca_btl_ofi_module_t {
139139

140140
/** registration cache */
141141
mca_rcache_base_module_t *rcache;
142+
143+
mca_btl_base_module_error_cb_fn_t ofi_error_cb;
142144
};
143145
typedef struct mca_btl_ofi_module_t mca_btl_ofi_module_t;
144146

opal/mca/btl/ofi/btl_ofi_context.c

Lines changed: 75 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,56 @@ mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl)
310310
return &btl->contexts[rr_num++ % btl->num_contexts];
311311
}
312312

313+
/**
 * Dispatch one completion-queue entry to its type-specific handler and then
 * hand the completion object back to its free list.
 *
 * @param context     OFI context the entry was read from; forwarded to
 *                    mca_btl_ofi_recv_frag() for RECV completions.
 * @param op_context  op_context value taken from the CQ (or CQ error) entry.
 *                    It is interpreted as a mca_btl_ofi_completion_context_t
 *                    and dereferenced unconditionally, so it must not be NULL.
 * @param rc          Status forwarded to the RDMA callback, to
 *                    mca_btl_ofi_recv_frag(), or to mca_btl_ofi_frag_complete(),
 *                    depending on the completion type.
 */
static inline void complete_op_context(mca_btl_ofi_context_t *context, void *op_context, int rc)
{
    mca_btl_ofi_completion_context_t *c_ctx = (mca_btl_ofi_completion_context_t *) op_context;
    mca_btl_ofi_base_completion_t *comp = (mca_btl_ofi_base_completion_t *) c_ctx->comp;
    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) comp->btl;

    /* c_ctx->comp is viewed as the concrete completion type only inside the
     * case that matches comp->type.
     * NOTE(review): presumably every concrete completion type starts with the
     * base completion so these casts are layout-compatible -- confirm against
     * the type definitions. */
    switch (comp->type) {
    case MCA_BTL_OFI_TYPE_GET:
    case MCA_BTL_OFI_TYPE_PUT:
    case MCA_BTL_OFI_TYPE_AOP:
    case MCA_BTL_OFI_TYPE_AFOP:
    case MCA_BTL_OFI_TYPE_CSWAP: {
        mca_btl_ofi_rdma_completion_t *rdma_comp = (mca_btl_ofi_rdma_completion_t *) c_ctx->comp;

        /* call the callback, if the initiator registered one */
        if (rdma_comp->cbfunc) {
            rdma_comp->cbfunc(comp->btl, comp->endpoint, rdma_comp->local_address,
                              rdma_comp->local_handle, rdma_comp->cbcontext,
                              rdma_comp->cbdata, rc);
        }

        MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
        break;
    }

    case MCA_BTL_OFI_TYPE_RECV: {
        mca_btl_ofi_frag_completion_t *frag_comp = (mca_btl_ofi_frag_completion_t *) c_ctx->comp;

        mca_btl_ofi_recv_frag(ofi_btl, (mca_btl_ofi_endpoint_t *) comp->endpoint, context,
                              frag_comp->frag, rc);
        break;
    }

    case MCA_BTL_OFI_TYPE_SEND: {
        mca_btl_ofi_frag_completion_t *frag_comp = (mca_btl_ofi_frag_completion_t *) c_ctx->comp;

        MCA_BTL_OFI_NUM_SEND_DEC(ofi_btl);
        mca_btl_ofi_frag_complete(frag_comp->frag, rc);
        break;
    }

    default:
        /* catastrophic */
        BTL_ERROR(("unknown completion type"));
        MCA_BTL_OFI_ABORT();
        break;
    }

    /* return the completion handler */
    opal_free_list_return(comp->my_list, (opal_free_list_item_t *) comp);
}
362+
313363
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context)
314364
{
315365

@@ -319,61 +369,14 @@ int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context)
319369
struct fi_cq_entry cq_entry[MCA_BTL_OFI_DEFAULT_MAX_CQE];
320370
struct fi_cq_err_entry cqerr = {0};
321371

322-
mca_btl_ofi_completion_context_t *c_ctx;
323-
mca_btl_ofi_base_completion_t *comp;
324-
mca_btl_ofi_rdma_completion_t *rdma_comp;
325-
mca_btl_ofi_frag_completion_t *frag_comp;
326-
327372
ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read);
328373

329374
if (0 < ret) {
330375
events_read = ret;
331376
for (int i = 0; i < events_read; i++) {
332377
if (NULL != cq_entry[i].op_context) {
333378
++events;
334-
335-
c_ctx = (mca_btl_ofi_completion_context_t *) cq_entry[i].op_context;
336-
337-
/* We are casting to every type here just for simplicity. */
338-
comp = (mca_btl_ofi_base_completion_t *) c_ctx->comp;
339-
frag_comp = (mca_btl_ofi_frag_completion_t *) c_ctx->comp;
340-
rdma_comp = (mca_btl_ofi_rdma_completion_t *) c_ctx->comp;
341-
342-
switch (comp->type) {
343-
case MCA_BTL_OFI_TYPE_GET:
344-
case MCA_BTL_OFI_TYPE_PUT:
345-
case MCA_BTL_OFI_TYPE_AOP:
346-
case MCA_BTL_OFI_TYPE_AFOP:
347-
case MCA_BTL_OFI_TYPE_CSWAP:
348-
/* call the callback */
349-
if (rdma_comp->cbfunc) {
350-
rdma_comp->cbfunc(comp->btl, comp->endpoint, rdma_comp->local_address,
351-
rdma_comp->local_handle, rdma_comp->cbcontext,
352-
rdma_comp->cbdata, OPAL_SUCCESS);
353-
}
354-
355-
MCA_BTL_OFI_NUM_RDMA_DEC((mca_btl_ofi_module_t *) comp->btl);
356-
break;
357-
358-
case MCA_BTL_OFI_TYPE_RECV:
359-
mca_btl_ofi_recv_frag((mca_btl_ofi_module_t *) comp->btl,
360-
(mca_btl_ofi_endpoint_t *) comp->endpoint, context,
361-
frag_comp->frag);
362-
break;
363-
364-
case MCA_BTL_OFI_TYPE_SEND:
365-
MCA_BTL_OFI_NUM_SEND_DEC((mca_btl_ofi_module_t *) comp->btl);
366-
mca_btl_ofi_frag_complete(frag_comp->frag, OPAL_SUCCESS);
367-
break;
368-
369-
default:
370-
/* catastrophic */
371-
BTL_ERROR(("unknown completion type"));
372-
MCA_BTL_OFI_ABORT();
373-
}
374-
375-
/* return the completion handler */
376-
opal_free_list_return(comp->my_list, (opal_free_list_item_t *) comp);
379+
complete_op_context(context, cq_entry[i].op_context, OPAL_SUCCESS);
377380
}
378381
}
379382
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
@@ -383,10 +386,31 @@ int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context)
383386
if (0 > ret) {
384387
BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)", __FILE__, __LINE__,
385388
fi_strerror(-ret), ret));
386-
} else {
387-
BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n", cqerr.prov_errno));
389+
MCA_BTL_OFI_ABORT();
390+
} else if(NULL != cqerr.op_context){
391+
switch(cqerr.err) {
392+
case -FI_EIO: {
393+
mca_btl_ofi_completion_context_t *c_ctx =
394+
(mca_btl_ofi_completion_context_t*) cqerr.op_context;
395+
mca_btl_ofi_base_completion_t *comp =
396+
(mca_btl_ofi_base_completion_t*) c_ctx->comp;
397+
mca_btl_ofi_module_t *ofi_btl =
398+
(mca_btl_ofi_module_t*) comp->btl;
399+
if(ofi_btl->ofi_error_cb){
400+
ofi_btl->ofi_error_cb(comp->btl, 0, comp->endpoint->ep_proc,
401+
"IO error reported by libfabric");
402+
}
403+
404+
++events;
405+
complete_op_context(context, cqerr.op_context, OPAL_ERR_UNREACH);
406+
break;
407+
}
408+
default:
409+
BTL_ERROR(("fi_cq_readerr: %s(%d) (provider err_code = %d)\n",
410+
fi_strerror(-cqerr.err), cqerr.err, cqerr.prov_errno));
411+
MCA_BTL_OFI_ABORT();
412+
}
388413
}
389-
MCA_BTL_OFI_ABORT();
390414
}
391415
#ifdef FI_EINTR
392416
/* sometimes, sockets provider complain about interrupt. We do nothing. */

opal/mca/btl/ofi/btl_ofi_frag.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,9 @@ int mca_btl_ofi_send(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
145145
}
146146

147147
int mca_btl_ofi_recv_frag(mca_btl_ofi_module_t *ofi_btl, mca_btl_base_endpoint_t *endpoint,
148-
mca_btl_ofi_context_t *context, mca_btl_ofi_base_frag_t *frag)
148+
mca_btl_ofi_context_t *context, mca_btl_ofi_base_frag_t *frag,
149+
int rc)
149150
{
150-
int rc;
151151
mca_btl_active_message_callback_t *reg = mca_btl_base_active_message_trigger + frag->hdr.tag;
152152
mca_btl_base_segment_t segment = {.seg_addr.pval = (void *) (frag + 1),
153153
.seg_len = frag->hdr.len};
@@ -160,7 +160,7 @@ int mca_btl_ofi_recv_frag(mca_btl_ofi_module_t *ofi_btl, mca_btl_base_endpoint_t
160160

161161
/* call the callback */
162162
reg->cbfunc(&ofi_btl->super, &recv_desc);
163-
mca_btl_ofi_frag_complete(frag, OPAL_SUCCESS);
163+
mca_btl_ofi_frag_complete(frag, rc);
164164

165165
/* repost the recv */
166166
rc = mca_btl_ofi_post_recvs((mca_btl_base_module_t *) ofi_btl, context, 1);

opal/mca/btl/ofi/btl_ofi_frag.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ int mca_btl_ofi_send(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
3838
mca_btl_base_descriptor_t *descriptor, mca_btl_base_tag_t tag);
3939

4040
int mca_btl_ofi_recv_frag(mca_btl_ofi_module_t *ofi_btl, mca_btl_base_endpoint_t *endpoint,
41-
mca_btl_ofi_context_t *context, mca_btl_ofi_base_frag_t *frag);
41+
mca_btl_ofi_context_t *context, mca_btl_ofi_base_frag_t *frag,
42+
int rc);
4243

4344
struct mca_btl_base_descriptor_t *mca_btl_ofi_prepare_src(mca_btl_base_module_t *btl,
4445
mca_btl_base_endpoint_t *endpoint,

opal/mca/btl/ofi/btl_ofi_module.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,14 @@ static int mca_btl_ofi_del_procs(mca_btl_base_module_t *btl, size_t nprocs, opal
143143
return OPAL_SUCCESS;
144144
}
145145

146+
static int mca_btl_ofi_register_error(mca_btl_base_module_t *btl,
147+
mca_btl_base_module_error_cb_fn_t cb)
148+
{
149+
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
150+
ofi_btl->ofi_error_cb = cb;
151+
return OPAL_SUCCESS;
152+
}
153+
146154
void mca_btl_ofi_rcache_init(mca_btl_ofi_module_t *module)
147155
{
148156
if (!module->initialized) {
@@ -515,4 +523,5 @@ mca_btl_ofi_module_t mca_btl_ofi_module_template = {
515523
.btl_add_procs = mca_btl_ofi_add_procs,
516524
.btl_del_procs = mca_btl_ofi_del_procs,
517525
.btl_finalize = mca_btl_ofi_finalize,
526+
.btl_register_error = mca_btl_ofi_register_error,
518527
}};

0 commit comments

Comments
 (0)