diff --git a/ompi/mca/mtl/psm2/help-mtl-psm2.txt b/ompi/mca/mtl/psm2/help-mtl-psm2.txt index ee876efd209..7728e4d7a37 100644 --- a/ompi/mca/mtl/psm2/help-mtl-psm2.txt +++ b/ompi/mca/mtl/psm2/help-mtl-psm2.txt @@ -1,7 +1,7 @@ # -*- text -*- # # Copyright (C) 2009. QLogic Corporation. All rights reserved. -# Copyright (c) 2013-2015 Intel, Inc. All rights reserved. +# Copyright (c) 2013-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -47,5 +47,17 @@ Unknown path record query mechanism %s. Supported mechanisms are %s. Message size %llu bigger than supported by PSM2 API. Max = %llu # [no psm2 cuda env] -Using CUDA enabled OpenMPI but PSM2_CUDA environment variable is %s. -This is not a recommended combination. If the application uses %s. +Warning: Open MPI has detected that you are running in an environment with CUDA +devices present and that you are using Intel(r) Omni-Path networking. However, +the environment variable PSM2_CUDA was not set, meaning that the PSM2 Omni-Path +networking library was not told how to handle CUDA support. + +If your application uses CUDA buffers, you should set the environment variable +PSM2_CUDA to 1; otherwise, set it to 0. Setting the variable to the wrong value +can have performance implications on your application, or even cause it to +crash. + +Since it was not set, Open MPI has defaulted to setting the PSM2_CUDA +environment variable to 1. + +Local hostname: %s diff --git a/ompi/mca/mtl/psm2/mtl_psm2_component.c b/ompi/mca/mtl/psm2/mtl_psm2_component.c index 2c183d48eac..b50b4452ef6 100644 --- a/ompi/mca/mtl/psm2/mtl_psm2_component.c +++ b/ompi/mca/mtl/psm2/mtl_psm2_component.c @@ -13,7 +13,7 @@ * Copyright (c) 2006-2010 QLogic Corporation. All rights reserved. * Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. 
All rights reserved * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -28,6 +28,7 @@ #include "opal/mca/event/event.h" #include "opal/util/output.h" #include "opal/util/show_help.h" +#include "opal/util/opal_environ.h" #include "ompi/proc/proc.h" #include "mtl_psm2.h" @@ -45,6 +46,10 @@ static int param_priority; /* MPI_THREAD_MULTIPLE_SUPPORT */ opal_mutex_t mtl_psm2_mq_mutex = OPAL_MUTEX_STATIC_INIT; +#if OPAL_CUDA_SUPPORT +static bool cuda_envvar_set = false; +#endif + static int ompi_mtl_psm2_component_open(void); static int ompi_mtl_psm2_component_close(void); static int ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority); @@ -201,9 +206,6 @@ static int ompi_mtl_psm2_component_register(void) { int num_local_procs, num_total_procs; -#if OPAL_CUDA_SUPPORT - char *cuda_env; -#endif ompi_mtl_psm2.connect_timeout = 180; (void) mca_base_component_var_register(&mca_mtl_psm2_component.super.mtl_version, @@ -228,30 +230,6 @@ ompi_mtl_psm2_component_register(void) param_priority = 40; } -#if OPAL_CUDA_SUPPORT - /* - * If using CUDA enabled OpenMPI, the user likely intends to - * run with CUDA buffers. So, force-set the envvar here if user failed - * to set it. - */ - cuda_env = getenv("PSM2_CUDA"); - if (!cuda_env) { - opal_show_help("help-mtl-psm2.txt", - "no psm2 cuda env", true, - "not set", - "Host buffers,\nthere will be a performance penalty" - " due to OMPI force setting this variable now.\n" - "Set environment variable to 0 if using Host buffers" ); - setenv("PSM2_CUDA", "1", 0); - } else if (strcmp(cuda_env, "0") == 0) { - opal_show_help("help-mtl-psm2.txt", - "no psm2 cuda env", true, - "set to 0", - "CUDA buffers,\nthe execution will SEGFAULT." 
- " Set environment variable to 1 if using CUDA buffers"); - } -#endif - (void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version, "priority", "Priority of the PSM2 MTL component", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, @@ -272,17 +250,16 @@ static int ompi_mtl_psm2_component_open(void) { int res; - glob_t globbuf; - globbuf.gl_offs = 0; + glob_t globbuf = {0}; /* Component available only if Omni-Path hardware is present */ res = glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf); - if (0 == res || GLOB_NOMATCH == res) { + if (globbuf.gl_pathc > 0) { globfree(&globbuf); } if (0 != res) { res = glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf); - if (0 == res || GLOB_NOMATCH == res) { + if (globbuf.gl_pathc > 0) { globfree(&globbuf); } if (0 != res) { @@ -336,6 +313,11 @@ ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority) static int ompi_mtl_psm2_component_close(void) { +#if OPAL_CUDA_SUPPORT + if (cuda_envvar_set) { + opal_unsetenv("PSM2_CUDA", &environ); + } +#endif return OMPI_SUCCESS; } @@ -362,6 +344,11 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads, int verno_major = PSM2_VERNO_MAJOR; int verno_minor = PSM2_VERNO_MINOR; int local_rank = -1, num_local_procs = 0; +#if OPAL_CUDA_SUPPORT + int ret; + char *cuda_env; + glob_t globbuf = {0}; +#endif /* Compute the total number of processes on this host and our local rank * on that node. We need to provide PSM2 with these values so it can @@ -389,6 +376,27 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads, ompi_mtl_psm2_set_shadow_env (ompi_mtl_psm2_shadow_variables + i); } +#if OPAL_CUDA_SUPPORT + /* + * If using CUDA enabled Open MPI, the user likely intends to + * run with CUDA buffers. So, force-set the envvar here if user failed + * to set it. 
+ */ + ret = glob("/sys/module/nvidia", GLOB_DOOFFS, NULL, &globbuf); + if (globbuf.gl_pathc > 0) { + globfree(&globbuf); + } + + cuda_env = getenv("PSM2_CUDA"); + if (!cuda_env && (0 == ret)) { + opal_show_help("help-mtl-psm2.txt", + "no psm2 cuda env", true, + ompi_process_info.nodename); + opal_setenv("PSM2_CUDA", "1", false, &environ); + cuda_envvar_set = true; + } +#endif + err = psm2_init(&verno_major, &verno_minor); if (err) { opal_show_help("help-mtl-psm2.txt",