From aaf00e97ae0ed2ab95913876837ff839764089a3 Mon Sep 17 00:00:00 2001 From: Scott Hilton Date: Mon, 21 Mar 2022 11:40:25 -0600 Subject: [PATCH 1/8] Consolidate duplicate code _pick_step_cores() Bug 13444 --- src/slurmctld/step_mgr.c | 138 +++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 77 deletions(-) diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 7fdc115cab..c91f5061ff 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -1882,23 +1882,65 @@ static bool _pick_step_core(step_record_t *step_ptr, } static bool _handle_core_select(step_record_t *step_ptr, - job_resources_t *job_resrcs_ptr, - int job_node_inx, - int sock_inx, int core_inx, bool use_all_cores, - bool oversubscribing_cpus, int *cpu_cnt) + job_resources_t *job_resrcs_ptr, + int job_node_inx, uint16_t sockets, + uint16_t cores, bool use_all_cores, + bool oversubscribing_cpus, int *cpu_cnt) { + int core_inx, i, sock_inx; + static int last_core_inx; + xassert(cpu_cnt); - if (!_pick_step_core(step_ptr, - job_resrcs_ptr, - job_node_inx, sock_inx, - core_inx, use_all_cores, - oversubscribing_cpus)) - return false; + /* + * Use last_core_inx to avoid putting all of the extra + * work onto core zero when oversubscribing cpus. + */ + if (oversubscribing_cpus) + last_core_inx = (last_core_inx + 1) % cores; - if (--(*cpu_cnt) == 0) - return true; + /* + * Figure out the task distribution. The default is to cyclically + * distribute to sockets. + */ + if (step_ptr->step_layout && + (step_ptr->step_layout->task_dist & SLURM_DIST_SOCKBLOCK)) { + /* Fill sockets before allocating to the next socket */ + for (sock_inx=0; sock_inx < sockets; sock_inx++) { + for (i=0; i < cores; i++) { + if (oversubscribing_cpus) + core_inx = (last_core_inx + i) % cores; + else + core_inx = i; + + if (!_pick_step_core(step_ptr, job_resrcs_ptr, + job_node_inx, sock_inx, + core_inx, use_all_cores, + oversubscribing_cpus)) + continue; + + if (--(*cpu_cnt) == 0) + return true; + } + } + } else { + for (i = 0; i < cores; i++) { + if (oversubscribing_cpus) + core_inx = (last_core_inx + i) % cores; + else + core_inx = i; + for (sock_inx = 0; sock_inx < sockets; sock_inx++) { + if (!_pick_step_core(step_ptr, job_resrcs_ptr, + job_node_inx, sock_inx, + core_inx, use_all_cores, + oversubscribing_cpus)) + continue; + if (--(*cpu_cnt) == 0) + return true; + } + } + } return false; } @@ -1947,38 +1989,9 @@ static int _pick_step_cores(step_record_t *step_ptr, } /* select idle cores first */ - /* - * Figure out the task distribution. The default is to cyclically - * distribute to sockets. 
- */ - if (step_ptr->step_layout && - (step_ptr->step_layout->task_dist & SLURM_DIST_SOCKBLOCK)) { - /* Fill sockets before allocating to the next socket */ - for (sock_inx=0; sock_inx < sockets; sock_inx++) { - for (core_inx=0; core_inx < cores; core_inx++) { - if (_handle_core_select(step_ptr, - job_resrcs_ptr, - job_node_inx, sock_inx, - core_inx, - use_all_cores, - false, &cpu_cnt)) - return SLURM_SUCCESS; - } - } - } else { - /* Cyclically allocate cores across sockets */ - for (core_inx=0; core_inx < cores; core_inx++) { - for (sock_inx=0; sock_inx < sockets; sock_inx++) { - if (_handle_core_select(step_ptr, - job_resrcs_ptr, - job_node_inx, sock_inx, - core_inx, - use_all_cores, - false, &cpu_cnt)) - return SLURM_SUCCESS; - } - } - } + if (_handle_core_select(step_ptr, job_resrcs_ptr, job_node_inx, sockets, + cores, use_all_cores, false, &cpu_cnt)) + return SLURM_SUCCESS; /* The test for cores==0 is just to avoid CLANG errors. * It should never happen */ @@ -1988,46 +2001,17 @@ static int _pick_step_cores(step_record_t *step_ptr, if (!(step_ptr->flags & SSF_OVERCOMMIT)) return ESLURM_NODES_BUSY; - /* We need to over-subscribe one or more cores. - * Use last_core_inx to avoid putting all of the extra - * work onto core zero */ + /* We need to over-subscribe one or more cores. */ verbose("%s: %pS needs to over-subscribe cores required:%u assigned:%u/%"PRIu64 " overcommit:%c exclusive:%c", __func__, step_ptr, cores, bit_set_count(job_resrcs_ptr->core_bitmap), bit_size(job_resrcs_ptr->core_bitmap), ((step_ptr->flags & SSF_OVERCOMMIT) ? 'T' : 'F'), ((step_ptr->flags & SSF_EXCLUSIVE) ? 'T' : 'F')); - last_core_inx = (last_core_inx + 1) % cores; - if (step_ptr->step_layout && - (step_ptr->step_layout->task_dist & SLURM_DIST_SOCKBLOCK)) { - /* Fill sockets before allocating to the next socket */ - for (sock_inx=0; sock_inx < sockets; sock_inx++) { - for (i=0; i < cores; i++) { - core_inx = (last_core_inx + i) % cores; - if (_handle_core_select(step_ptr, - job_resrcs_ptr, - job_node_inx, sock_inx, - core_inx, - use_all_cores, - true, &cpu_cnt)) - return SLURM_SUCCESS; - } - } - } else { - for (i=0; i < cores; i++) { - core_inx = (last_core_inx + i) % cores; - for (sock_inx=0; sock_inx < sockets; sock_inx++) { - if (_handle_core_select(step_ptr, - job_resrcs_ptr, - job_node_inx, sock_inx, - core_inx, - use_all_cores, - true, &cpu_cnt)) - return SLURM_SUCCESS; - } - } - } + if (_handle_core_select(step_ptr, job_resrcs_ptr, job_node_inx, sockets, + cores, use_all_cores, true, &cpu_cnt)) + return SLURM_SUCCESS; return SLURM_SUCCESS; } -- 2.32.0 From f7b33622f688415a5e843b414035f049e24d7b8c Mon Sep 17 00:00:00 2001 From: Scott Hilton Date: Mon, 21 Mar 2022 12:01:03 -0600 Subject: [PATCH 2/8] Pick step cores based on gres topology. Bug 13444 --- NEWS | 1 + src/slurmctld/step_mgr.c | 163 +++++++++++++++++++++++++++++++++++---- 2 files changed, 149 insertions(+), 15 deletions(-) diff --git a/NEWS b/NEWS index 16099d2dee..d81031b22c 100644 --- a/NEWS +++ b/NEWS @@ -154,6 +154,7 @@ documents those changes that are of interest to users and administrators. -- openapi/dbv0.0.38 - disable automatic lookup of all HetJob components on specific job lookups. -- Add support for hourly reoccurring reservations. + -- Steps now allocate gres according to topo data. 
 * Changes in Slurm 21.08.7
 ==========================

diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index c91f5061ff..a72a54f518 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -1838,9 +1838,110 @@ static int _count_cpus(job_record_t *job_ptr, bitstr_t *bitmap,
 	return sum;
 }
 
+static bool _filter_cores(gres_state_t *gres_state_step,
+			  gres_state_t *gres_state_node,
+			  bitstr_t *avail_core_bitmap, int core_start_bit,
+			  int job_node_inx)
+{
+	int i, j, core_ctld;
+	gres_node_state_t *gres_ns = gres_state_node->gres_data;
+	gres_step_state_t *gres_ss = gres_state_step->gres_data;
+
+	if (!gres_ns->topo_cnt) /* No topology info */
+		return false;
+
+	/* Determine which specific cores can be used */
+	for (i = 0; i < gres_ns->topo_cnt; i++) {
+		if (!bit_overlap_any(gres_ss->gres_bit_alloc[job_node_inx],
+				     gres_ns->topo_gres_bitmap[i]))
+			continue;
+		if (!gres_ns->topo_core_bitmap[i]) {
+			/* No filter; the caller frees avail_core_bitmap */
+			return false;
+		}
+		core_ctld = bit_size(gres_ns->topo_core_bitmap[i]);
+		for (j = 0; j < core_ctld; j++) {
+			if (bit_test(gres_ns->topo_core_bitmap[i], j)) {
+				bit_set(avail_core_bitmap, core_start_bit + j);
+			}
+		}
+	}
+	return true;
+}
+
+static void _set_full_sockets(bitstr_t *avail_core_bitmap,
+			      job_resources_t *job_resrcs_ptr, int job_node_inx,
+			      int sockets, int cores)
+{
+	int sock_inx, start, end;
+	for (sock_inx = 0; sock_inx < sockets; sock_inx++) {
+		start = get_job_resources_offset(job_resrcs_ptr, job_node_inx,
+						 sock_inx, 0);
+		end = get_job_resources_offset(job_resrcs_ptr, job_node_inx,
+					       sock_inx, cores - 1);
+		if (!bit_set_count_range(avail_core_bitmap, start, end))
+			continue;
+		bit_nset(avail_core_bitmap, start, end);
+	}
+}
+
+/*
+ * Clear the core_bitmap for cores which are not bound to the allocated gres
+ * IN step_gres_list - step's gres_list_alloc
+ * IN job_resrcs_ptr - job's resources structure (for socket/core offsets)
+ * IN/OUT core_bitmap - identification of available cores
+ * IN node_inx - global index of the node
+ * IN job_node_inx/sockets/cores - job-relative node index and node layout
+ */
+static void _gres_filter_avail_sockets(List step_gres_list,
+				       job_resources_t *job_resrcs_ptr,
+				       bitstr_t *core_bitmap, int node_inx,
+				       int job_node_inx, int sockets, int cores)
+{
+	ListIterator step_gres_iter;
+	gres_state_t *step_gres_state, *gres_state_node;
+	bitstr_t *avail_core_bitmap;
+	List node_gres_list = node_record_table_ptr[node_inx].gres_list;
+	int core_start_bit = cr_get_coremap_offset(node_inx);
+	int core_end_bit = cr_get_coremap_offset(node_inx + 1) - 1;
+
+	if ((step_gres_list == NULL) || (core_bitmap == NULL))
+		return;
+	if (node_gres_list == NULL) {
+		bit_nclear(core_bitmap, core_start_bit, core_end_bit);
+		return;
+	}
+
+	step_gres_iter = list_iterator_create(step_gres_list);
+	while ((step_gres_state = (gres_state_t *) list_next(step_gres_iter))) {
+		gres_state_node = list_find_first(node_gres_list, gres_find_id,
+						  &step_gres_state->plugin_id);
+		if (gres_state_node == NULL) {
+			/* node lacks resources required by the step */
+			bit_nclear(core_bitmap, core_start_bit, core_end_bit);
+			break;
+		}
+
+		avail_core_bitmap = bit_copy(core_bitmap);
+		bit_nclear(avail_core_bitmap, core_start_bit, core_end_bit);
+		if (_filter_cores(step_gres_state, gres_state_node,
+				  avail_core_bitmap, core_start_bit,
+				  job_node_inx)) {
+			_set_full_sockets(avail_core_bitmap, job_resrcs_ptr,
+					  job_node_inx, sockets, cores);
+			bit_and(core_bitmap, avail_core_bitmap);
+ } + FREE_NULL_BITMAP(avail_core_bitmap); + } + list_iterator_destroy(step_gres_iter); + + return; +} + /* Return true if a core was picked, false if not */ static bool _pick_step_core(step_record_t *step_ptr, - job_resources_t *job_resrcs_ptr, int job_node_inx, + job_resources_t *job_resrcs_ptr, + bitstr_t *avail_core_bitmap, int job_node_inx, int sock_inx, int core_inx, bool use_all_cores, bool oversubscribing_cpus) { @@ -1853,7 +1954,7 @@ static bool _pick_step_core(step_record_t *step_ptr, if (bit_offset < 0) fatal("get_job_resources_offset"); - if (!bit_test(job_resrcs_ptr->core_bitmap, bit_offset)) + if (!bit_test(avail_core_bitmap, bit_offset)) return false; if (oversubscribing_cpus) { @@ -1883,6 +1984,7 @@ static bool _pick_step_core(step_record_t *step_ptr, static bool _handle_core_select(step_record_t *step_ptr, job_resources_t *job_resrcs_ptr, + bitstr_t *avail_core_bitmap, int job_node_inx, uint16_t sockets, uint16_t cores, bool use_all_cores, bool oversubscribing_cpus, int *cpu_cnt) @@ -1914,6 +2016,7 @@ static bool _handle_core_select(step_record_t *step_ptr, core_inx = i; if (!_pick_step_core(step_ptr, job_resrcs_ptr, + avail_core_bitmap, job_node_inx, sock_inx, core_inx, use_all_cores, oversubscribing_cpus)) @@ -1931,6 +2034,7 @@ static bool _handle_core_select(step_record_t *step_ptr, core_inx = i; for (sock_inx = 0; sock_inx < sockets; sock_inx++) { if (!_pick_step_core(step_ptr, job_resrcs_ptr, + avail_core_bitmap, job_node_inx, sock_inx, core_inx, use_all_cores, oversubscribing_cpus)) @@ -1949,13 +2053,13 @@ static bool _handle_core_select(step_record_t *step_ptr, * and step's allocation */ static int _pick_step_cores(step_record_t *step_ptr, job_resources_t *job_resrcs_ptr, int job_node_inx, - uint16_t task_cnt, uint16_t cpus_per_core) + uint16_t task_cnt, uint16_t cpus_per_core, + int node_inx) { - int core_inx, i, sock_inx; uint16_t sockets, cores; int cpu_cnt = (int) task_cnt; bool use_all_cores; - static int last_core_inx; + bitstr_t *avail_core_bitmap = NULL; if (!step_ptr->core_bitmap_job) step_ptr->core_bitmap_job = @@ -1988,18 +2092,35 @@ static int _pick_step_cores(step_record_t *step_ptr, } - /* select idle cores first */ - if (_handle_core_select(step_ptr, job_resrcs_ptr, job_node_inx, sockets, - cores, use_all_cores, false, &cpu_cnt)) - return SLURM_SUCCESS; + avail_core_bitmap = bit_copy(job_resrcs_ptr->core_bitmap); + _gres_filter_avail_sockets(step_ptr->gres_list_alloc, + job_resrcs_ptr, + avail_core_bitmap, node_inx, + job_node_inx, sockets, cores); + + /* select idle cores that fit gres binding first */ + if (_handle_core_select(step_ptr, job_resrcs_ptr, + avail_core_bitmap, job_node_inx, + sockets, cores, use_all_cores, false, &cpu_cnt)) + goto cleanup; + + /* select all idle cores */ + if (_handle_core_select(step_ptr, job_resrcs_ptr, + job_resrcs_ptr->core_bitmap, job_node_inx, + sockets, cores, use_all_cores, false, &cpu_cnt)) + goto cleanup; + /* The test for cores==0 is just to avoid CLANG errors. * It should never happen */ if (use_all_cores || (cores == 0)) - return SLURM_SUCCESS; + goto cleanup; + - if (!(step_ptr->flags & SSF_OVERCOMMIT)) + if (!(step_ptr->flags & SSF_OVERCOMMIT)) { + FREE_NULL_BITMAP(avail_core_bitmap); return ESLURM_NODES_BUSY; + } /* We need to over-subscribe one or more cores. */ verbose("%s: %pS needs to over-subscribe cores required:%u assigned:%u/%"PRIu64 " overcommit:%c exclusive:%c", @@ -2009,10 +2130,22 @@ static int _pick_step_cores(step_record_t *step_ptr, ((step_ptr->flags & SSF_OVERCOMMIT) ? 
'T' : 'F'),
 			((step_ptr->flags & SSF_EXCLUSIVE) ? 'T' : 'F'));
 
-	if (_handle_core_select(step_ptr, job_resrcs_ptr, job_node_inx, sockets,
-				cores, use_all_cores, true, &cpu_cnt))
-		return SLURM_SUCCESS;
+	/* oversubscribe cores that fit gres binding first */
+	if (_handle_core_select(step_ptr, job_resrcs_ptr,
+				avail_core_bitmap, job_node_inx,
+				sockets, cores, use_all_cores, true, &cpu_cnt))
+		goto cleanup;
+
+	/* oversubscribe all cores */
+	if (_handle_core_select(step_ptr, job_resrcs_ptr,
+				job_resrcs_ptr->core_bitmap, job_node_inx,
+				sockets, cores, use_all_cores, true, &cpu_cnt))
+		goto cleanup;
+
+
+cleanup:
+	FREE_NULL_BITMAP(avail_core_bitmap);
 	return SLURM_SUCCESS;
 }
 
@@ -2279,7 +2412,7 @@ static int _step_alloc_lps(step_record_t *step_ptr)
 					       job_node_inx,
 					       step_ptr->step_layout->
 					       tasks[step_node_inx],
-					       cpus_per_core))) {
+					       cpus_per_core, i_node))) {
 			log_flag(STEPS, "unable to pick step cores for job node %d (%s): %s",
 				 job_node_inx,
 				 node_record_table_ptr[i_node].name,
-- 
2.32.0

From ff5f16c7502e03740509631c59e3062332274955 Mon Sep 17 00:00:00 2001
From: Scott Hilton
Date: Wed, 23 Mar 2022 12:04:20 -0600
Subject: [PATCH 3/8] Don't do unnecessary calculations

Bug 13444
---
 src/slurmctld/step_mgr.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index a72a54f518..e09b99fe41 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -2105,7 +2105,8 @@ static int _pick_step_cores(step_record_t *step_ptr,
 		goto cleanup;
 
 	/* select all idle cores */
-	if (_handle_core_select(step_ptr, job_resrcs_ptr,
+	if (!bit_equal(avail_core_bitmap, job_resrcs_ptr->core_bitmap) &&
+	    _handle_core_select(step_ptr, job_resrcs_ptr,
 				job_resrcs_ptr->core_bitmap, job_node_inx,
 				sockets, cores, use_all_cores, false, &cpu_cnt))
 		goto cleanup;
@@ -2138,7 +2139,8 @@ static int _pick_step_cores(step_record_t *step_ptr,
 
 
 	/* oversubscribe all cores */
-	if (_handle_core_select(step_ptr, job_resrcs_ptr,
+	if (!bit_equal(avail_core_bitmap, job_resrcs_ptr->core_bitmap) &&
+	    _handle_core_select(step_ptr, job_resrcs_ptr,
 				job_resrcs_ptr->core_bitmap, job_node_inx,
 				sockets, cores, use_all_cores, true, &cpu_cnt))
 		goto cleanup;
-- 
2.32.0

From d262661739b70116bc0897d31c01aa81687b6d0d Mon Sep 17 00:00:00 2001
From: Scott Hilton
Date: Wed, 23 Mar 2022 14:30:43 -0600
Subject: [PATCH 4/8] Make --gres-flags=enforce-binding work with steps

Bug 13444
---
 doc/man/man1/srun.1       | 19 ++++++++++++++++---
 slurm/slurm.h             |  2 ++
 src/common/slurm_opt.c    |  3 +++
 src/common/slurm_opt.h    |  1 +
 src/slurmctld/step_mgr.c  |  9 +++++++--
 src/srun/libsrun/launch.c |  4 ++++
 6 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index 2196e41f0b..1a0e51ffd7 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -1432,7 +1432,6 @@ to "none".
 .TP
 \fB\-\-gres\-flags\fR=<\fItype\fR>
 Specify generic resource task binding options.
-This option applies to job allocations.
 .IP
 .RS
 .TP
@@ -1443,13 +1442,14 @@ This option is currently required to use more CPUs than are bound to a GRES
 one socket are required to run the job).
 This option may permit a job to be allocated resources sooner than otherwise
 possible, but may result in lower job performance.
+This option applies to job allocations.
 .br
 \fBNOTE\fR: This option is specific to \fBSelectType=cons_res\fR.
 .IP
 .TP
 .B enforce\-binding
-The only CPUs available to the job will be those bound to the selected
+The only CPUs available to the job/step will be those bound to the selected
 GRES (i.e. the CPUs identified in the gres.conf file will be strictly
 enforced). This option may result in delayed initiation of a job.
 For example a job requiring two GPUs and one CPU will be delayed until both
 GPUs on a single socket are available rather than using GPUs bound to separate
 sockets, however, the application performance may be improved due to improved
 communication speed.
 Requires the node to be configured with more than one socket and resource
 filtering will be performed on a per\-socket basis.
+\fBNOTE\fR: This only applies to job steps that use \fB\-\-exact\fR, since by
+default a job step has access to all of the job's CPUs on the node.
+.br
+\fBNOTE\fR: If set for a job, all of its job steps will have enforce\-binding
+set unless \fB\-\-gres\-flags=none\fR is specified.
 .br
-\fBNOTE\fR: This option is specific to \fBSelectType=cons_tres\fR.
+\fBNOTE\fR: This option is specific to \fBSelectType=cons_tres\fR for job
+allocations.
+.RE
+.IP
+
+.TP
+.B none
+Used to prevent job steps from inheriting enforce\-binding from the job.
+This option applies to job steps.
 .RE
 .IP

diff --git a/slurm/slurm.h b/slurm/slurm.h
index 1e7eacbc21..4d82ba6679 100644
--- a/slurm/slurm.h
+++ b/slurm/slurm.h
@@ -1155,6 +1155,8 @@ typedef enum {
 					 * steps; resources allocated to this step
 					 * are not decremented from the job's
 					 * allocation */
+	SSF_GRES_ENFORCE_BIND = 1 << 7, /* Enforce CPU/GRES binding */
+	SSF_GRES_NO_ENFORCE_BIND = 1 << 8, /* Don't enforce CPU/GRES binding */
 } step_spec_flags_t;
 
 /*****************************************************************************\
diff --git a/src/common/slurm_opt.c b/src/common/slurm_opt.c
index 5948d6db6a..0157ab71e1 100644
--- a/src/common/slurm_opt.c
+++ b/src/common/slurm_opt.c
@@ -2132,6 +2132,9 @@ static int arg_set_gres_flags(slurm_opt_t *opt, const char *arg)
 		opt->job_flags |= GRES_DISABLE_BIND;
 	} else if (!xstrcasecmp(arg, "enforce-binding")) {
 		opt->job_flags |= GRES_ENFORCE_BIND;
+	} else if (!xstrcasecmp(arg, "none") &&
+		   opt->srun_opt) {
+		opt->srun_opt->gres_flags_none = true;
 	} else {
 		error("Invalid --gres-flags specification");
 		return SLURM_ERROR;
diff --git a/src/common/slurm_opt.h b/src/common/slurm_opt.h
index 184682c7cf..19fe1a2ce7 100644
--- a/src/common/slurm_opt.h
+++ b/src/common/slurm_opt.h
@@ -251,6 +251,7 @@ typedef struct {
 	char *epilog;			/* --epilog */
 	bool exact;			/* --exact */
 	bool exclusive;			/* --exclusive */
+	bool gres_flags_none;		/* --gres-flags=none */
 	bool interactive;		/* --interactive */
 	uint32_t jobid;			/* --jobid */
 	int32_t kill_bad_exit;		/* --kill-on-bad-exit */
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index e09b99fe41..ae19e8835d 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -2105,7 +2105,8 @@ static int _pick_step_cores(step_record_t *step_ptr,
 		goto cleanup;
 
 	/* select all idle cores */
-	if (!bit_equal(avail_core_bitmap, job_resrcs_ptr->core_bitmap) &&
+	if (!(step_ptr->flags & SSF_GRES_ENFORCE_BIND) &&
+	    !bit_equal(avail_core_bitmap, job_resrcs_ptr->core_bitmap) &&
 	    _handle_core_select(step_ptr, job_resrcs_ptr,
 				job_resrcs_ptr->core_bitmap, job_node_inx,
 				sockets, cores, use_all_cores, false, &cpu_cnt))
 		goto cleanup;
@@ -2139,7 +2140,8 @@ static int _pick_step_cores(step_record_t *step_ptr,
 
 
 	/* oversubscribe all cores */
-	if (!bit_equal(avail_core_bitmap, job_resrcs_ptr->core_bitmap) &&
+	if (!(step_ptr->flags & SSF_GRES_ENFORCE_BIND) &&
+	    !bit_equal(avail_core_bitmap, job_resrcs_ptr->core_bitmap) &&
 	    _handle_core_select(step_ptr, job_resrcs_ptr,
 				job_resrcs_ptr->core_bitmap, job_node_inx,
 				sockets, cores, use_all_cores, true, &cpu_cnt))
 		goto cleanup;
@@ -3206,6 +3208,9 @@ extern int step_create(job_step_create_request_msg_t *step_specs,
 	step_ptr->cpu_count = orig_cpu_count;
 	step_ptr->exit_code = NO_VAL;
 	step_ptr->flags = step_specs->flags;
+	if ((step_ptr->job_ptr->bit_flags & GRES_ENFORCE_BIND) &&
+	    !(step_ptr->flags & SSF_GRES_NO_ENFORCE_BIND))
+		step_ptr->flags |= SSF_GRES_ENFORCE_BIND;
 	step_ptr->ext_sensors = ext_sensors_alloc();
 
 	step_ptr->cpus_per_tres = xstrdup(step_specs->cpus_per_tres);
diff --git a/src/srun/libsrun/launch.c b/src/srun/libsrun/launch.c
index 182db1a184..245e2dbaa5 100644
--- a/src/srun/libsrun/launch.c
+++ b/src/srun/libsrun/launch.c
@@ -193,6 +193,10 @@ static job_step_create_request_msg_t *_create_job_step_create_request(
 		debug("interactive step launch request");
 		step_req->flags |= SSF_INTERACTIVE;
 	}
+	if (opt_local->job_flags & GRES_ENFORCE_BIND)
+		step_req->flags |= SSF_GRES_ENFORCE_BIND;
+	if (srun_opt->gres_flags_none)
+		step_req->flags |= SSF_GRES_NO_ENFORCE_BIND;
 	if (opt_local->immediate == 1)
 		step_req->immediate = opt_local->immediate;
 
-- 
2.32.0

From 85bce5ce45d0f5eaad06d0f16c6c56f0aa7f93b0 Mon Sep 17 00:00:00 2001
From: Scott Hilton
Date: Mon, 14 Mar 2022 10:24:02 -0600
Subject: [PATCH 5/8] Add core_bitmap to _step_alloc()

Bug 13444
---
 src/slurmctld/gres_ctld.c | 11 ++++++++---
 src/slurmctld/gres_ctld.h |  3 ++-
 src/slurmctld/step_mgr.c  |  9 +++++++--
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/slurmctld/gres_ctld.c b/src/slurmctld/gres_ctld.c
index ad3595fc70..00a6e0ab38 100644
--- a/src/slurmctld/gres_ctld.c
+++ b/src/slurmctld/gres_ctld.c
@@ -39,6 +39,7 @@
 #include "src/common/xstring.h"
 
 typedef struct {
+	bitstr_t *core_bitmap;
 	bool decr_job_alloc;
 	uint64_t gres_needed;
 	gres_key_t *job_search_key;
@@ -1974,7 +1975,8 @@ static int _step_alloc(gres_step_state_t *gres_ss,
 		       slurm_step_id_t *step_id,
 		       uint64_t *gres_needed, uint64_t *max_gres,
 		       bool decr_job_alloc,
-		       uint64_t *step_node_mem_alloc)
+		       uint64_t *step_node_mem_alloc,
+		       bitstr_t *core_bitmap)
 {
 	gres_job_state_t *gres_js = gres_state_job->gres_data;
 	gres_step_state_t *gres_ss_req = gres_state_step_req->gres_data;
@@ -2189,7 +2191,8 @@ static int _step_alloc_type(gres_state_t *gres_state_job,
 			       args->node_offset, &args->tmp_step_id,
 			       &args->gres_needed, &args->max_gres,
 			       args->decr_job_alloc,
-			       args->step_node_mem_alloc);
+			       args->step_node_mem_alloc,
+			       args->core_bitmap);
 
 	if (args->rc != SLURM_SUCCESS) {
 		return -1;
@@ -2221,7 +2224,8 @@ extern int gres_ctld_step_alloc(List step_gres_list,
 				uint16_t tasks_on_node, uint32_t rem_nodes,
 				uint32_t job_id, uint32_t step_id,
 				bool decr_job_alloc,
-				uint64_t *step_node_mem_alloc)
+				uint64_t *step_node_mem_alloc,
+				bitstr_t *core_bitmap)
 {
 	int rc = SLURM_SUCCESS;
 	ListIterator step_gres_iter;
@@ -2260,6 +2264,7 @@ extern int gres_ctld_step_alloc(List step_gres_list,
 		job_search_key.type_id = NO_VAL;
 		job_search_key.node_offset = node_offset;
 
+		args.core_bitmap = core_bitmap;
 		args.decr_job_alloc = decr_job_alloc;
 		args.gres_needed = _step_get_gres_needed(
 			gres_ss, first_step_node, tasks_on_node,
diff --git a/src/slurmctld/gres_ctld.h b/src/slurmctld/gres_ctld.h
index 5fa9a6d164..bdc871da6b 100644
--- a/src/slurmctld/gres_ctld.h
+++ b/src/slurmctld/gres_ctld.h
@@ -199,7 +199,8 @@ extern int gres_ctld_step_alloc(List step_gres_list,
 				uint16_t tasks_on_node, uint32_t rem_nodes,
 				uint32_t job_id, uint32_t step_id,
 				bool decr_job_alloc,
-				uint64_t *step_node_mem_alloc);
+				uint64_t *step_node_mem_alloc,
+				bitstr_t *core_bitmap);
 
 /*
  * Deallocate resource to a step and update job and step gres information
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index ae19e8835d..b42baf2e96 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -2185,6 +2185,7 @@ static int _step_alloc_lps(step_record_t *step_ptr)
 	int rc = SLURM_SUCCESS;
 	uint16_t req_tpc = NO_VAL16;
 	uint16_t *cpus_used;
+	bitstr_t *unused_core_bitmap;
 
 	xassert(job_resrcs_ptr);
 	xassert(job_resrcs_ptr->cpus);
@@ -2337,7 +2338,9 @@ static int _step_alloc_lps(step_record_t *step_ptr)
 		 */
 		if (!(step_ptr->flags & SSF_OVERLAP_FORCE))
 			cpus_used[job_node_inx] += cpus_alloc;
-
+		unused_core_bitmap = bit_copy(job_resrcs_ptr->core_bitmap);
+		bit_and_not(unused_core_bitmap,
+			    job_resrcs_ptr->core_bitmap_used);
 		gres_ctld_step_alloc(step_ptr->gres_list_req,
 				     &step_ptr->gres_list_alloc,
 				     job_ptr->gres_list_alloc,
@@ -2347,7 +2350,9 @@ static int _step_alloc_lps(step_record_t *step_ptr)
 				     rem_nodes, job_ptr->job_id,
 				     step_ptr->step_id.step_id,
 				     !(step_ptr->flags & SSF_OVERLAP_FORCE),
-				     &gres_step_node_mem_alloc);
+				     &gres_step_node_mem_alloc,
+				     unused_core_bitmap);
+		FREE_NULL_BITMAP(unused_core_bitmap);
 		first_step_node = false;
 		rem_nodes--;
 		if (!step_ptr->pn_min_memory && !gres_step_node_mem_alloc) {
-- 
2.32.0

From 3c00fc98ffe6a431d895fc944b224a3581b0a7ea Mon Sep 17 00:00:00 2001
From: Scott Hilton
Date: Tue, 22 Mar 2022 11:21:04 -0600
Subject: [PATCH 6/8] Move code into separate function _set_step_gres_bit_alloc()

Bug 13444
---
 src/slurmctld/gres_ctld.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/slurmctld/gres_ctld.c b/src/slurmctld/gres_ctld.c
index 00a6e0ab38..9ba99a4260 100644
--- a/src/slurmctld/gres_ctld.c
+++ b/src/slurmctld/gres_ctld.c
@@ -1968,6 +1968,20 @@ static uint64_t _step_get_gres_avail(gres_state_t *gres_state_job,
 	return gres_avail;
 }
 
+static void _set_step_gres_bit_alloc(bitstr_t *gres_bit_alloc,
+				     int len, uint64_t *gres_alloc)
+{
+	int i;
+	for (i = 0; i < len; i++) {
+		if (*gres_alloc > 0) {
+			if (bit_test(gres_bit_alloc, i))
+				(*gres_alloc)--;
+		} else {
+			bit_clear(gres_bit_alloc, i);
+		}
+	}
+}
+
 static int _step_alloc(gres_step_state_t *gres_ss,
 		       gres_state_t *gres_state_step_req,
 		       gres_state_t *gres_state_job,
@@ -2086,14 +2100,7 @@ static int _step_alloc(gres_step_state_t *gres_ss,
 			bit_and_not(gres_bit_alloc,
 				    gres_js->gres_bit_step_alloc[node_offset]);
 		}
-		for (i = 0; i < len; i++) {
-			if (gres_alloc > 0) {
-				if (bit_test(gres_bit_alloc, i))
-					gres_alloc--;
-			} else {
-				bit_clear(gres_bit_alloc, i);
-			}
-		}
+		_set_step_gres_bit_alloc(gres_bit_alloc, len, &gres_alloc);
 	}
 	if (gres_alloc) {
 		error("gres/%s: %s %ps oversubscribed resources on node %d",
-- 
2.32.0

From af14d766394c6da91709f966aa7d57621bb13c56 Mon Sep 17 00:00:00 2001
From: Scott Hilton
Date: Mon, 14 Mar 2022 11:30:19 -0600
Subject: [PATCH 7/8] Invert from clearing unused gres to setting used gres.
Bug 13444
---
 src/slurmctld/gres_ctld.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/slurmctld/gres_ctld.c b/src/slurmctld/gres_ctld.c
index 9ba99a4260..27bb196956 100644
--- a/src/slurmctld/gres_ctld.c
+++ b/src/slurmctld/gres_ctld.c
@@ -1969,16 +1969,15 @@ static uint64_t _step_get_gres_avail(gres_state_t *gres_state_job,
 }
 
 static void _set_step_gres_bit_alloc(bitstr_t *gres_bit_alloc,
+				     bitstr_t *gres_bit_avail,
 				     int len, uint64_t *gres_alloc)
 {
 	int i;
-	for (i = 0; i < len; i++) {
-		if (*gres_alloc > 0) {
-			if (bit_test(gres_bit_alloc, i))
-				(*gres_alloc)--;
-		} else {
-			bit_clear(gres_bit_alloc, i);
-		}
+	for (i = 0; i < len && *gres_alloc; i++) {
+		if (!bit_test(gres_bit_avail, i))
+			continue;
+		bit_set(gres_bit_alloc, i);
+		(*gres_alloc)--;
 	}
 }
 
@@ -1995,7 +1994,7 @@ static int _step_alloc(gres_step_state_t *gres_ss,
 	gres_job_state_t *gres_js = gres_state_job->gres_data;
 	gres_step_state_t *gres_ss_req = gres_state_step_req->gres_data;
 	uint64_t gres_alloc;
-	bitstr_t *gres_bit_alloc;
+	bitstr_t *gres_bit_alloc, *gres_bit_avail;
 	int i, len;
 
 	xassert(gres_js);
@@ -2082,25 +2081,26 @@ static int _step_alloc(gres_step_state_t *gres_ss,
 		return SLURM_SUCCESS;
 	}
 
-	gres_bit_alloc = bit_copy(gres_js->gres_bit_alloc[node_offset]);
-	len = bit_size(gres_bit_alloc);
+	len = bit_size(gres_js->gres_bit_alloc[node_offset]);
+	gres_bit_alloc = bit_alloc(len);
 	if (gres_id_shared(gres_state_job->config_flags)) {
-		for (i = 0; i < len; i++) {
-			if (gres_alloc > 0) {
-				if (bit_test(gres_bit_alloc, i))
-					gres_alloc = 0;
-			} else {
-				bit_clear(gres_bit_alloc, i);
-			}
+		for (i = 0; i < len && gres_alloc; i++) {
+			if (!bit_test(gres_js->gres_bit_alloc[node_offset], i))
+				continue;
+			bit_set(gres_bit_alloc, i);
+			gres_alloc = 0;
 		}
 	} else {
+		gres_bit_avail = bit_copy(gres_js->gres_bit_alloc[node_offset]);
 		if (decr_job_alloc &&
 		    gres_js->gres_bit_step_alloc &&
 		    gres_js->gres_bit_step_alloc[node_offset]) {
-			bit_and_not(gres_bit_alloc,
+			bit_and_not(gres_bit_avail,
 				    gres_js->gres_bit_step_alloc[node_offset]);
 		}
-		_set_step_gres_bit_alloc(gres_bit_alloc, len, &gres_alloc);
+		_set_step_gres_bit_alloc(gres_bit_alloc, gres_bit_avail,
+					 len, &gres_alloc);
+		FREE_NULL_BITMAP(gres_bit_avail);
 	}
 	if (gres_alloc) {
 		error("gres/%s: %s %ps oversubscribed resources on node %d",
-- 
2.32.0

From 8325e2d8f5f22569e78bb85c1c374f87a6dca3ac Mon Sep 17 00:00:00 2001
From: Scott Hilton
Date: Tue, 22 Mar 2022 11:30:58 -0600
Subject: [PATCH 8/8] Pick step gres based on node topology

Bug 13444
---
 src/slurmctld/gres_ctld.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/slurmctld/gres_ctld.c b/src/slurmctld/gres_ctld.c
index 27bb196956..bb9b1fe496 100644
--- a/src/slurmctld/gres_ctld.c
+++ b/src/slurmctld/gres_ctld.c
@@ -1968,13 +1968,19 @@ static uint64_t _step_get_gres_avail(gres_state_t *gres_state_job,
 	return gres_avail;
 }
 
-static void _set_step_gres_bit_alloc(bitstr_t *gres_bit_alloc,
+static void _set_step_gres_bit_alloc(gres_job_state_t *gres_js,
+				     gres_node_state_t *gres_ns,
+				     bitstr_t *gres_bit_alloc,
 				     bitstr_t *gres_bit_avail,
+				     bitstr_t *core_bitmap,
 				     int len, uint64_t *gres_alloc)
 {
 	int i;
 	for (i = 0; i < len && *gres_alloc; i++) {
-		if (!bit_test(gres_bit_avail, i))
+		if (!bit_test(gres_bit_avail, i) ||
+		    bit_test(gres_bit_alloc, i) ||
+		    !_cores_on_gres(core_bitmap, NULL, gres_ns, i,
+				    gres_js))
 			continue;
 		bit_set(gres_bit_alloc, i);
 		(*gres_alloc)--;
@@ -1996,6 +2002,10 @@ static int _step_alloc(gres_step_state_t *gres_ss,
 	uint64_t gres_alloc;
 	bitstr_t *gres_bit_alloc, *gres_bit_avail;
 	int i, len;
+	gres_state_t *gres_state_node = list_find_first(
+		node_record_table_ptr[node_offset].gres_list, gres_find_id,
+		&gres_state_job->plugin_id);
+	gres_node_state_t *gres_ns = gres_state_node->gres_data;
 
 	xassert(gres_js);
 	xassert(gres_ss);
@@ -2098,8 +2108,18 @@ static int _step_alloc(gres_step_state_t *gres_ss,
 			bit_and_not(gres_bit_avail,
 				    gres_js->gres_bit_step_alloc[node_offset]);
 		}
-		_set_step_gres_bit_alloc(gres_bit_alloc, gres_bit_avail,
-					 len, &gres_alloc);
+		/* Pass 1: Allocate GRES overlapping available cores */
+		_set_step_gres_bit_alloc(gres_js, gres_ns, gres_bit_alloc,
+					 gres_bit_avail, core_bitmap, len,
+					 &gres_alloc);
+		if (gres_alloc) {
+			debug("cpus for optimal gres/%s topology unavailable for %ps; allocating anyway",
+			      gres_state_job->gres_name, step_id);
+		}
+		/* Pass 2: Allocate any available GRES */
+		_set_step_gres_bit_alloc(gres_js, gres_ns, gres_bit_alloc,
+					 gres_bit_avail, NULL, len,
+					 &gres_alloc);
 		FREE_NULL_BITMAP(gres_bit_avail);
 	}
 	if (gres_alloc) {
 		error("gres/%s: %s %ps oversubscribed resources on node %d",
-- 
2.32.0
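
The consolidation in patch 1 hinges on the two loop orders inside
_handle_core_select(): SLURM_DIST_SOCKBLOCK fills each socket before moving
to the next, while the default distribution rotates across sockets for each
core index. A minimal standalone sketch (plain C, not Slurm code; the 2x4
geometry is made up) that prints both visit orders:

#include <stdio.h>

#define SOCKETS 2
#define CORES   4

int main(void)
{
	int sock_inx, core_inx;

	/* SLURM_DIST_SOCKBLOCK: exhaust socket 0, then socket 1 */
	printf("block:\n");
	for (sock_inx = 0; sock_inx < SOCKETS; sock_inx++)
		for (core_inx = 0; core_inx < CORES; core_inx++)
			printf("  socket %d core %d\n", sock_inx, core_inx);

	/* default: core 0 of every socket, then core 1, and so on */
	printf("cyclic:\n");
	for (core_inx = 0; core_inx < CORES; core_inx++)
		for (sock_inx = 0; sock_inx < SOCKETS; sock_inx++)
			printf("  socket %d core %d\n", sock_inx, core_inx);

	return 0;
}

When oversubscribing, _handle_core_select() additionally offsets core_inx by
a rotating last_core_inx so repeated overcommits do not all land on core zero.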
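Patch 2 filters cores in two steps: _filter_cores() keeps only the cores wired
to the step's allocated GRES per the gres.conf topology, then
_set_full_sockets() widens the result because the restriction is enforced per
socket, not per core. A standalone sketch (not Slurm code) of the widening
step, using a 32-bit word for an assumed node of 2 sockets x 4 cores:

#include <stdint.h>
#include <stdio.h>

#define SOCKETS 2
#define CORES   4	/* cores per socket */

static uint32_t widen_to_sockets(uint32_t filtered)
{
	uint32_t out = 0;

	for (int s = 0; s < SOCKETS; s++) {
		uint32_t sock = ((UINT32_C(1) << CORES) - 1) << (s * CORES);

		if (filtered & sock)	/* any core of this socket survived? */
			out |= sock;	/* then keep the whole socket */
	}
	return out;
}

int main(void)
{
	/* topology filter kept only core 1, which lives on socket 0 */
	printf("0x%02x -> 0x%02x\n", 0x02u, widen_to_sockets(0x02));
	/* prints "0x02 -> 0x0f": all of socket 0, none of socket 1 */
	return 0;
}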
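For the option added in patch 4, a plausible session (hypothetical two-GPU
node whose gres.conf binds one GPU to each socket; flag spellings are from
the man page text above) would be: the job requests strict binding, the
first step inherits it, and the second step opts out:

	salloc -N1 --gres=gpu:2 --gres-flags=enforce-binding
	srun --exact -n1 --gres=gpu:1 ./app   # cores limited to the GPU's socket
	srun --gres-flags=none -n1 ./app      # step ignores the inherited binding

Per the man page text, the step-level restriction only takes effect for steps
run with --exact, since a step otherwise has every CPU of the job's
allocation on the node.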
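Patches 7 and 8 switch _set_step_gres_bit_alloc() from clearing the bits a
step does not get to setting the bits it does get, which makes the
topology-aware allocation a simple two-pass scan: pass 1 takes only devices
whose bound cores overlap the step's cores, pass 2 takes whatever is left.
A standalone sketch (not Slurm code; plain masks stand in for bitstr_t, and
the 'pref' mask stands in for the _cores_on_gres() test):

#include <stdint.h>
#include <stdio.h>

/* Set up to *cnt unset bits of 'alloc', drawn from 'avail & pref' */
static void take(uint32_t *alloc, uint32_t avail, uint32_t pref, int *cnt)
{
	for (int i = 0; i < 32 && *cnt; i++) {
		uint32_t bit = UINT32_C(1) << i;

		if (!(avail & bit) || (*alloc & bit) || !(pref & bit))
			continue;
		*alloc |= bit;
		(*cnt)--;	/* parentheses matter: *cnt-- would
				 * decrement the pointer instead */
	}
}

int main(void)
{
	uint32_t job_gres = 0x0f;	/* devices 0-3 allocated to the job */
	uint32_t near = 0x0c;		/* devices 2-3 share the step's cores */
	uint32_t alloc = 0;
	int cnt = 3;

	take(&alloc, job_gres, near, &cnt);	  /* pass 1: topology-local */
	take(&alloc, job_gres, UINT32_MAX, &cnt); /* pass 2: any device */
	printf("allocated 0x%02x\n", alloc);	  /* prints "allocated 0x0d" */
	return 0;
}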