View | Details | Raw Unified | Return to ticket 10679 | Differences between
and this patch

Collapse All | Expand All

(-)a/doc/man/man5/slurm.conf.5 (+5 lines)
Lines 3133-3138 baseboard, node and network switch). In order to optimize resource allocations Link Here
3133
on such hardware, Slurm will consider each NUMA node within the socket as a
3133
on such hardware, Slurm will consider each NUMA node within the socket as a
3134
separate socket by default. Use the Ignore_NUMA option to report the correct
3134
separate socket by default. Use the Ignore_NUMA option to report the correct
3135
socket count, but \fBnot\fR optimize resource allocations on the NUMA nodes.
3135
socket count, but \fBnot\fR optimize resource allocations on the NUMA nodes.
3136
3137
\fBNOTE\fR: Since hwloc 2.0 NUMA Nodes are not part of the main/CPU topology tree,
3138
because of that, if Slurm is built with hwloc 2.0 or above, Slurm will treat
3139
HWLOC_OBJ_PACKAGE as Socket, you can change this behavior using
3140
\fBSlurmdParameters\fR=l3cache_as_socket.
3136
.TP
3141
.TP
3137
\fBinventory_interval=#\fR
3142
\fBinventory_interval=#\fR
3138
On a Cray system using Slurm on top of ALPS this limits the number of times
3143
On a Cray system using Slurm on top of ALPS this limits the number of times
(-)a/src/slurmd/common/xcpuinfo.c (-2 / +6 lines)
Lines 319-324 extern int xcpuinfo_hwloc_topo_get( Link Here
319
	objtype[SOCKET] = HWLOC_OBJ_SOCKET;
319
	objtype[SOCKET] = HWLOC_OBJ_SOCKET;
320
	objtype[CORE]   = HWLOC_OBJ_CORE;
320
	objtype[CORE]   = HWLOC_OBJ_CORE;
321
	objtype[PU]     = HWLOC_OBJ_PU;
321
	objtype[PU]     = HWLOC_OBJ_PU;
322
#if HWLOC_API_VERSION >= 0x00020000
323
	if (xstrcasestr(slurm_conf.sched_params, "Ignore_NUMA")) {
324
		info("SchedulerParamaters=Ignore_NUMA not supported by hwloc v2");
325
	}
326
#else
322
	if (hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
327
	if (hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
323
	    hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)) {
328
	    hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)) {
324
		if (xstrcasestr(slurm_conf.sched_params, "Ignore_NUMA")) {
329
		if (xstrcasestr(slurm_conf.sched_params, "Ignore_NUMA")) {
Lines 328-333 extern int xcpuinfo_hwloc_topo_get( Link Here
328
			objtype[SOCKET] = HWLOC_OBJ_NODE;
333
			objtype[SOCKET] = HWLOC_OBJ_NODE;
329
		}
334
		}
330
	}
335
	}
336
#endif
331
337
332
	if (xstrcasestr(slurm_conf.slurmd_params, "l3cache_as_socket")) {
338
	if (xstrcasestr(slurm_conf.slurmd_params, "l3cache_as_socket")) {
333
#if HWLOC_API_VERSION >= 0x00020000
339
#if HWLOC_API_VERSION >= 0x00020000
334
- 
335
duplication
340
duplication
336
--
337
src/plugins/task/cgroup/task_cgroup_cpuset.c | 18 +++------------
341
src/plugins/task/cgroup/task_cgroup_cpuset.c | 18 +++------------
338
src/slurmd/common/xcpuinfo.c                 | 23 ++++++++++++--------
342
src/slurmd/common/xcpuinfo.c                 | 23 ++++++++++++--------
339
src/slurmd/common/xcpuinfo.h                 |  7 ++++++
343
src/slurmd/common/xcpuinfo.h                 |  7 ++++++
340
3 files changed, 24 insertions(+), 24 deletions(-)
344
3 files changed, 24 insertions(+), 24 deletions(-)
(-)a/src/plugins/task/cgroup/task_cgroup_cpuset.c (-15 / +3 lines)
Lines 447-457 static int _task_cgroup_cpuset_dist_cyclic( Link Here
447
	uint32_t obj_idxs[3], cps, tpc, i, j, sock_loop, ntskip, npdist;
447
	uint32_t obj_idxs[3], cps, tpc, i, j, sock_loop, ntskip, npdist;
448
	bool core_cyclic, core_fcyclic, sock_fcyclic;
448
	bool core_cyclic, core_fcyclic, sock_fcyclic;
449
	bool hwloc_success = true;
449
	bool hwloc_success = true;
450
	hwloc_obj_type_t socket_type = HWLOC_OBJ_SOCKET;
450
	hwloc_obj_type_t socket_type = slurmd_parameter_as_socket();
451
#if HWLOC_API_VERSION >= 0x00020000
452
	if (xstrcasestr(slurm_conf.slurmd_params, "l3cache_as_socket"))
453
		socket_type = HWLOC_OBJ_L3CACHE;
454
#endif
455
451
456
	/*
452
	/*
457
	 * We can't trust the slurmd_conf_t *conf here as we need actual
453
	 * We can't trust the slurmd_conf_t *conf here as we need actual
Lines 703-713 static int _task_cgroup_cpuset_dist_block( Link Here
703
	uint32_t core_idx;
699
	uint32_t core_idx;
704
	bool core_fcyclic, core_block;
700
	bool core_fcyclic, core_block;
705
701
706
	hwloc_obj_type_t socket_type = HWLOC_OBJ_SOCKET;
702
	hwloc_obj_type_t socket_type = slurmd_parameter_as_socket();
707
#if HWLOC_API_VERSION >= 0x00020000
708
	if (xstrcasestr(slurm_conf.slurmd_params, "l3cache_as_socket"))
709
		socket_type = HWLOC_OBJ_L3CACHE;
710
#endif
711
703
712
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
704
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
713
						       socket_type);
705
						       socket_type);
Lines 1022-1032 extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job, Link Here
1022
	uint32_t jnpus;
1014
	uint32_t jnpus;
1023
	int spec_threads = 0;
1015
	int spec_threads = 0;
1024
1016
1025
	hwloc_obj_type_t socket_type = HWLOC_OBJ_SOCKET;
1017
	hwloc_obj_type_t socket_type = slurmd_parameter_as_socket();
1026
#if HWLOC_API_VERSION >= 0x00020000
1027
	if (xstrcasestr(slurm_conf.slurmd_params, "l3cache_as_socket"))
1028
		socket_type = HWLOC_OBJ_L3CACHE;
1029
#endif
1030
1018
1031
	/* Allocate and initialize hwloc objects */
1019
	/* Allocate and initialize hwloc objects */
1032
	hwloc_topology_init(&topology);
1020
	hwloc_topology_init(&topology);
(-)a/src/slurmd/common/xcpuinfo.c (-9 / +14 lines)
Lines 254-259 handle_write: Link Here
254
	return ret;
254
	return ret;
255
}
255
}
256
256
257
/* Return a hwloc object used as a socket */
258
extern hwloc_obj_type_t slurmd_parameter_as_socket(void)
259
{
260
	hwloc_obj_type_t obj_as_socket = HWLOC_OBJ_SOCKET;
261
	if (xstrcasestr(slurm_conf.slurmd_params, "l3cache_as_socket")) {
262
#if HWLOC_API_VERSION >= 0x00020000
263
		obj_as_socket = HWLOC_OBJ_L3CACHE;
264
#else
265
		error("SlurmdParameters=l3cache_as_socket requires hwloc v2");
266
#endif
267
	}
268
	return obj_as_socket;
269
}
257
/*
270
/*
258
 * xcpuinfo_hwloc_topo_get - Return detailed cpuinfo on the whole system
271
 * xcpuinfo_hwloc_topo_get - Return detailed cpuinfo on the whole system
259
 * Output: p_cpus - number of processors on the system
272
 * Output: p_cpus - number of processors on the system
Lines 316-322 extern int xcpuinfo_hwloc_topo_get( Link Here
316
	 * in which case Slurm will report the correct socket count on the node,
329
	 * in which case Slurm will report the correct socket count on the node,
317
	 * but not be able to optimize resource allocations on the NUMA nodes.
330
	 * but not be able to optimize resource allocations on the NUMA nodes.
318
	 */
331
	 */
319
	objtype[SOCKET] = HWLOC_OBJ_SOCKET;
332
	objtype[SOCKET] = slurmd_parameter_as_socket();
320
	objtype[CORE]   = HWLOC_OBJ_CORE;
333
	objtype[CORE]   = HWLOC_OBJ_CORE;
321
	objtype[PU]     = HWLOC_OBJ_PU;
334
	objtype[PU]     = HWLOC_OBJ_PU;
322
#if HWLOC_API_VERSION >= 0x00020000
335
#if HWLOC_API_VERSION >= 0x00020000
Lines 335-348 extern int xcpuinfo_hwloc_topo_get( Link Here
335
	}
348
	}
336
#endif
349
#endif
337
350
338
	if (xstrcasestr(slurm_conf.slurmd_params, "l3cache_as_socket")) {
339
#if HWLOC_API_VERSION >= 0x00020000
340
		objtype[SOCKET] = HWLOC_OBJ_L3CACHE;
341
#else
342
		error("SlurmdParameters=l3cache_as_socket requires hwloc v2");
343
#endif
344
	}
345
346
	/* Groups below root obj are interpreted as boards */
351
	/* Groups below root obj are interpreted as boards */
347
	obj = hwloc_get_root_obj(topology);
352
	obj = hwloc_get_root_obj(topology);
348
	obj = hwloc_get_next_child(topology, obj, NULL);
353
	obj = hwloc_get_next_child(topology, obj, NULL);
(-)a/src/slurmd/common/xcpuinfo.h (-2 / +7 lines)
Lines 40-51 Link Here
40
#define XCPUINFO_ERROR    1
40
#define XCPUINFO_ERROR    1
41
#define XCPUINFO_SUCCESS  0
41
#define XCPUINFO_SUCCESS  0
42
42
43
#ifdef HAVE_HWLOC
44
#include <hwloc.h>
45
/* Return a hwloc object used as a socket */
46
extern hwloc_obj_type_t slurmd_parameter_as_socket(void);
47
#endif
48
43
extern int get_procs(uint16_t *procs);
49
extern int get_procs(uint16_t *procs);
44
50
45
/* read or load topology and write if needed
51
/* read or load topology and write if needed
46
 * init and destroy topology must be outside this function */
52
 * init and destroy topology must be outside this function */
47
extern int xcpuinfo_hwloc_topo_load(
53
extern int xcpuinfo_hwloc_topo_load(
48
	void *topology_in, char *topo_file, bool full);
54
	void *topology_in, char *topo_file, bool full);
55
49
/*
56
/*
50
 * Get the node's cpu info.
57
 * Get the node's cpu info.
51
 *
58
 *
52
- 
53
--
54
doc/man/man5/slurm.conf.5                    |  9 ++++++++
59
doc/man/man5/slurm.conf.5                    |  9 ++++++++
55
src/plugins/task/cgroup/task_cgroup_cpuset.c |  7 +++---
60
src/plugins/task/cgroup/task_cgroup_cpuset.c |  7 +++---
56
src/slurmd/common/xcpuinfo.c                 | 23 ++++++++++++++++++--
61
src/slurmd/common/xcpuinfo.c                 | 23 ++++++++++++++++++--
57
src/slurmd/common/xcpuinfo.h                 |  2 +-
62
src/slurmd/common/xcpuinfo.h                 |  2 +-
58
4 files changed, 35 insertions(+), 6 deletions(-)
63
4 files changed, 35 insertions(+), 6 deletions(-)
(-)a/doc/man/man5/slurm.conf.5 (+9 lines)
Lines 3893-3898 Equivalent to the now deprecated FastSchedule=2 option. Link Here
3893
Use the hwloc l3cache as the socket count. Can be useful on certain processors
3893
Use the hwloc l3cache as the socket count. Can be useful on certain processors
3894
where the socket level is too coarse, and the l3cache may provide better
3894
where the socket level is too coarse, and the l3cache may provide better
3895
task distribution. (E.g., along CCX boundaries instead of socket boundaries.)
3895
task distribution. (E.g., along CCX boundaries instead of socket boundaries.)
3896
Mutually exclusive with numa_node_as_socket.
3897
Requires hwloc v2.
3898
.TP
3899
\fBnuma_node_as_socket\fR
3900
Use the hwloc NUMA Node to determine main hierarchy object to be used as socket.
3901
If the option is set, Slurm will check the parent object of the NUMA Node and use it
3902
as socket. This option may be useful for architectures like AMD Epyc, where
3903
number of nodes per socket may be configured.
3904
Mutually exclusive with l3cache_as_socket.
3896
Requires hwloc v2.
3905
Requires hwloc v2.
3897
.TP
3906
.TP
3898
\fBshutdown_on_reboot\fR
3907
\fBshutdown_on_reboot\fR
(-)a/src/plugins/task/cgroup/task_cgroup_cpuset.c (-3 / +4 lines)
Lines 447-453 static int _task_cgroup_cpuset_dist_cyclic( Link Here
447
	uint32_t obj_idxs[3], cps, tpc, i, j, sock_loop, ntskip, npdist;
447
	uint32_t obj_idxs[3], cps, tpc, i, j, sock_loop, ntskip, npdist;
448
	bool core_cyclic, core_fcyclic, sock_fcyclic;
448
	bool core_cyclic, core_fcyclic, sock_fcyclic;
449
	bool hwloc_success = true;
449
	bool hwloc_success = true;
450
	hwloc_obj_type_t socket_type = slurmd_parameter_as_socket();
450
	hwloc_obj_type_t socket_type = slurmd_parameter_as_socket(topology);
451
451
452
	/*
452
	/*
453
	 * We can't trust the slurmd_conf_t *conf here as we need actual
453
	 * We can't trust the slurmd_conf_t *conf here as we need actual
Lines 699-705 static int _task_cgroup_cpuset_dist_block( Link Here
699
	uint32_t core_idx;
699
	uint32_t core_idx;
700
	bool core_fcyclic, core_block;
700
	bool core_fcyclic, core_block;
701
701
702
	hwloc_obj_type_t socket_type = slurmd_parameter_as_socket();
702
	hwloc_obj_type_t socket_type = slurmd_parameter_as_socket(topology);
703
703
704
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
704
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
705
						       socket_type);
705
						       socket_type);
Lines 1014-1024 extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job, Link Here
1014
	uint32_t jnpus;
1014
	uint32_t jnpus;
1015
	int spec_threads = 0;
1015
	int spec_threads = 0;
1016
1016
1017
	hwloc_obj_type_t socket_type = slurmd_parameter_as_socket();
1018
1017
1019
	/* Allocate and initialize hwloc objects */
1018
	/* Allocate and initialize hwloc objects */
1020
	hwloc_topology_init(&topology);
1019
	hwloc_topology_init(&topology);
1021
1020
1021
	hwloc_obj_type_t socket_type = slurmd_parameter_as_socket(topology);
1022
1022
	xassert(conf->hwloc_xml);
1023
	xassert(conf->hwloc_xml);
1023
	xcpuinfo_hwloc_topo_load(&topology, conf->hwloc_xml, false);
1024
	xcpuinfo_hwloc_topo_load(&topology, conf->hwloc_xml, false);
1024
1025
(-)a/src/slurmd/common/xcpuinfo.c (-2 / +21 lines)
Lines 255-270 handle_write: Link Here
255
}
255
}
256
256
257
/* Return a hwloc object used as a socket */
257
/* Return a hwloc object used as a socket */
258
extern hwloc_obj_type_t slurmd_parameter_as_socket(void)
258
extern hwloc_obj_type_t slurmd_parameter_as_socket(hwloc_topology_t topology)
259
{
259
{
260
	hwloc_obj_type_t obj_as_socket = HWLOC_OBJ_SOCKET;
260
	hwloc_obj_type_t obj_as_socket = HWLOC_OBJ_SOCKET;
261
261
	if (xstrcasestr(slurm_conf.slurmd_params, "l3cache_as_socket")) {
262
	if (xstrcasestr(slurm_conf.slurmd_params, "l3cache_as_socket")) {
262
#if HWLOC_API_VERSION >= 0x00020000
263
#if HWLOC_API_VERSION >= 0x00020000
263
		obj_as_socket = HWLOC_OBJ_L3CACHE;
264
		obj_as_socket = HWLOC_OBJ_L3CACHE;
264
#else
265
#else
265
		error("SlurmdParameters=l3cache_as_socket requires hwloc v2");
266
		error("SlurmdParameters=l3cache_as_socket requires hwloc v2");
267
#endif
268
	} else if (xstrcasestr(slurm_conf.slurmd_params,
269
			       "numa_node_as_socket")) {
270
#if HWLOC_API_VERSION >= 0x00020000
271
		hwloc_obj_t obj = hwloc_get_next_obj_by_type(topology,
272
							     HWLOC_OBJ_NODE,
273
							     NULL);
274
		if (obj != NULL && obj->parent != NULL) {
275
			char tmp[128];
276
			obj_as_socket = obj->parent->type;
277
			hwloc_obj_type_snprintf(tmp, sizeof(tmp), obj->parent,
278
						0);
279
			debug2("%s: numa_node_as_socket mapped to '%s'",
280
			       __func__, tmp);
281
		}
282
#else
283
		error("SlurmdParameters=numa_node_as_socket requires hwloc v2");
266
#endif
284
#endif
267
	}
285
	}
286
268
	return obj_as_socket;
287
	return obj_as_socket;
269
}
288
}
270
/*
289
/*
Lines 329-335 extern int xcpuinfo_hwloc_topo_get( Link Here
329
	 * in which case Slurm will report the correct socket count on the node,
348
	 * in which case Slurm will report the correct socket count on the node,
330
	 * but not be able to optimize resource allocations on the NUMA nodes.
349
	 * but not be able to optimize resource allocations on the NUMA nodes.
331
	 */
350
	 */
332
	objtype[SOCKET] = slurmd_parameter_as_socket();
351
	objtype[SOCKET] = slurmd_parameter_as_socket(topology);
333
	objtype[CORE]   = HWLOC_OBJ_CORE;
352
	objtype[CORE]   = HWLOC_OBJ_CORE;
334
	objtype[PU]     = HWLOC_OBJ_PU;
353
	objtype[PU]     = HWLOC_OBJ_PU;
335
#if HWLOC_API_VERSION >= 0x00020000
354
#if HWLOC_API_VERSION >= 0x00020000
(-)a/src/slurmd/common/xcpuinfo.h (-2 / +1 lines)
Lines 43-49 Link Here
43
#ifdef HAVE_HWLOC
43
#ifdef HAVE_HWLOC
44
#include <hwloc.h>
44
#include <hwloc.h>
45
/* Return a hwloc object used as a socket */
45
/* Return a hwloc object used as a socket */
46
extern hwloc_obj_type_t slurmd_parameter_as_socket(void);
46
extern hwloc_obj_type_t slurmd_parameter_as_socket(hwloc_topology_t topology);
47
#endif
47
#endif
48
48
49
extern int get_procs(uint16_t *procs);
49
extern int get_procs(uint16_t *procs);
50
- 

Return to ticket 10679