View | Details | Raw Unified | Return to ticket 12975
Collapse All | Expand All

(-)a/src/common/gres.c (-4 / +3 lines)
Lines 308-314 static void _validate_slurm_conf(List slurm_conf_list, Link Here
308
static void	_validate_gres_conf(List gres_conf_list,
308
static void	_validate_gres_conf(List gres_conf_list,
309
				    slurm_gres_context_t *context_ptr);
309
				    slurm_gres_context_t *context_ptr);
310
static int	_validate_file(char *path_name, char *gres_name);
310
static int	_validate_file(char *path_name, char *gres_name);
311
static void	_validate_links(gres_slurmd_conf_t *p);
312
static void	_validate_gres_node_cores(gres_node_state_t *node_gres_ptr,
311
static void	_validate_gres_node_cores(gres_node_state_t *node_gres_ptr,
313
					  int cpus_ctld, char *node_name);
312
					  int cpus_ctld, char *node_name);
314
static int	_valid_gres_type(char *gres_name, gres_node_state_t *gres_data,
313
static int	_valid_gres_type(char *gres_name, gres_node_state_t *gres_data,
Lines 963-969 static int _validate_file(char *filenames, char *gres_name) Link Here
963
/*
962
/*
964
 * Check that we have a comma-delimited list of numbers
963
 * Check that we have a comma-delimited list of numbers
965
 */
964
 */
966
static void _validate_links(gres_slurmd_conf_t *p)
965
extern void gres_links_validate(gres_slurmd_conf_t *p)
967
{
966
{
968
	char *tmp, *tok, *save_ptr = NULL, *end_ptr = NULL;
967
	char *tmp, *tok, *save_ptr = NULL, *end_ptr = NULL;
969
	long int val;
968
	long int val;
Lines 1200-1206 static int _parse_gres_config(void **dest, slurm_parser_enum_t type, Link Here
1200
1199
1201
	if (s_p_get_string(&p->links, "Link",  tbl) ||
1200
	if (s_p_get_string(&p->links, "Link",  tbl) ||
1202
	    s_p_get_string(&p->links, "Links", tbl)) {
1201
	    s_p_get_string(&p->links, "Links", tbl)) {
1203
		_validate_links(p);
1202
		gres_links_validate(p);
1204
	}
1203
	}
1205
1204
1206
	if (s_p_get_string(&p->type_name, "Type", tbl)) {
1205
	if (s_p_get_string(&p->type_name, "Type", tbl)) {
Lines 2300-2306 extern int gres_plugin_node_config_unpack(Buf buffer, char *node_name) Link Here
2300
		p->type_name = tmp_type;
2299
		p->type_name = tmp_type;
2301
		tmp_type = NULL;	/* Nothing left to xfree */
2300
		tmp_type = NULL;	/* Nothing left to xfree */
2302
		p->plugin_id = plugin_id;
2301
		p->plugin_id = plugin_id;
2303
		_validate_links(p);
2302
		gres_links_validate(p);
2304
		list_append(gres_conf_list, p);
2303
		list_append(gres_conf_list, p);
2305
	}
2304
	}
2306
2305
(-)a/src/common/gres.h (-2 / +5 lines)
Lines 1454-1457 extern void add_gres_to_list(List gres_list, char *name, uint64_t device_cnt, Link Here
1454
			     bitstr_t *cpu_aff_mac_bitstr, char *device_file,
1454
			     bitstr_t *cpu_aff_mac_bitstr, char *device_file,
1455
			     char *type, char *links);
1455
			     char *type, char *links);
1456
1456
1457
/*
1458
 * Check that we have a comma-delimited list of numbers
1459
 */
1460
extern void gres_links_validate(gres_slurmd_conf_t *p);
1461
1457
#endif /* !_GRES_H */
1462
#endif /* !_GRES_H */
1458
- 
1459
--
1460
src/common/gres.c | 31 ++++++++++++++++++++++++-------
1463
src/common/gres.c | 31 ++++++++++++++++++++++++-------
1461
src/common/gres.h | 11 +++++++++--
1464
src/common/gres.h | 11 +++++++++--
1462
2 files changed, 33 insertions(+), 9 deletions(-)
1465
2 files changed, 33 insertions(+), 9 deletions(-)
(-)a/src/common/gres.c (-7 / +24 lines)
Lines 961-993 static int _validate_file(char *filenames, char *gres_name) Link Here
961
961
962
/*
962
/*
963
 * Check that we have a comma-delimited list of numbers
963
 * Check that we have a comma-delimited list of numbers
964
 *
965
 * Return values:
966
 *  0: success.
967
 * -1: links string is NULL.
968
 * -2: links string is not NULL, but is invalid. Possible invalid reasons:
969
 *     * error parsing the comma-delimited links string
970
 *     * links string is an empty string
964
 */
971
 */
965
extern void gres_links_validate(gres_slurmd_conf_t *p)
972
extern int gres_links_validate(gres_slurmd_conf_t *p)
966
{
973
{
967
	char *tmp, *tok, *save_ptr = NULL, *end_ptr = NULL;
974
	char *tmp, *tok, *save_ptr = NULL, *end_ptr = NULL;
968
	long int val;
975
	long int val;
976
	int rc;
969
977
970
	if (!p->links)
978
	if (!p->links)
971
		return;
979
		return -1;
972
	if (p->links[0] == '\0') {
980
	if (p->links[0] == '\0') {
981
		error("%s: Links is an empty string", __func__);
973
		xfree(p->links);
982
		xfree(p->links);
974
		return;
983
		return -2;
975
	}
984
	}
976
985
977
	tmp = xstrdup(p->links);
986
	tmp = xstrdup(p->links);
978
	tok = strtok_r(tmp, ",", &save_ptr);
987
	tok = strtok_r(tmp, ",", &save_ptr);
988
	rc = 0;
979
	while (tok) {
989
	while (tok) {
980
		val = strtol(tok, &end_ptr, 10);
990
		val = strtol(tok, &end_ptr, 10);
981
		if ((val < -2) || (val > GRES_MAX_LINK) || (val == LONG_MIN) ||
991
		if ((val < -2) || (val > GRES_MAX_LINK) || (val == LONG_MIN) ||
982
		    (end_ptr[0] != '\0')) {
992
		    (end_ptr[0] != '\0')) {
983
			error("gres.conf: Ignoring invalid Link (%s) for Name=%s",
993
			error("%s: Failed to parse token '%s' in links string '%s'",
984
			      tok, p->name);
994
			      __func__, tok, p->links);
995
			rc = -2;
985
			xfree(p->links);
996
			xfree(p->links);
986
			break;
997
			break;
987
		}
998
		}
988
		tok = strtok_r(NULL, ",", &save_ptr);
999
		tok = strtok_r(NULL, ",", &save_ptr);
989
	}
1000
	}
990
	xfree(tmp);
1001
	xfree(tmp);
1002
	return rc;
991
}
1003
}
992
1004
993
/*
1005
/*
Lines 1199-1205 static int _parse_gres_config(void **dest, slurm_parser_enum_t type, Link Here
1199
1211
1200
	if (s_p_get_string(&p->links, "Link",  tbl) ||
1212
	if (s_p_get_string(&p->links, "Link",  tbl) ||
1201
	    s_p_get_string(&p->links, "Links", tbl)) {
1213
	    s_p_get_string(&p->links, "Links", tbl)) {
1202
		gres_links_validate(p);
1214
		if (gres_links_validate(p) < -1)
1215
			error("gres.conf: Ignoring invalid Links=%s for Name=%s",
1216
			      p->links, p->name);
1217
1203
	}
1218
	}
1204
1219
1205
	if (s_p_get_string(&p->type_name, "Type", tbl)) {
1220
	if (s_p_get_string(&p->type_name, "Type", tbl)) {
Lines 2299-2305 extern int gres_plugin_node_config_unpack(Buf buffer, char *node_name) Link Here
2299
		p->type_name = tmp_type;
2314
		p->type_name = tmp_type;
2300
		tmp_type = NULL;	/* Nothing left to xfree */
2315
		tmp_type = NULL;	/* Nothing left to xfree */
2301
		p->plugin_id = plugin_id;
2316
		p->plugin_id = plugin_id;
2302
		gres_links_validate(p);
2317
		if (gres_links_validate(p) < -1)
2318
			error("%s: Ignoring invalid Links=%s for Name=%s",
2319
			      __func__, p->links, p->name);
2303
		list_append(gres_conf_list, p);
2320
		list_append(gres_conf_list, p);
2304
	}
2321
	}
2305
2322
(-)a/src/common/gres.h (-4 / +9 lines)
Lines 1456-1462 extern void add_gres_to_list(List gres_list, char *name, uint64_t device_cnt, Link Here
1456
1456
1457
/*
1457
/*
1458
 * Check that we have a comma-delimited list of numbers
1458
 * Check that we have a comma-delimited list of numbers
1459
 */
1459
 *
1460
extern void gres_links_validate(gres_slurmd_conf_t *p);
1460
 * Return values:
1461
 *  0: success.
1462
 * -1: links string is NULL.
1463
 * -2: links string is not NULL, but is invalid. Possible invalid reasons:
1464
 *     * error parsing the comma-delimited links string
1465
 *     * links string is an empty string
1466
 */
1467
extern int gres_links_validate(gres_slurmd_conf_t *p);
1461
1468
1462
#endif /* !_GRES_H */
1469
#endif /* !_GRES_H */
1463
- 
1464
--
1465
src/common/gres.c | 20 +++++++++++---------
1470
src/common/gres.c | 20 +++++++++++---------
1466
src/common/gres.h |  2 +-
1471
src/common/gres.h |  2 +-
1467
2 files changed, 12 insertions(+), 10 deletions(-)
1472
2 files changed, 12 insertions(+), 10 deletions(-)
(-)a/src/common/gres.c (-9 / +11 lines)
Lines 969-989 static int _validate_file(char *filenames, char *gres_name) Link Here
969
 *     * error parsing the comma-delimited links string
969
 *     * error parsing the comma-delimited links string
970
 *     * links string is an empty string
970
 *     * links string is an empty string
971
 */
971
 */
972
extern int gres_links_validate(gres_slurmd_conf_t *p)
972
extern int gres_links_validate(char *links)
973
{
973
{
974
	char *tmp, *tok, *save_ptr = NULL, *end_ptr = NULL;
974
	char *tmp, *tok, *save_ptr = NULL, *end_ptr = NULL;
975
	long int val;
975
	long int val;
976
	int rc;
976
	int rc;
977
977
978
	if (!p->links)
978
	if (!links)
979
		return -1;
979
		return -1;
980
	if (p->links[0] == '\0') {
980
	if (links[0] == '\0') {
981
		error("%s: Links is an empty string", __func__);
981
		error("%s: Links is an empty string", __func__);
982
		xfree(p->links);
983
		return -2;
982
		return -2;
984
	}
983
	}
985
984
986
	tmp = xstrdup(p->links);
985
	tmp = xstrdup(links);
987
	tok = strtok_r(tmp, ",", &save_ptr);
986
	tok = strtok_r(tmp, ",", &save_ptr);
988
	rc = 0;
987
	rc = 0;
989
	while (tok) {
988
	while (tok) {
Lines 991-999 extern int gres_links_validate(gres_slurmd_conf_t *p) Link Here
991
		if ((val < -2) || (val > GRES_MAX_LINK) || (val == LONG_MIN) ||
990
		if ((val < -2) || (val > GRES_MAX_LINK) || (val == LONG_MIN) ||
992
		    (end_ptr[0] != '\0')) {
991
		    (end_ptr[0] != '\0')) {
993
			error("%s: Failed to parse token '%s' in links string '%s'",
992
			error("%s: Failed to parse token '%s' in links string '%s'",
994
			      __func__, tok, p->links);
993
			      __func__, tok, links);
995
			rc = -2;
994
			rc = -2;
996
			xfree(p->links);
997
			break;
995
			break;
998
		}
996
		}
999
		tok = strtok_r(NULL, ",", &save_ptr);
997
		tok = strtok_r(NULL, ",", &save_ptr);
Lines 1211-1219 static int _parse_gres_config(void **dest, slurm_parser_enum_t type, Link Here
1211
1209
1212
	if (s_p_get_string(&p->links, "Link",  tbl) ||
1210
	if (s_p_get_string(&p->links, "Link",  tbl) ||
1213
	    s_p_get_string(&p->links, "Links", tbl)) {
1211
	    s_p_get_string(&p->links, "Links", tbl)) {
1214
		if (gres_links_validate(p) < -1)
1212
		if (gres_links_validate(p->links) < -1) {
1215
			error("gres.conf: Ignoring invalid Links=%s for Name=%s",
1213
			error("gres.conf: Ignoring invalid Links=%s for Name=%s",
1216
			      p->links, p->name);
1214
			      p->links, p->name);
1215
			xfree(p->links);
1216
		}
1217
1217
1218
	}
1218
	}
1219
1219
Lines 2314-2322 extern int gres_plugin_node_config_unpack(Buf buffer, char *node_name) Link Here
2314
		p->type_name = tmp_type;
2314
		p->type_name = tmp_type;
2315
		tmp_type = NULL;	/* Nothing left to xfree */
2315
		tmp_type = NULL;	/* Nothing left to xfree */
2316
		p->plugin_id = plugin_id;
2316
		p->plugin_id = plugin_id;
2317
		if (gres_links_validate(p) < -1)
2317
		if (gres_links_validate(p->links) < -1) {
2318
			error("%s: Ignoring invalid Links=%s for Name=%s",
2318
			error("%s: Ignoring invalid Links=%s for Name=%s",
2319
			      __func__, p->links, p->name);
2319
			      __func__, p->links, p->name);
2320
			xfree(p->links);
2321
		}
2320
		list_append(gres_conf_list, p);
2322
		list_append(gres_conf_list, p);
2321
	}
2323
	}
2322
2324
(-)a/src/common/gres.h (-3 / +1 lines)
Lines 1464-1469 extern void add_gres_to_list(List gres_list, char *name, uint64_t device_cnt, Link Here
1464
 *     * error parsing the comma-delimited links string
1464
 *     * error parsing the comma-delimited links string
1465
 *     * links string is an empty string
1465
 *     * links string is an empty string
1466
 */
1466
 */
1467
extern int gres_links_validate(gres_slurmd_conf_t *p);
1467
extern int gres_links_validate(char *links);
1468
1468
1469
#endif /* !_GRES_H */
1469
#endif /* !_GRES_H */
1470
- 
1471
gres_links_validate()
1470
gres_links_validate()
1472
--
1473
src/common/gres.c | 31 ++++++++++++++++++++++++++++---
1471
src/common/gres.c | 31 ++++++++++++++++++++++++++++---
1474
src/common/gres.h |  9 +++++++--
1472
src/common/gres.h |  9 +++++++--
1475
2 files changed, 35 insertions(+), 5 deletions(-)
1473
2 files changed, 35 insertions(+), 5 deletions(-)
(-)a/src/common/gres.c (-3 / +28 lines)
Lines 960-979 static int _validate_file(char *filenames, char *gres_name) Link Here
960
}
960
}
961
961
962
/*
962
/*
963
 * Check that we have a comma-delimited list of numbers
963
 * Check that we have a comma-delimited list of numbers, and return the index of
964
 * the GPU (-1) in the links string.
964
 *
965
 *
966
 * Returns the non-negative, zero-based index of the GPU in the links string, if found.
967
 * If not found, returns a negative value.
965
 * Return values:
968
 * Return values:
966
 *  0: success.
969
 * 0+: GPU index
967
 * -1: links string is NULL.
970
 * -1: links string is NULL.
968
 * -2: links string is not NULL, but is invalid. Possible invalid reasons:
971
 * -2: links string is not NULL, but is invalid. Possible invalid reasons:
969
 *     * error parsing the comma-delimited links string
972
 *     * error parsing the comma-delimited links string
970
 *     * links string is an empty string
973
 *     * links string is an empty string
974
 *     * the 'self' GPU identifier isn't found (i.e. no -1)
975
 *     * there is more than one 'self' GPU identifier found
971
 */
976
 */
972
extern int gres_links_validate(char *links)
977
extern int gres_links_validate(char *links)
973
{
978
{
974
	char *tmp, *tok, *save_ptr = NULL, *end_ptr = NULL;
979
	char *tmp, *tok, *save_ptr = NULL, *end_ptr = NULL;
975
	long int val;
980
	long int val;
976
	int rc;
981
	int rc;
982
	int i;
977
983
978
	if (!links)
984
	if (!links)
979
		return -1;
985
		return -1;
Lines 984-990 extern int gres_links_validate(char *links) Link Here
984
990
985
	tmp = xstrdup(links);
991
	tmp = xstrdup(links);
986
	tok = strtok_r(tmp, ",", &save_ptr);
992
	tok = strtok_r(tmp, ",", &save_ptr);
987
	rc = 0;
993
	rc = -1;
994
	i = 0;
988
	while (tok) {
995
	while (tok) {
989
		val = strtol(tok, &end_ptr, 10);
996
		val = strtol(tok, &end_ptr, 10);
990
		if ((val < -2) || (val > GRES_MAX_LINK) || (val == LONG_MIN) ||
997
		if ((val < -2) || (val > GRES_MAX_LINK) || (val == LONG_MIN) ||
Lines 994-1002 extern int gres_links_validate(char *links) Link Here
994
			rc = -2;
1001
			rc = -2;
995
			break;
1002
			break;
996
		}
1003
		}
1004
		if (val == -1) {
1005
			if (rc != -1) {
1006
				error("%s: links string '%s' has more than one -1",
1007
				      __func__, links);
1008
				rc = -2;
1009
				break;
1010
			}
1011
			rc = i;
1012
		}
1013
		i++;
997
		tok = strtok_r(NULL, ",", &save_ptr);
1014
		tok = strtok_r(NULL, ",", &save_ptr);
998
	}
1015
	}
999
	xfree(tmp);
1016
	xfree(tmp);
1017
1018
	/* If the current GPU (-1) wasn't found, that's an error */
1019
	if (rc == -1) {
1020
		error("%s: -1 wasn't found in links string '%s'", __func__,
1021
		      links);
1022
		rc = -2;
1023
	}
1024
1000
	return rc;
1025
	return rc;
1001
}
1026
}
1002
1027
(-)a/src/common/gres.h (-4 / +7 lines)
Lines 1455-1468 extern void add_gres_to_list(List gres_list, char *name, uint64_t device_cnt, Link Here
1455
			     char *type, char *links);
1455
			     char *type, char *links);
1456
1456
1457
/*
1457
/*
1458
 * Check that we have a comma-delimited list of numbers
1458
 * Check that we have a comma-delimited list of numbers, and return the index of
1459
 * the GPU (-1) in the links string.
1459
 *
1460
 *
1461
 * Returns the non-negative, zero-based index of the GPU in the links string, if found.
1462
 * If not found, returns a negative value.
1460
 * Return values:
1463
 * Return values:
1461
 *  0: success.
1464
 * 0+: GPU index
1462
 * -1: links string is NULL.
1465
 * -1: links string is NULL.
1463
 * -2: links string is not NULL, but is invalid. Possible invalid reasons:
1466
 * -2: links string is not NULL, but is invalid. Possible invalid reasons:
1464
 *     * error parsing the comma-delimited links string
1467
 *     * error parsing the comma-delimited links string
1465
 *     * links string is an empty string
1468
 *     * links string is an empty string
1469
 *     * the 'self' GPU identifier isn't found (i.e. no -1)
1470
 *     * there is more than one 'self' GPU identifier found
1466
 */
1471
 */
1467
extern int gres_links_validate(char *links);
1472
extern int gres_links_validate(char *links);
1468
1473
1469
- 
1470
links
1474
links
1471
--
1472
src/plugins/gres/gpu/gres_gpu.c | 39 +++++++++++++++++++++++++++++++++
1475
src/plugins/gres/gpu/gres_gpu.c | 39 +++++++++++++++++++++++++++++++++
1473
1 file changed, 39 insertions(+)
1476
1 file changed, 39 insertions(+)
(-)a/src/plugins/gres/gpu/gres_gpu.c (-2 / +39 lines)
Lines 371-376 static int _sort_gpu_by_file_asc(void *x, void *y) Link Here
371
	return _sort_gpu_by_file(x, y, true);
371
	return _sort_gpu_by_file(x, y, true);
372
}
372
}
373
373
374
/*
375
 * Sort GPUs by the order they are specified in links.
376
 *
377
 * It is assumed that each links string has a -1 to indicate the position of the
378
 * current GPU at the position it was enumerated in. The GPUs will be sorted so
379
 * the links matrix looks like this:
380
 *
381
 * -1, 0, ...  0, 0
382
 *  0,-1, ...  0, 0
383
 *  0, 0, ... -1, 0
384
 *  0, 0, ...  0,-1
385
 *
386
 * This should preserve the original enumeration order of NVML (which is in
387
 * order of PCI bus ID).
388
 */
389
static int _sort_gpu_by_links_order(void *x, void *y)
390
{
391
	gres_slurmd_conf_t *gres_record_x = *(gres_slurmd_conf_t **)x;
392
	gres_slurmd_conf_t *gres_record_y = *(gres_slurmd_conf_t **)y;
393
	int index_x, index_y;
394
395
	/* Make null links appear last in sort */
396
	if (!gres_record_x->links && gres_record_y->links)
397
		return 1;
398
	if (gres_record_x->links && !gres_record_y->links)
399
		return -1;
400
401
	index_x = gres_links_validate(gres_record_x->links);
402
	index_y = gres_links_validate(gres_record_y->links);
403
404
	if (index_x < -1 || index_y < -1)
405
		error("%s: invalid links value found", __func__);
406
407
	return (index_x - index_y);
408
}
409
374
/*
410
/*
375
 * Takes the merged [slurm|gres].conf records in gres_list_conf and the GPU
411
 * Takes the merged [slurm|gres].conf records in gres_list_conf and the GPU
376
 * devices detected on the node in gres_list_system and returns a final merged
412
 * devices detected on the node in gres_list_system and returns a final merged
Lines 584-590 static void _normalize_gres_conf(List gres_list_conf, List gres_list_system) Link Here
584
	/* Add GPUs + non-GPUs to gres_list_conf */
620
	/* Add GPUs + non-GPUs to gres_list_conf */
585
	list_flush(gres_list_conf);
621
	list_flush(gres_list_conf);
586
	if (gres_list_gpu && list_count(gres_list_gpu)) {
622
	if (gres_list_gpu && list_count(gres_list_gpu)) {
623
		/* Sort by device file first, in case no links */
587
		list_sort(gres_list_gpu, _sort_gpu_by_file_asc);
624
		list_sort(gres_list_gpu, _sort_gpu_by_file_asc);
625
		/* Sort by links, which is a stand-in for PCI bus ID order */
626
		list_sort(gres_list_gpu, _sort_gpu_by_links_order);
588
		debug2("gres_list_gpu");
627
		debug2("gres_list_gpu");
589
		print_gres_list(gres_list_gpu, LOG_LEVEL_DEBUG2);
628
		print_gres_list(gres_list_gpu, LOG_LEVEL_DEBUG2);
590
		list_transfer(gres_list_conf, gres_list_gpu);
629
		list_transfer(gres_list_conf, gres_list_gpu);
591
- 
592
enumeration order
630
enumeration order
593
--
594
src/common/gres.c               | 27 +++++++++++++++++++++++++++
631
src/common/gres.c               | 27 +++++++++++++++++++++++++++
595
src/common/gres.h               | 16 ++++++++++++++++
632
src/common/gres.h               | 16 ++++++++++++++++
596
src/plugins/gpu/rsmi/gpu_rsmi.c |  9 +++++++--
633
src/plugins/gpu/rsmi/gpu_rsmi.c |  9 +++++++--
597
3 files changed, 50 insertions(+), 2 deletions(-)
634
3 files changed, 50 insertions(+), 2 deletions(-)
(-)a/src/common/gres.c (+27 lines)
Lines 959-964 static int _validate_file(char *filenames, char *gres_name) Link Here
959
	return file_count;
959
	return file_count;
960
}
960
}
961
961
962
/*
963
 * Create and return a comma-separated zeroed-out links string with a -1 in the
964
 * given GPU position indicated by index. Caller must xfree() the returned
965
 * string.
966
 *
967
 * Used to record the enumeration order (PCI bus ID order) of GPUs for sorting,
968
 * even when the GPU does not support nvlinks. E.g. for three total GPUs, their
969
 * links strings would look like this:
970
 *
971
 * GPU at index 0: -1,0,0
972
 * GPU at index 1: 0,-1,0
973
 * GPU at index 2: -0,0,-1
974
 */
975
extern char *gres_links_create_empty(unsigned int index,
976
				     unsigned int device_count)
977
{
978
	char *links_str = NULL;
979
980
	for (unsigned int i = 0; i < device_count; ++i) {
981
		xstrfmtcat(links_str, "%s%d",
982
			   i ? "," : "",
983
			   (i == index) ? -1 : 0);
984
	}
985
986
	return links_str;
987
}
988
962
/*
989
/*
963
 * Check that we have a comma-delimited list of numbers, and return the index of
990
 * Check that we have a comma-delimited list of numbers, and return the index of
964
 * the GPU (-1) in the links string.
991
 * the GPU (-1) in the links string.
(-)a/src/common/gres.h (+16 lines)
Lines 1454-1459 extern void add_gres_to_list(List gres_list, char *name, uint64_t device_cnt, Link Here
1454
			     bitstr_t *cpu_aff_mac_bitstr, char *device_file,
1454
			     bitstr_t *cpu_aff_mac_bitstr, char *device_file,
1455
			     char *type, char *links);
1455
			     char *type, char *links);
1456
1456
1457
/*
1458
 * Create and return a comma-separated zeroed-out links string with a -1 in the
1459
 * given GPU position indicated by index. Caller must xfree() the returned
1460
 * string.
1461
 *
1462
 * Used to record the enumeration order (PCI bus ID order) of GPUs for sorting,
1463
 * even when the GPU does not support nvlinks. E.g. for three total GPUs, their
1464
 * links strings would look like this:
1465
 *
1466
 * GPU at index 0: -1,0,0
1467
 * GPU at index 1: 0,-1,0
1468
 * GPU at index 2: 0,0,-1
1469
 */
1470
extern char *gres_links_create_empty(unsigned int index,
1471
				     unsigned int device_count);
1472
1457
/*
1473
/*
1458
 * Check that we have a comma-delimited list of numbers, and return the index of
1474
 * Check that we have a comma-delimited list of numbers, and return the index of
1459
 * the GPU (-1) in the links string.
1475
 * the GPU (-1) in the links string.
(-)a/src/plugins/gpu/rsmi/gpu_rsmi.c (-4 / +7 lines)
Lines 1068-1074 static List _get_system_gpu_list_rsmi(node_config_load_t *node_config) Link Here
1068
	// Loop through all the GPUs on the system and add to gres_list_system
1068
	// Loop through all the GPUs on the system and add to gres_list_system
1069
	for (i = 0; i < device_count; ++i) {
1069
	for (i = 0; i < device_count; ++i) {
1070
		unsigned int minor_number = 0;
1070
		unsigned int minor_number = 0;
1071
		char *device_file = NULL;
1071
		char *device_file = NULL, *links = NULL;
1072
		char device_name[RSMI_STRING_BUFFER_SIZE] = {0};
1072
		char device_name[RSMI_STRING_BUFFER_SIZE] = {0};
1073
		char device_brand[RSMI_STRING_BUFFER_SIZE] = {0};
1073
		char device_brand[RSMI_STRING_BUFFER_SIZE] = {0};
1074
		rsmiPciInfo_t pci_info;
1074
		rsmiPciInfo_t pci_info;
Lines 1082-1087 static List _get_system_gpu_list_rsmi(node_config_load_t *node_config) Link Here
1082
		_rsmi_get_device_pci_info(i, &pci_info);
1082
		_rsmi_get_device_pci_info(i, &pci_info);
1083
		_rsmi_get_device_unique_id(i, &uuid);
1083
		_rsmi_get_device_unique_id(i, &uuid);
1084
1084
1085
		/* Use links to record PCI bus ID order */
1086
		links = gres_links_create_empty(i, device_count);
1087
1085
		xstrfmtcat(device_file, "/dev/dri/renderD%u", minor_number);
1088
		xstrfmtcat(device_file, "/dev/dri/renderD%u", minor_number);
1086
1089
1087
		debug2("GPU index %u:", i);
1090
		debug2("GPU index %u:", i);
Lines 1091-1096 static List _get_system_gpu_list_rsmi(node_config_load_t *node_config) Link Here
1091
		debug2("    PCI Domain/Bus/Device/Function: %u:%u:%u.%u",
1094
		debug2("    PCI Domain/Bus/Device/Function: %u:%u:%u.%u",
1092
		       pci_info.domain,
1095
		       pci_info.domain,
1093
		       pci_info.bus, pci_info.device, pci_info.function);
1096
		       pci_info.bus, pci_info.device, pci_info.function);
1097
		debug2("    Links: %s", links);
1094
		debug2("    Device File (minor number): %s", device_file);
1098
		debug2("    Device File (minor number): %s", device_file);
1095
		if (minor_number != i+128)
1099
		if (minor_number != i+128)
1096
			debug("Note: GPU index %u is different from minor # %u",
1100
			debug("Note: GPU index %u is different from minor # %u",
Lines 1101-1109 static List _get_system_gpu_list_rsmi(node_config_load_t *node_config) Link Here
1101
1105
1102
		add_gres_to_list(gres_list_system, "gpu", 1,
1106
		add_gres_to_list(gres_list_system, "gpu", 1,
1103
				 node_config->cpu_cnt, NULL, NULL,
1107
				 node_config->cpu_cnt, NULL, NULL,
1104
				 device_file, device_brand, NULL);
1108
				 device_file, device_brand, links);
1105
1109
1106
		xfree(device_file);
1110
		xfree(device_file);
1111
		xfree(links);
1107
	}
1112
	}
1108
1113
1109
	rsmi_shut_down();
1114
	rsmi_shut_down();
1110
- 
1111
CUDA_VISIBLE_DEVICES
1115
CUDA_VISIBLE_DEVICES
1112
--
1113
src/plugins/gres/common/gres_common.c | 18 +++++++++++++++---
1116
src/plugins/gres/common/gres_common.c | 18 +++++++++++++++---
1114
src/plugins/gres/common/gres_common.h |  2 +-
1117
src/plugins/gres/common/gres_common.h |  2 +-
1115
src/plugins/gres/gpu/gres_gpu.c       |  4 ++--
1118
src/plugins/gres/gpu/gres_gpu.c       |  4 ++--
1116
src/plugins/gres/mps/gres_mps.c       |  2 +-
1119
src/plugins/gres/mps/gres_mps.c       |  2 +-
1117
src/plugins/gres/nic/gres_nic.c       |  2 +-
1120
src/plugins/gres/nic/gres_nic.c       |  2 +-
1118
5 files changed, 20 insertions(+), 8 deletions(-)
1121
5 files changed, 20 insertions(+), 8 deletions(-)
(-)a/src/plugins/gres/common/gres_common.c (-3 / +15 lines)
Lines 195-201 extern void common_gres_set_env(List gres_devices, char ***env_ptr, Link Here
195
				int *local_inx, uint64_t *gres_per_node,
195
				int *local_inx, uint64_t *gres_per_node,
196
				char **local_list, char **global_list,
196
				char **local_list, char **global_list,
197
				bool reset, bool is_job, int *global_id,
197
				bool reset, bool is_job, int *global_id,
198
				gres_internal_flags_t flags)
198
				gres_internal_flags_t flags, bool use_dev_num)
199
{
199
{
200
	int first_inx = -1;
200
	int first_inx = -1;
201
	bitstr_t *bit_alloc = NULL;
201
	bitstr_t *bit_alloc = NULL;
Lines 259-269 extern void common_gres_set_env(List gres_devices, char ***env_ptr, Link Here
259
		itr = list_iterator_create(gres_devices);
259
		itr = list_iterator_create(gres_devices);
260
		while ((gres_device = list_next(itr))) {
260
		while ((gres_device = list_next(itr))) {
261
			int index;
261
			int index;
262
			int global_env_index;
262
			if (!bit_test(bit_alloc, gres_device->index))
263
			if (!bit_test(bit_alloc, gres_device->index))
263
				continue;
264
				continue;
264
265
266
			/*
267
			 * NICs want env to match the dev_num parsed from the
268
			 * file name; GPUs, however, want it to match the order
269
			 * they enumerate on the PCI bus, and this isn't always
270
			 * the same order as the device file names
271
			 */
272
			if (use_dev_num)
273
				global_env_index = gres_device->dev_num;
274
			else
275
				global_env_index = gres_device->index;
276
265
			index = use_local_dev_index ?
277
			index = use_local_dev_index ?
266
				(*local_inx)++ : gres_device->dev_num;
278
				(*local_inx)++ : global_env_index;
267
279
268
			if (reset) {
280
			if (reset) {
269
				if (!first_device) {
281
				if (!first_device) {
Lines 288-294 extern void common_gres_set_env(List gres_devices, char ***env_ptr, Link Here
288
			//info("looking at %d and %d",
300
			//info("looking at %d and %d",
289
			//     gres_device->index, gres_device->dev_num);
301
			//     gres_device->index, gres_device->dev_num);
290
			xstrfmtcat(new_global_list, "%s%s%d", global_prefix,
302
			xstrfmtcat(new_global_list, "%s%s%d", global_prefix,
291
				   prefix, gres_device->dev_num);
303
				   prefix, global_env_index);
292
			global_prefix = ",";
304
			global_prefix = ",";
293
		}
305
		}
294
		list_iterator_destroy(itr);
306
		list_iterator_destroy(itr);
(-)a/src/plugins/gres/common/gres_common.h (-1 / +1 lines)
Lines 69-75 extern void common_gres_set_env(List gres_devices, char ***env_ptr, Link Here
69
				int *local_inx, uint64_t *gres_per_node,
69
				int *local_inx, uint64_t *gres_per_node,
70
				char **local_list, char **global_list,
70
				char **local_list, char **global_list,
71
				bool reset, bool is_job, int *global_id,
71
				bool reset, bool is_job, int *global_id,
72
				gres_internal_flags_t flags);
72
				gres_internal_flags_t flags, bool use_dev_num);
73
73
74
/* Send GRES information from slurmd on the specified file descriptor */
74
/* Send GRES information from slurmd on the specified file descriptor */
75
extern void common_send_stepd(Buf buffer, List gres_devices);
75
extern void common_send_stepd(Buf buffer, List gres_devices);
(-)a/src/plugins/gres/gpu/gres_gpu.c (-2 / +2 lines)
Lines 124-130 static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx, Link Here
124
	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
124
	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
125
			    usable_gres, "", local_inx,  NULL,
125
			    usable_gres, "", local_inx,  NULL,
126
			    &local_list, &global_list, reset, is_job, NULL,
126
			    &local_list, &global_list, reset, is_job, NULL,
127
			    flags);
127
			    flags, false);
128
128
129
	if (global_list) {
129
	if (global_list) {
130
		env_array_overwrite(env_ptr, slurm_env_var, global_list);
130
		env_array_overwrite(env_ptr, slurm_env_var, global_list);
Lines 1031-1037 extern void epilog_set_env(char ***epilog_env_ptr, Link Here
1031
			i++;
1031
			i++;
1032
			if (i == dev_inx) {
1032
			if (i == dev_inx) {
1033
				xstrfmtcat(dev_num_str, "%s%d",
1033
				xstrfmtcat(dev_num_str, "%s%d",
1034
					   sep,gres_device->dev_num);
1034
					   sep, gres_device->index);
1035
				sep = ",";
1035
				sep = ",";
1036
				break;
1036
				break;
1037
			}
1037
			}
(-)a/src/plugins/gres/mps/gres_mps.c (-1 / +1 lines)
Lines 602-608 static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx, Link Here
602
	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
602
	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
603
			    usable_gres, "", local_inx,
603
			    usable_gres, "", local_inx,
604
			    &gres_per_node, &local_list, &global_list,
604
			    &gres_per_node, &local_list, &global_list,
605
			    reset, is_job, &global_id, flags);
605
			    reset, is_job, &global_id, flags, true);
606
606
607
	if (perc_env) {
607
	if (perc_env) {
608
		env_array_overwrite(env_ptr,
608
		env_array_overwrite(env_ptr,
(-)a/src/plugins/gres/nic/gres_nic.c (-3 / +1 lines)
Lines 110-116 static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx, Link Here
110
	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
110
	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
111
			    usable_gres, "mlx4_", local_inx, NULL,
111
			    usable_gres, "mlx4_", local_inx, NULL,
112
			    &local_list, &global_list, reset, is_job, NULL,
112
			    &local_list, &global_list, reset, is_job, NULL,
113
			    flags);
113
			    flags, true);
114
114
115
	if (global_list) {
115
	if (global_list) {
116
		env_array_overwrite(env_ptr, slurm_env_var, global_list);
116
		env_array_overwrite(env_ptr, slurm_env_var, global_list);
117
- 
118
20.11
117
20.11
119
--
120
src/plugins/gres/mic/gres_mic.c | 2 +-
118
src/plugins/gres/mic/gres_mic.c | 2 +-
121
1 file changed, 1 insertion(+), 1 deletion(-)
119
1 file changed, 1 insertion(+), 1 deletion(-)
(-)a/src/plugins/gres/mic/gres_mic.c (-2 / +1 lines)
Lines 112-118 static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx, Link Here
112
	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
112
	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
113
			    usable_gres, "", local_inx, NULL,
113
			    usable_gres, "", local_inx, NULL,
114
			    &local_list, &global_list, reset, is_job, NULL,
114
			    &local_list, &global_list, reset, is_job, NULL,
115
			    flags);
115
			    flags, true);
116
116
117
	if (global_list) {
117
	if (global_list) {
118
		env_array_overwrite(env_ptr, slurm_env_var, global_list);
118
		env_array_overwrite(env_ptr, slurm_env_var, global_list);
119
- 

Return to ticket 12975