Ticket 17631 - Stopping the slurmctld service while multiple jobs are being submitted can occasionally produce a core file
Summary: Stopping the slurmctld service while multiple jobs are being submitted can occasionally produce a core file
Status: OPEN
Alias: None
Product: Slurm
Classification: Unclassified
Component: Accounting
Version: 21.08.0
Hardware: Linux
Severity: 6 - No support contract
Assignee: Jacob Jenson
QA Contact:
URL:
Depends on:
Blocks:
 
Reported: 2023-09-06 02:04 MDT by qinyunhui
Modified: 2023-09-06 02:04 MDT

See Also:
Site: -Other-
Alineos Sites: ---
Atos/Eviden Sites: ---
Confidential Site: ---
Coreweave sites: ---
Cray Sites: ---
DS9 clusters: ---
HPCnow Sites: ---
HPE Sites: ---
IBM Sites: ---
NOAA SIte: ---
NoveTech Sites: ---
Nvidia HWinf-CS Sites: ---
OCF Sites: ---
Recursion Pharma Sites: ---
SFW Sites: ---
SNIC sites: ---
Tzag Elita Sites: ---
Linux Distro: ---
Machine Name:
CLE Version:
Version Fixed:
Target Release: ---
DevPrio: ---
Emory-Cloud Sites: ---


Description qinyunhui 2023-09-06 02:04:41 MDT
When the slurmctld service is stopped while multiple jobs are being submitted, there is a chance that slurmctld aborts and generates a core file.
The backtrace from the core file is as follows:
(gdb) bt
#0  0x00007f268896b207 in raise () from /lib64/libc.so.6
#1  0x00007f268896c8f8 in abort () from /lib64/libc.so.6
#2  0x00007f26897eadba in __xassert_failed (expr=expr@entry=0x7f2689819a9d "persist_conn",
    file=file@entry=0x7f2689819a3a "slurm_persist_conn.c", line=line@entry=1017,
    func=func@entry=0x7f268981a5a0 <__func__.17620> "slurm_persist_msg_pack") at xassert.c:57
#3  0x00007f268974049f in slurm_persist_msg_pack (persist_conn=<optimized out>, req_msg=req_msg@entry=0x7f267691acc0)
    at slurm_persist_conn.c:1017
#4  0x00007f26860503ec in slurmdbd_agent_send (rpc_version=rpc_version@entry=9216, req=req@entry=0x7f267691acc0) at slurmdbd_agent.c:921
#5  0x00007f268604be1b in jobacct_storage_p_job_complete (db_conn=0x1f516c0, job_ptr=0x204c980) at accounting_storage_slurmdbd.c:2760
#6  0x00007f2689721465 in jobacct_storage_g_job_complete (db_conn=0x1f516c0, job_ptr=job_ptr@entry=0x204c980)
    at slurm_accounting_storage.c:891
#7  0x0000000000457f89 in job_completion_logger (job_ptr=job_ptr@entry=0x204c980, requeue=requeue@entry=true) at job_mgr.c:16497
#8  0x000000000045ceb7 in _job_complete (job_ptr=job_ptr@entry=0x204c980, uid=uid@entry=0, requeue=requeue@entry=true,
    node_fail=node_fail@entry=false, job_return_code=<optimized out>, job_return_code@entry=0) at job_mgr.c:6354
#9  0x000000000045d4f0 in job_complete (job_id=job_id@entry=54113, uid=0, requeue=requeue@entry=true, node_fail=node_fail@entry=false,
    job_return_code=job_return_code@entry=0) at job_mgr.c:6553
#10 0x000000000042255d in _notify_slurmctld_nodes (retry_cnt=1, no_resp_cnt=<optimized out>, agent_ptr=0x7f26400008d0) at agent.c:734
#11 _wdog (args=0x7f26400008d0) at agent.c:638
#12 0x00007f2688d09dd5 in start_thread () from /lib64/libpthread.so.0
#13 0x00007f2688a32ead in clone () from /lib64/libc.so.6

We suspect that slurmdbd_agent_send() packs the message via slurm_persist_msg_pack(slurmdbd_conn, ...) without first taking slurm_mutex_lock(&slurmdbd_lock), so during shutdown another thread can tear the connection down and leave slurmdbd_conn NULL, which trips the xassert(persist_conn) seen in frame #2.
The code involved is as follows (a sketch of the guard we have in mind follows the function):


extern int slurmdbd_agent_send(uint16_t rpc_version, persist_msg_t *req)
{
	Buf buffer;
	uint32_t cnt, rc = SLURM_SUCCESS;
	static time_t syslog_time = 0;

	xassert(running_in_slurmctld());
	xassert(slurm_conf.max_dbd_msgs);

	log_flag(PROTOCOL, "msg_type:%s protocol_version:%hu agent_count:%d",
		 slurmdbd_msg_type_2_str(req->msg_type, 1),
		 rpc_version, list_count(agent_list));

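	/*
	 * Suspected crash site: slurmdbd_conn is passed to
	 * slurm_persist_msg_pack() without slurmdbd_lock held, so the
	 * shutdown path can NULL it out while this thread is packing.
	 */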
	buffer = slurm_persist_msg_pack(
		slurmdbd_conn, (persist_msg_t *)req);
	if (!buffer)	/* pack error */
		return SLURM_ERROR;

	slurm_mutex_lock(&agent_lock);
	if ((agent_tid == 0) || (agent_list == NULL)) {
		_create_agent();
		if ((agent_tid == 0) || (agent_list == NULL)) {
			slurm_mutex_unlock(&agent_lock);
			free_buf(buffer);
			return SLURM_ERROR;
		}
	}
	cnt = list_count(agent_list);
	if ((cnt >= (slurm_conf.max_dbd_msgs / 2)) &&
	    (difftime(time(NULL), syslog_time) > 120)) {
		/* Record critical error every 120 seconds */
		syslog_time = time(NULL);
		error("agent queue filling (%u), MaxDBDMsgs=%u, RESTART SLURMDBD NOW",
		      cnt, slurm_conf.max_dbd_msgs);
		syslog(LOG_CRIT, "*** RESTART SLURMDBD NOW ***");
		(slurmdbd_conn->trigger_callbacks.dbd_fail)();
	}

	/* Handle action */
	_max_dbd_msg_action(&cnt);

	if (cnt < slurm_conf.max_dbd_msgs) {
		if (list_enqueue(agent_list, buffer) == NULL)
			fatal("list_enqueue: memory allocation failure");
	} else {
		error("agent queue is full (%u), discarding %s:%u request",
		      cnt,
		      slurmdbd_msg_type_2_str(req->msg_type, 1),
		      req->msg_type);
		(slurmdbd_conn->trigger_callbacks.acct_full)();
		free_buf(buffer);
		rc = SLURM_ERROR;
	}

	slurm_cond_broadcast(&agent_cond);
	slurm_mutex_unlock(&agent_lock);
	return rc;
}
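
For reference, this is a minimal sketch of the guard we have in mind around the pack call, assuming slurmdbd_lock is the same mutex that already protects slurmdbd_conn elsewhere in slurmdbd_agent.c (untested, not a reviewed patch):

	/*
	 * Sketch only: hold slurmdbd_lock while slurmdbd_conn is handed to
	 * slurm_persist_msg_pack(), and bail out cleanly if the connection
	 * has already been torn down during slurmctld shutdown.
	 */
	slurm_mutex_lock(&slurmdbd_lock);
	if (!slurmdbd_conn) {
		/* Connection already closed; drop the message rather than
		 * tripping xassert(persist_conn). */
		slurm_mutex_unlock(&slurmdbd_lock);
		return SLURM_ERROR;
	}
	buffer = slurm_persist_msg_pack(slurmdbd_conn, (persist_msg_t *)req);
	slurm_mutex_unlock(&slurmdbd_lock);
	if (!buffer)	/* pack error */
		return SLURM_ERROR;

The trigger callbacks further down ((slurmdbd_conn->trigger_callbacks.dbd_fail)() and .acct_full) also dereference slurmdbd_conn; the sketch only covers the pack call that appears in the backtrace.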