When many jobs are being submitted, stopping the slurmctld service occasionally produces a core file. The backtrace from the core file is as follows: (gdb) bt #0 0x00007f268896b207 in raise () from /lib64/libc.so.6 #1 0x00007f268896c8f8 in abort () from /lib64/libc.so.6 #2 0x00007f26897eadba in __xassert_failed (expr=expr@entry=0x7f2689819a9d "persist_conn", file=file@entry=0x7f2689819a3a "slurm_persist_conn.c", line=line@entry=1017, func=func@entry=0x7f268981a5a0 <__func__.17620> "slurm_persist_msg_pack") at xassert.c:57 #3 0x00007f268974049f in slurm_persist_msg_pack (persist_conn=<optimized out>, req_msg=req_msg@entry=0x7f267691acc0) at slurm_persist_conn.c:1017 #4 0x00007f26860503ec in slurmdbd_agent_send (rpc_version=rpc_version@entry=9216, req=req@entry=0x7f267691acc0) at slurmdbd_agent.c:921 #5 0x00007f268604be1b in jobacct_storage_p_job_complete (db_conn=0x1f516c0, job_ptr=0x204c980) at accounting_storage_slurmdbd.c:2760 #6 0x00007f2689721465 in jobacct_storage_g_job_complete (db_conn=0x1f516c0, job_ptr=job_ptr@entry=0x204c980) at slurm_accounting_storage.c:891 #7 0x0000000000457f89 in job_completion_logger (job_ptr=job_ptr@entry=0x204c980, requeue=requeue@entry=true) at job_mgr.c:16497 #8 0x000000000045ceb7 in _job_complete (job_ptr=job_ptr@entry=0x204c980, uid=uid@entry=0, requeue=requeue@entry=true, node_fail=node_fail@entry=false, job_return_code=<optimized out>, job_return_code@entry=0) at job_mgr.c:6354 #9 0x000000000045d4f0 in job_complete (job_id=job_id@entry=54113, uid=0, requeue=requeue@entry=true, node_fail=node_fail@entry=false, job_return_code=job_return_code@entry=0) at job_mgr.c:6553 #10 0x000000000042255d in _notify_slurmctld_nodes (retry_cnt=1, no_resp_cnt=<optimized out>, agent_ptr=0x7f26400008d0) at agent.c:734 #11 _wdog (args=0x7f26400008d0) at agent.c:638 #12 0x00007f2688d09dd5 in start_thread () from /lib64/libpthread.so.0 #13 0x00007f2688a32ead in clone () from /lib64/libc.so.6 The assertion that fails is xassert(persist_conn) in slurm_persist_msg_pack, meaning the persistent connection was NULL at pack time. It is suspected that the 
slurmdbd_agent_send function sends messages without adding slurm_mutex_lock(&slurmdbd_lock); lock. The code involved is as follows: extern int slurmdbd_agent_send(uint16_t rpc_version, persist_msg_t *req) { Buf buffer; uint32_t cnt, rc = SLURM_SUCCESS; static time_t syslog_time = 0; xassert(running_in_slurmctld()); xassert(slurm_conf.max_dbd_msgs); log_flag(PROTOCOL, "msg_type:%s protocol_version:%hu agent_count:%d", slurmdbd_msg_type_2_str(req->msg_type, 1), rpc_version, list_count(agent_list)); buffer = slurm_persist_msg_pack( slurmdbd_conn, (persist_msg_t *)req); if (!buffer) /* pack error */ return SLURM_ERROR; slurm_mutex_lock(&agent_lock); if ((agent_tid == 0) || (agent_list == NULL)) { _create_agent(); if ((agent_tid == 0) || (agent_list == NULL)) { slurm_mutex_unlock(&agent_lock); free_buf(buffer); return SLURM_ERROR; } } cnt = list_count(agent_list); if ((cnt >= (slurm_conf.max_dbd_msgs / 2)) && (difftime(time(NULL), syslog_time) > 120)) { /* Record critical error every 120 seconds */ syslog_time = time(NULL); error("agent queue filling (%u), MaxDBDMsgs=%u, RESTART SLURMDBD NOW", cnt, slurm_conf.max_dbd_msgs); syslog(LOG_CRIT, "*** RESTART SLURMDBD NOW ***"); (slurmdbd_conn->trigger_callbacks.dbd_fail)(); } /* Handle action */ _max_dbd_msg_action(&cnt); if (cnt < slurm_conf.max_dbd_msgs) { if (list_enqueue(agent_list, buffer) == NULL) fatal("list_enqueue: memory allocation failure"); } else { error("agent queue is full (%u), discarding %s:%u request", cnt, slurmdbd_msg_type_2_str(req->msg_type, 1), req->msg_type); (slurmdbd_conn->trigger_callbacks.acct_full)(); free_buf(buffer); rc = SLURM_ERROR; } slurm_cond_broadcast(&agent_cond); slurm_mutex_unlock(&agent_lock); return rc; }