Hello, Got a new segfault, this time it's preventing us from starting slurm up. (gdb) bt #0 _free_step_rec (step_ptr=0x308bfe0) at step_mgr.c:313 #1 0x00000000004d8b7c in delete_step_record (job_ptr=0x305d960, step_id=4294967295) at step_mgr.c:374 #2 0x00007f0cc6dc40f9 in _step_fini (args=0x308bfe0) at select_cray.c:1173 #3 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #4 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 (gdb) (gdb) thread apply all bt Thread 14 (Thread 0x7f0c3e5e5700 (LWP 5197)): #0 0x00007f0cc7fca5c9 in do_sigwait () from /lib64/libpthread.so.0 #1 0x00007f0cc7fca653 in sigwait () from /lib64/libpthread.so.0 #2 0x0000000000445010 in _slurmctld_signal_hand (no_data=0x0) at controller.c:876 #3 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #4 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 Thread 13 (Thread 0x7f0cb9fc9700 (LWP 5077)): #0 0x00007f0cc7fc44c2 in pthread_join () from /lib64/libpthread.so.0 #1 0x00007f0cba0d1d54 in _cleanup_thread (no_data=0x0) at priority_multifactor.c:1423 #2 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 Thread 12 (Thread 0x7f0c3e4e4700 (LWP 5198)): #0 0x00007f0cc7fc705f in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000004d7842 in slurmctld_state_save (no_data=0x0) at state_save.c:208 #2 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 Thread 11 (Thread 0x7f0cc85ed700 (LWP 4041)): #0 0x00007f0cc7ccaded in nanosleep () from /lib64/libc.so.6 #1 0x00007f0cc7cf29d4 in usleep () from /lib64/libc.so.6 #2 0x0000000000447212 in _slurmctld_background (no_data=0x0) at controller.c:1713 #3 0x0000000000444940 in main (argc=1, argv=0x7fffd3349398) at controller.c:605 Thread 10 (Thread 0x7f0cc184d700 (LWP 4045)): #0 0x00007f0cc7fc7408 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 
0x00000000005c0b07 in _agent (x=0x0) at slurmdbd_defs.c:2137 #2 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 Thread 9 (Thread 0x7f0cbac05700 (LWP 5063)): #0 0x00007f0cc7cf0c1d in poll () from /lib64/libc.so.6 #1 0x00007f0cc6dc2319 in _aeld_event_loop (args=0x0) at select_cray.c:516 #2 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 Thread 8 (Thread 0x7f0cc0d14700 (LWP 5047)): #0 0x00007f0cc7fc7408 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007f0cc0d19585 in _my_sleep (usec=120000000) at backfill.c:488 #2 0x00007f0cc0d19fdf in backfill_agent (args=0x0) at backfill.c:742 #3 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #4 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 Thread 7 (Thread 0x7f0cc85eb700 (LWP 4042)): #0 0x00007f0cc7ccaded in nanosleep () from /lib64/libc.so.6 #1 0x00007f0cc7ccac84 in sleep () from /lib64/libc.so.6 #2 0x00007f0cc28f75ef in _lease_extender (args=0x0) at cookies.c:350 #3 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #4 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 ---Type <return> to continue, or q <return> to quit--- Thread 6 (Thread 0x7f0cc1b52700 (LWP 4044)): #0 0x00007f0cc7fc44c2 in pthread_join () from /lib64/libpthread.so.0 #1 0x00007f0cc1c57dfc in _cleanup_thread (no_data=0x0) at accounting_storage_slurmdbd.c:443 #2 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 Thread 5 (Thread 0x7f0cba0ca700 (LWP 5076)): #0 0x00007f0cc7ccaded in nanosleep () from /lib64/libc.so.6 #1 0x00007f0cc7ccac84 in sleep () from /lib64/libc.so.6 #2 0x00007f0cba0d1c13 in _decay_thread (no_data=0x0) at priority_multifactor.c:1369 #3 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #4 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 
Thread 4 (Thread 0x7f0cc1c53700 (LWP 4043)): #0 0x00007f0cc7cf0c1d in poll () from /lib64/libc.so.6 #1 0x000000000058d170 in _slurm_connect (__fd=7, __addr=0x7f0cc1c52e20, __len=16) at slurm_protocol_socket_implementation.c:604 #2 0x000000000058cf04 in slurm_open_stream (addr=0x7f0cc1c52e20, retry=false) at slurm_protocol_socket_implementation.c:509 #3 0x000000000054900d in slurm_open_msg_conn (slurm_address=0x7f0cc1c52e20) at slurm_protocol_api.c:2917 #4 0x00000000005bd97e in _open_slurmdbd_fd (need_db=true) at slurmdbd_defs.c:436 #5 0x00000000005bd470 in slurm_send_recv_slurmdbd_msg (rpc_version=7680, req=0x7f0cc1c52f20, resp=0x7f0cc1c52f30) at slurmdbd_defs.c:305 #6 0x00007f0cc1c57aec in _set_db_inx_thread (no_data=0x0) at accounting_storage_slurmdbd.c:352 #7 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #8 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 Thread 3 (Thread 0x7f0c3e6e6700 (LWP 5196)): #0 0x00007f0cc7cf22b3 in select () from /lib64/libc.so.6 #1 0x00000000004456c0 in _slurmctld_rpc_mgr (no_data=0x0) at controller.c:1014 #2 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 Thread 2 (Thread 0x7f0cba3d8700 (LWP 5075)): #0 0x00007f0cc7fc7408 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007f0cba5fb2db in bb_sleep (state_ptr=0x7f0cba801880 <bb_state>, add_secs=30) at burst_buffer_common.c:926 #2 0x00007f0cba5e917b in _bb_agent (args=0x0) at burst_buffer_cray.c:413 #3 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 #4 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 Thread 1 (Thread 0x7f0cbaa03700 (LWP 5065)): #0 _free_step_rec (step_ptr=0x308bfe0) at step_mgr.c:313 #1 0x00000000004d8b7c in delete_step_record (job_ptr=0x305d960, step_id=4294967295) at step_mgr.c:374 #2 0x00007f0cc6dc40f9 in _step_fini (args=0x308bfe0) at select_cray.c:1173 #3 0x00007f0cc7fc30a4 in start_thread () from /lib64/libpthread.so.0 
#4 0x00007f0cc7cf904d in clone () from /lib64/libc.so.6 (gdb) (gdb) print *step_ptr $1 = { batch_step = 0, ckpt_interval = 0, check_job = 0x0, ckpt_dir = 0x0, ckpt_time = 0, core_bitmap_job = 0x0, cpu_count = 0, cpu_freq_min = 0, cpu_freq_max = 0, cpu_freq_gov = 0, cpus_per_task = 0, cyclic_alloc = 0, exclusive = 0, exit_code = 4294967295, exit_node_bitmap = 0x308c2a0, ext_sensors = 0x308bda0, gres = 0x0, gres_list = 0x0, host = 0x0, job_ptr = 0x305d960, jobacct = 0x308c120, pn_min_memory = 0, name = 0x305a0d0 "extern", network = 0x0, no_kill = 0 '\000', port = 0, pre_sus_time = 0, start_protocol_ver = 7680, resv_port_array = 0x0, resv_port_cnt = 0, resv_ports = 0x0, requid = 4294967295, start_time = 1469026625, time_limit = 4294967295, select_jobinfo = 0x306bce0, state = 1, step_id = 4294967295, step_layout = 0x0, step_node_bitmap = 0x0, switch_job = 0x308be50, time_last_active = 0, tot_sus_time = 0, tres_alloc_str = 0x308bd60 "1=96000,2=187392000,4=1500", tres_fmt_alloc_str = 0x0 } (gdb) Will keep investigating. Tried using cleaner patch, but it crashes similarly. -Doug
Hey Doug, this patch should get you going... diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 3112a71..6ce90a5 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -310,8 +310,10 @@ static void _free_step_rec(struct step_record *step_ptr) * and not upon record purging. Presently both events occur * simultaneously. */ if (step_ptr->switch_job) { - switch_g_job_step_complete(step_ptr->switch_job, - step_ptr->step_layout->node_list); + if (step_ptr->step_layout) + switch_g_job_step_complete( + step_ptr->switch_job, + step_ptr->step_layout->node_list); switch_g_free_jobinfo (step_ptr->switch_job); } resv_port_free(step_ptr); I am working on a proper fix right now, which this will be part of.
I was able to patch around this using a previous extension to the cleaner patch, and was able to get slurmctld to not crash (at least the cleaner run-once version). Now having issues getting a job to run (failure to configure interconnect); DW is also failing to talk, so we're looking to see if there are network issues. Will update soon. -Doug
Doug, the commit on this is 71ddc0a5730. Please reopen if this still happens afterwards. It sounds like you are having other issues not related to this, though.
OK, we've gotten through this using a doubly-patched cleaner slurmctld. There are some residual issues and active steps I'll start pulling apart; I will get those filed once we're back in action. -Doug
Sounds good.
*** Ticket 2935 has been marked as a duplicate of this ticket. ***