Created attachment 9437 [details]
cgroup config
Created attachment 9438 [details]
slurmctld log
Created attachment 9439 [details]
slurmd log
Davide,
Is the slurmstepd process (or the job processes) in D state when you look at it with ps?
> ps -ef
--Nate
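For reference, a minimal way to check for processes stuck in the D (uninterruptible sleep) state, assuming the default procps column layout; this is a sketch, not a command from the ticket:
> ps -eo pid,ppid,stat,comm | awk '$3 ~ /^D/'
Anything listed is blocked inside the kernel (usually on I/O) and will not respond to signals until the blocking call returns.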
Nate,
Here is the output you requested:
# ps -ef | grep 6892778
root 32870 25984 0 10:25 pts/0 00:00:00 grep --color=auto 6892778
root 34912 1 0 Mar03 ? 00:00:09 slurmstepd: [6892778.extern]
Here is a strace output from that process:
# strace -p 34912
strace: Process 34912 attached
restart_syscall(<... resuming interrupted nanosleep ...>) = 0
open("/etc/hosts", O_RDONLY|O_CLOEXEC) = 9
fstat(9, {st_mode=S_IFREG|0644, st_size=25203, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b1f8eed1000
read(9, "10.2.192.17 mako-nsd8\n10.2.1"..., 4096) = 4096
read(9, "64\trokasgate2\trokasgate2.vampire"..., 4096) = 4096
read(9, "\tcn1485.vampire\n10.0.14.76\tcn148"..., 4096) = 4096
read(9, "vampire\n10.0.4.85\tcn1085\tcn1085."..., 4096) = 4096
read(9, "5\tcn395.vampire\n10.0.24.96\tcn396"..., 4096) = 4096
read(9, "\tgpu0020.vampire\n10.0.32.166\tcn1"..., 4096) = 4096
read(9, "10.0.24.18\tcn318\tcn318.vampire\n1"..., 4096) = 627
read(9, "", 4096) = 0
close(9) = 0
munmap(0x2b1f8eed1000, 4096) = 0
socket(AF_INET, SOCK_DGRAM|SOCK_NONBLOCK, IPPROTO_IP) = 9
connect(9, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.0.0.1")}, 16) = 0
poll([{fd=9, events=POLLOUT}], 1, 0) = 1 ([{fd=9, revents=POLLOUT}])
sendto(9, "\216\221\1\0\0\1\0\0\0\0\0\0\tsched-vip\7vampire\0\0"..., 35, MSG_NOSIGNAL, NULL, 0) = 35
poll([{fd=9, events=POLLIN}], 1, 1000) = 1 ([{fd=9, revents=POLLIN}])
ioctl(9, FIONREAD, [126]) = 0
recvfrom(9, "\216\221\205\200\0\1\0\1\0\2\0\2\tsched-vip\7vampire\0\0"..., 1024, 0, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.0.0.1")}, [16]) = 126
close(9) = 0
socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) = 9
fcntl(9, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(9, F_SETFL, O_RDWR|O_NONBLOCK) = 0
connect(9, {sa_family=AF_INET, sin_port=htons(6817), sin_addr=inet_addr("10.0.23.105")}, 16) = -1 EINPROGRESS (Operation now in progress)
poll([{fd=9, events=POLLIN|POLLOUT}], 1, 2000) = 0 (Timeout)
close(9) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [HUP INT QUIT USR1 USR2 PIPE ALRM TERM TSTP], 8) = 0
rt_sigaction(SIGCHLD, NULL, {SIG_DFL, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [HUP INT QUIT USR1 USR2 PIPE ALRM TERM TSTP], NULL, 8) = 0
nanosleep({1, 0}, 0x7ffd18822960) = 0
socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) = 9
fcntl(9, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(9, F_SETFL, O_RDWR|O_NONBLOCK) = 0
connect(9, {sa_family=AF_INET, sin_port=htons(6817), sin_addr=inet_addr("10.0.23.105")}, 16) = -1 EINPROGRESS (Operation now in progress)
poll([{fd=9, events=POLLIN|POLLOUT}], 1, 2000) = 1 ([{fd=9, revents=POLLOUT}])
getsockopt(9, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
fcntl(9, F_SETFL, O_RDWR) = 0
fcntl(9, F_SETFD, FD_CLOEXEC) = 0
rt_sigaction(SIGALRM, {SIG_DFL, [ALRM], SA_RESTORER, 0x2b1f8fdf65e0}, {SIG_DFL, [ALRM], SA_RESTORER, 0x2b1f8fdf65e0}, 8) = 0
stat("/var/run/munge/munge.socket.2", {st_mode=S_IFSOCK|0777, st_size=0, ...}) = 0
socket(AF_LOCAL, SOCK_STREAM, 0) = 15
fcntl(15, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(15, F_SETFL, O_RDWR|O_NONBLOCK) = 0
connect(15, {sa_family=AF_LOCAL, sun_path="/var/run/munge/munge.socket.2"}, 110) = 0
writev(15, [{"\0`mK\4\2\0\0\0\0\24", 11}, {"\1\1\1\0\0\0\0\0\377\377\377\377\377\377\377\377\0\0\0\0", 20}], 2) = 31
read(15, 0x7ffd18822930, 11) = -1 EAGAIN (Resource temporarily unavailable)
poll([{fd=15, events=POLLIN}], 1, 3000) = 1 ([{fd=15, revents=POLLIN|POLLHUP}])
read(15, "\0`mK\4\3\0\0\0\0\206", 11) = 11
read(15, "\0\0\0\0\0\200MUNGE:AwQDAACm9AF8rBQf6CM2"..., 134) = 134
close(15) = 0
rt_sigaction(SIGALRM, {SIG_DFL, [ALRM], SA_RESTORER, 0x2b1f8fdf65e0}, {SIG_DFL, [ALRM], SA_RESTORER, 0x2b1f8fdf65e0}, 8) = 0
rt_sigaction(SIGPIPE, {SIG_IGN, [PIPE], SA_RESTORER, 0x2b1f8fdf65e0}, {SIG_IGN, [PIPE], SA_RESTORER, 0x2b1f8fdf65e0}, 8) = 0
fcntl(9, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(9, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(9, F_SETFL, O_RDWR|O_NONBLOCK) = 0
poll([{fd=9, events=POLLOUT}], 1, 20000) = 1 ([{fd=9, revents=POLLOUT}])
recvfrom(9, 0x7ffd188229e0, 1, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
sendto(9, "\0\0\0052", 4, 0, NULL, 0) = 4
fcntl(9, F_SETFL, O_RDWR) = 0
fcntl(9, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(9, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(9, F_SETFL, O_RDWR|O_NONBLOCK) = 0
poll([{fd=9, events=POLLOUT}], 1, 20000) = 1 ([{fd=9, revents=POLLOUT}])
recvfrom(9, 0x7ffd188229e0, 1, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
sendto(9, "!\0\0\0\0\0\23\230\0\0\4\205\0\0\0\0\0\0\0\0\0\0\0\0\0\vauth/m"..., 1330, 0, NULL, 0) = 1330
fcntl(9, F_SETFL, O_RDWR) = 0
rt_sigaction(SIGPIPE, {SIG_IGN, [PIPE], SA_RESTORER, 0x2b1f8fdf65e0}, {SIG_IGN, [PIPE], SA_RESTORER, 0x2b1f8fdf65e0}, 8) = 0
fcntl(9, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(9, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(9, F_SETFL, O_RDWR|O_NONBLOCK) = 0
poll([{fd=9, events=POLLIN}], 1, 20000) = 0 (Timeout)
fcntl(9, F_SETFL, O_RDWR) = 0
close(9) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [HUP INT QUIT USR1 USR2 PIPE ALRM TERM TSTP], 8) = 0
rt_sigaction(SIGCHLD, NULL, {SIG_DFL, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [HUP INT QUIT USR1 USR2 PIPE ALRM TERM TSTP], NULL, 8) = 0
nanosleep({60, 0},
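The trace above shows the extern step connecting to the controller address (10.0.23.105:6817), sending its RPC, timing out after the 20-second poll for a reply, then sleeping 60 seconds and retrying. A quick, hedged way to check from a node whether slurmctld is answering RPCs at all (a sketch, assuming the standard Slurm client tools are installed on the node):
> timeout 30 scontrol ping
> timeout 30 sdiag | head
If these also hang until the timeout, the controller is accepting connections but not servicing them.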
(In reply to Davide Vanzo from comment #5)
> Nate,
>
> Here is the output you requested:

Looking at your logs now.

(In reply to Nate Rini from comment #7)
> (In reply to Davide Vanzo from comment #5)
> > Nate,
> >
> > Here is the output you requested:
>
> Looking at your logs now.

Looks like your node is hung, probably due to OOM:
> [2019-03-05T08:14:51.009] [6909292.extern] _oom_event_monitor: oom-kill event count: 1
> [2019-03-05T08:15:14.735] [6909284.extern] Sent SIGKILL signal to 6909284.4294967295

Can you please provide the "ps -ef" output for the job processes. Can you also try to 'kill -9 $JOB_PIDS'? If SIGKILL isn't working, then there is a problem with that node's kernel and I would suggest that the node be core dumped and sent to your systems integrator.

Looks like your nodes are effectively DOSing the slurmctld with steps:
> [2019-03-05T09:16:17.522] sched: 256 pending RPCs at cycle end, consider configuring max_rpc_cnt
> [2019-03-05T09:16:17.522] step_partial_comp: JobId=6915560 StepID=4294967295 invalid
> [2019-03-05T09:16:17.522] server_thread_count over limit (256), waiting

Can you please provide the output of this:
> sdiag

Created attachment 9441 [details]
ps output
(In reply to Davide Vanzo from comment #9)
> Created attachment 9441 [details]
> ps output

Can we also get the following?
> ps auxfT

Created attachment 9442 [details]
ps auxfT output
Nate,

Attached you can find the output you requested.
I cannot provide the output of sdiag since it fails with a socket timeout.

DV

As for killing the slurmstepd process, "kill -9" works as expected. We are actually going through all the nodes and killing the stuck slurmstepd this way, in the hope that this will help get a responsive controller.

DV

This should not be happening:
> root 31435 0.0 0.0 268352 3824 ? Sl Mar03 0:09 slurmstepd: [6892541.batch]
> root 29526 0.0 0.0 0 0 ? Z 10:12 0:00 \_ [sh] <defunct>

The strace shows the daemon is getting stuck trying to send updates (probably step updates) to the slurmctld.

(In reply to Davide Vanzo from comment #12)
> Attached you can find the output you requested.
> I cannot provide the output of sdiag since it fails with socket timeout.

Please restart your slurmctld process.

I have restarted it but the situation remains the same.

(In reply to Davide Vanzo from comment #15)
> I have restarted it but the situation remains the same.

Then please kill all of the slurmstepd that have zombie children. Also please try to call sdiag a few times to see if it starts responding.

The other problem is that killing the stuck slurmstepd on the compute nodes is not helping, since slurmctld keeps sending TERMINATE_JOB requests for the same job. That's because the slurmd cannot communicate back to the controller. If I try to restart the slurmd, it is unable to register with the controller because of a socket timeout. This persists even after restarting slurmctld.

Here is an sdiag output:
*******************************************************
sdiag output at Tue Mar 05 11:19:38 2019 (1551806378)
Data since Tue Mar 05 11:19:30 2019 (1551806370)
*******************************************************
Server thread count: 256
Agent queue size: 481
Agent count: 2
DBD Agent queue size: 2
Jobs submitted: 0
Jobs started: 0
Jobs completed: 1
Jobs canceled: 0
Jobs failed: 0
Job states ts: Tue Mar 05 11:19:32 2019 (1551806372)
Jobs pending: 14130
Jobs running: 2511
Main schedule statistics (microseconds):
Last cycle: 1296031
Max cycle: 1380003
Total cycles: 7
Mean cycle: 1307038
Mean depth cycle: 72
Last queue length: 4446
Backfilling stats
Total backfilled jobs (since last slurm start): 0
Total backfilled jobs (since last stats cycle start): 0
Total backfilled heterogeneous job components: 0
Total cycles: 0
Last cycle when: Wed Dec 31 18:00:00 1969 (0)
Last cycle: 0
Max cycle: 0
Last depth cycle: 0
Last depth cycle (try sched): 0
Last queue length: 0
Latency for gettimeofday() (x1000): 24 microseconds
Remote Procedure Call statistics by message type
REQUEST_FED_INFO ( 2049) count:19 ave_time:157 total_time:2986
REQUEST_COMPLETE_BATCH_SCRIPT ( 5018) count:11 ave_time:3444720 total_time:37891928
REQUEST_STEP_COMPLETE ( 5016) count:11 ave_time:4001379 total_time:44015171
REQUEST_JOB_INFO ( 2003) count:10 ave_time:530466 total_time:5304664
MESSAGE_EPILOG_COMPLETE ( 6012) count:7 ave_time:4953337 total_time:34673362
REQUEST_JOB_INFO_SINGLE ( 2021) count:6 ave_time:8683682 total_time:52102092
REQUEST_STATS_INFO ( 2035) count:2 ave_time:224 total_time:448
REQUEST_SUBMIT_BATCH_JOB ( 4003) count:0 ave_time:0 total_time:0
REQUEST_JOB_USER_INFO ( 2039) count:0 ave_time:0 total_time:0
Remote Procedure Call statistics by user
root ( 0) count:31 ave_time:3760674 total_time:116580909
ryderkh ( 483404) count:23 ave_time:204963 total_time:4714149
wangj57 ( 674623) count:6 ave_time:5797567 total_time:34785406
ryderld ( 483348) count:3 ave_time:197703 total_time:593111
vuiis_archive_singularity( 9211) count:2 ave_time:4215487 total_time:8430975
weisszy ( 742600) count:1 ave_time:8886101 total_time:8886101
cartonf ( 762584) count:0 ave_time:0 total_time:0
cmspilot ( 9201) count:0 ave_time:0 total_time:0
yangah ( 497281) count:0 ave_time:0 total_time:0
Pending RPC statistics
REQUEST_TERMINATE_JOB ( 6011) count:491
Pending RPCs
1: REQUEST_TERMINATE_JOB cn1406
2: REQUEST_TERMINATE_JOB cn401
3: REQUEST_TERMINATE_JOB cn1394
4: REQUEST_TERMINATE_JOB cn1093
5: REQUEST_TERMINATE_JOB cn1412
6: REQUEST_TERMINATE_JOB cn1369
7: REQUEST_TERMINATE_JOB cn312
8: REQUEST_TERMINATE_JOB cn1414
9: REQUEST_TERMINATE_JOB cn1344
10: REQUEST_TERMINATE_JOB cn1342
11: REQUEST_TERMINATE_JOB cn1317
12: REQUEST_TERMINATE_JOB cn1274
13: REQUEST_TERMINATE_JOB cn1305
14: REQUEST_TERMINATE_JOB cn212
15: REQUEST_TERMINATE_JOB cn1273
16: REQUEST_TERMINATE_JOB cn1340
17: REQUEST_TERMINATE_JOB cn1310
18: REQUEST_TERMINATE_JOB cn437
19: REQUEST_TERMINATE_JOB cn391
20: REQUEST_TERMINATE_JOB cn1421
21: REQUEST_TERMINATE_JOB cn1286
22: REQUEST_TERMINATE_JOB cn354
23: REQUEST_TERMINATE_JOB ng1106
24: REQUEST_TERMINATE_JOB cn1303
25: REQUEST_TERMINATE_JOB cn1405
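For reference, one way to spot the slurmstepd processes that still have zombie (defunct) children, which are the ones requested to be killed above; a hedged sketch assuming GNU procps:
> for pid in $(pgrep -f 'slurmstepd:'); do
>     ps --ppid "$pid" -o stat= | grep -q '^Z' && ps -p "$pid" -o pid=,cmd=
> done
Each PID printed is a candidate for the manual 'kill -9' described in the comments above.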
(In reply to Davide Vanzo from comment #17)
> The other problem is that killing the stuck slurmstepd on the compute nodes
> is not helping since slurmctld keeps sending TERMINATE_JOB requests for that
> same job. That's because the slurmd cannot communicate back to the
> controller. If I try to restart the slurmd, it is unable to register with
> the controller because of socket timeout. This persists even after
> restarting slurmctld.

Please down all of your nodes with zombies (you may have to retry a few times):
> scontrol update nodename=<node_name> state=down reason="Jobs failing to get killed"

Ok, after killing all zombie `slurmctld` the controller finally got rid of all RPCs and now it seems back to normal. Do you have any idea what could have been the cause of this?

Sorry, I meant `slurmstepd` in the previous message.

Davide - We just wanted to follow up with you and see how things are progressing now that it has been a little over one hour.

I have kept the controller under observation and now everything seems back to normal. The only unusual message is the following, although it does not seem to affect the responsiveness of the scheduler:
[2019-03-05T13:20:17.981] Warning: Note very large processing time from _slurmctld_background: usec=3024622 began=13:20:14.956

(In reply to Davide Vanzo from comment #23)
> I have kept the controller under observation and now everything seems back
> to normal. The only unusual message is the following, although it does not
> seem to affect the responsiveness of the scheduler:

Lowering the severity of this ticket per your response.

> [2019-03-05T13:20:17.981] Warning: Note very large processing time from
> _slurmctld_background: usec=3024622 began=13:20:14.956

Can you please turn up logging on slurmctld:
> scontrol setdebug debug3

Please upload slurmctld logs if it happens again.
--Nate

Created attachment 9443 [details]
slurmctld debug3 log
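As a follow-up to the scontrol update command above, a minimal loop sketch for downing a list of affected nodes; NODELIST is a hypothetical file with one node name per line, not something taken from this ticket:
> while read -r node; do
>     scontrol update nodename="$node" state=down reason="Jobs failing to get killed"
> done < NODELIST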
Nate,

I have added a 3-minute snapshot of the slurmctld log with debug level 7. I could not keep it longer since it was causing socket timeouts again.
Those warnings are now getting more frequent and the commands are sometimes timing out.

(In reply to Davide Vanzo from comment #26)
> Nate,
>
> I have added a 3 minutes snapshot of the slurmctld log with debug level 7. I
> could not keep it longer since it was causing socket timeouts again.
> Those warnings are now getting more frequent and the commands are showing
> timeouts sometimes.

Can you call sdiag again?

# sdiag
*******************************************************
sdiag output at Tue Mar 05 14:38:58 2019 (1551818338)
Data since Tue Mar 05 11:23:47 2019 (1551806627)
*******************************************************
Server thread count: 34
Agent queue size: 48
Agent count: 0
DBD Agent queue size: 0
Jobs submitted: 8452
Jobs started: 8610
Jobs completed: 6617
Jobs canceled: 1433
Jobs failed: 2
Job states ts: Tue Mar 05 14:38:55 2019 (1551818335)
Jobs pending: 12313
Jobs running: 3462
Main schedule statistics (microseconds):
Last cycle: 1925262
Max cycle: 5013003
Total cycles: 9544
Mean cycle: 78139
Mean depth cycle: 105
Cycles per minute: 48
Last queue length: 3808
Backfilling stats
Total backfilled jobs (since last slurm start): 3272
Total backfilled jobs (since last stats cycle start): 3272
Total backfilled heterogeneous job components: 0
Total cycles: 72
Last cycle when: Tue Mar 05 14:36:39 2019 (1551818199)
Last cycle: 67351901
Max cycle: 114584051
Mean cycle: 37879036
Last depth cycle: 3832
Last depth cycle (try sched): 168
Depth Mean: 3464
Depth Mean (try depth): 156
Last queue length: 3800
Queue length mean: 3856
Latency for gettimeofday() (x1000): 46 microseconds
Remote Procedure Call statistics by message type
REQUEST_PARTITION_INFO ( 2009) count:46832 ave_time:207 total_time:9709082
REQUEST_JOB_INFO ( 2003) count:38263 ave_time:4402897 total_time:168468073891
REQUEST_FED_INFO ( 2049) count:38198 ave_time:278 total_time:10656046
REQUEST_JOB_INFO_SINGLE ( 2021) count:10264 ave_time:4689767 total_time:48135770260
MESSAGE_EPILOG_COMPLETE ( 6012) count:9065 ave_time:4015404 total_time:36399645237
REQUEST_COMPLETE_PROLOG ( 6018) count:8565 ave_time:392335 total_time:3360351671
REQUEST_COMPLETE_BATCH_SCRIPT ( 5018) count:8147 ave_time:11137723 total_time:90739037135
REQUEST_STEP_COMPLETE ( 5016) count:8110 ave_time:7430777 total_time:60263607967
MESSAGE_NODE_REGISTRATION_STATUS ( 1002) count:4807 ave_time:1238519 total_time:5953564314
REQUEST_SUBMIT_BATCH_JOB ( 4003) count:4359 ave_time:5528213 total_time:24097481459
REQUEST_NODE_INFO ( 2007) count:2908 ave_time:331351 total_time:963571100
REQUEST_JOB_USER_INFO ( 2039) count:1545 ave_time:4458412 total_time:6888246749
REQUEST_KILL_JOB ( 5032) count:1460 ave_time:458577 total_time:669522605
REQUEST_JOB_STEP_CREATE ( 5001) count:84 ave_time:603755 total_time:50715441
REQUEST_JOB_PACK_ALLOC_INFO ( 4027) count:64 ave_time:895780 total_time:57329931
REQUEST_JOB_REQUEUE ( 5023) count:24 ave_time:14249 total_time:341979
REQUEST_UPDATE_NODE ( 3002) count:17 ave_time:2188597 total_time:37206165
REQUEST_JOB_READY ( 4019) count:16 ave_time:525303 total_time:8404860
REQUEST_STATS_INFO ( 2035) count:11 ave_time:366 total_time:4031
REQUEST_RESOURCE_ALLOCATION ( 4001) count:8 ave_time:11453958 total_time:91631667
REQUEST_SET_DEBUG_LEVEL ( 1010) count:8 ave_time:10982265 total_time:87858125
REQUEST_CANCEL_JOB_STEP ( 5005) count:4 ave_time:3817881 total_time:15271526
REQUEST_PING ( 1008) count:3 ave_time:196 total_time:589
REQUEST_JOB_ALLOCATION_INFO ( 4014) count:2 ave_time:203 total_time:407
REQUEST_COMPLETE_JOB_ALLOCATION ( 5017) count:2 ave_time:648750 total_time:1297501
Remote Procedure Call statistics by user
root ( 0) count:66925 ave_time:2952506 total_time:197596515575
ryderkh ( 483404) count:50684 ave_time:2117145 total_time:107305408118
buildbot ( 9206) count:22749 ave_time:132184 total_time:3007056754
cmspilot ( 9201) count:11257 ave_time:7761117 total_time:87366902948
ryderld ( 483348) count:8490 ave_time:2098260 total_time:17814234246
cartonf ( 762584) count:5670 ave_time:2264604 total_time:12840308283
hanl3 ( 664085) count:3899 ave_time:245437 total_time:956960073
vuiis_archive_singularity( 9211) count:3787 ave_time:243902 total_time:923657463
wangj57 ( 674623) count:2386 ave_time:1664445 total_time:3971367202
vanzod ( 389801) count:2280 ave_time:73035 total_time:166520587
weisszy ( 742600) count:1534 ave_time:1156463 total_time:1774014442
masapps ( 789320) count:537 ave_time:8396122 total_time:4508717924
autocms ( 9209) count:510 ave_time:1268545 total_time:646958049
lcgadmin ( 9204) count:215 ave_time:6644810 total_time:1428634226
uscmslocal ( 9203) count:200 ave_time:8709921 total_time:1741984227
vuiisccidev ( 5010) count:179 ave_time:5375555 total_time:962224485
masispiderdev ( 5009) count:160 ave_time:343156 total_time:54905037
boydb1 ( 177470) count:118 ave_time:2055822 total_time:242587084
yex3 ( 770085) count:89 ave_time:1598621 total_time:142277353
fany4 ( 756088) count:77 ave_time:3199339 total_time:246349173
vuiis_daily_singularity( 9210) count:76 ave_time:1724305 total_time:131047200
brownbp1 ( 353986) count:55 ave_time:7699074 total_time:423449109
tangz3 ( 260643) count:53 ave_time:853714 total_time:45246878
appelte1 ( 111694) count:47 ave_time:1017642 total_time:47829212
smiths21 ( 177663) count:47 ave_time:16906 total_time:794593
youg ( 780913) count:41 ave_time:490187 total_time:20097706
gaoy3 ( 93631) count:38 ave_time:869990 total_time:33059638
mothcw ( 20003) count:38 ave_time:7764888 total_time:295065764
chenz27 ( 747109) count:36 ave_time:3875531 total_time:139519148
kendalak ( 19853) count:31 ave_time:6818 total_time:211379
larsonke ( 645142) count:31 ave_time:1812314 total_time:56181735
kochenma ( 154990) count:29 ave_time:5597186 total_time:162318406
chenl11 ( 233286) count:28 ave_time:586499 total_time:16421985
rubinom ( 752904) count:28 ave_time:6379644 total_time:178630044
luy5 ( 423460) count:25 ave_time:2244263 total_time:56106577
yangah ( 497281) count:24 ave_time:269847 total_time:6476342
tetream1 ( 759143) count:23 ave_time:432057 total_time:9937332
mahone1 ( 754502) count:21 ave_time:2114742 total_time:44409591
labellal ( 693461) count:20 ave_time:137575 total_time:2751515
drakekr ( 518360) count:19 ave_time:2705064 total_time:51396224
shueymm ( 170568) count:19 ave_time:930886 total_time:17686841
mendenjl ( 136468) count:17 ave_time:1647876 total_time:28013899
fongsl ( 505319) count:14 ave_time:11332834 total_time:158659685
colbrall ( 242400) count:14 ave_time:7610379 total_time:106545313
liuf8 ( 776869) count:12 ave_time:496552 total_time:5958625
vandeks ( 777942) count:12 ave_time:806033 total_time:9672400
nakagat1 ( 175929) count:12 ave_time:49074 total_time:588892
ronymw ( 684864) count:12 ave_time:2614875 total_time:31378506
opokubc ( 333903) count:12 ave_time:50454 total_time:605450
hirboj ( 222580) count:11 ave_time:2411294 total_time:26524242
phant1 ( 791680) count:11 ave_time:400511 total_time:4405628
hettk ( 782611) count:10 ave_time:5323810 total_time:53238109
ahmedha ( 793676) count:10 ave_time:23374 total_time:233745
wangj52 ( 606781) count:10 ave_time:15452 total_time:154520
daix4 ( 770985) count:9 ave_time:1808693 total_time:16278241
clarklc ( 87722) count:9 ave_time:4143369 total_time:37290325
scholzmb ( 421502) count:8 ave_time:65009 total_time:520075
schoedct ( 458021) count:8 ave_time:3384854 total_time:27078838
smithst ( 237602) count:7 ave_time:10175115 total_time:71225806
jashnsh ( 675240) count:7 ave_time:29776119 total_time:208432838
weiq1 ( 200524) count:7 ave_time:66574 total_time:466018
hoangtt ( 347007) count:7 ave_time:14593 total_time:102154
sealocjm ( 638884) count:7 ave_time:2190477 total_time:15333339
strickt2 ( 175381) count:6 ave_time:42044 total_time:252264
pingj2 ( 686562) count:6 ave_time:1151584 total_time:6909507
hej4 ( 121574) count:5 ave_time:13986 total_time:69930
abraha1 ( 353983) count:5 ave_time:1236054 total_time:6180273
wangj36 ( 337100) count:5 ave_time:312348 total_time:1561743
sotocs ( 574202) count:4 ave_time:30997 total_time:123990
mukhes1 ( 479945) count:4 ave_time:1185148 total_time:4740593
garnetj ( 112602) count:4 ave_time:56857 total_time:227429
zhans23 ( 768664) count:4 ave_time:3337852 total_time:13351410
bentonml ( 344973) count:3 ave_time:2624886 total_time:7874658
baiy7 ( 794686) count:3 ave_time:3162 total_time:9487
shamappk ( 649321) count:3 ave_time:628 total_time:1884
tuos ( 126986) count:3 ave_time:668 total_time:2004
niarchm ( 787052) count:3 ave_time:495810 total_time:1487430
shashin ( 760018) count:3 ave_time:59671 total_time:179014
joshin ( 688481) count:2 ave_time:11415425 total_time:22830850
luoh3 ( 649317) count:1 ave_time:31434 total_time:31434
kenkelb ( 248940) count:1 ave_time:4538722 total_time:4538722
Pending RPC statistics
REQUEST_TERMINATE_JOB ( 6011) count:32
REQUEST_LAUNCH_PROLOG ( 6017) count:8
REQUEST_BATCH_JOB_LAUNCH ( 4005) count:8
Pending RPCs
1: REQUEST_TERMINATE_JOB cn332
2: REQUEST_TERMINATE_JOB cn405
3: REQUEST_LAUNCH_PROLOG cn1126
4: REQUEST_BATCH_JOB_LAUNCH cn1126
5: REQUEST_LAUNCH_PROLOG cn1092
6: REQUEST_BATCH_JOB_LAUNCH cn1092
7: REQUEST_LAUNCH_PROLOG cn1093
8: REQUEST_BATCH_JOB_LAUNCH cn1093
9: REQUEST_LAUNCH_PROLOG cn1411
10: REQUEST_BATCH_JOB_LAUNCH cn1411
11: REQUEST_LAUNCH_PROLOG cn1412
12: REQUEST_BATCH_JOB_LAUNCH cn1412
13: REQUEST_LAUNCH_PROLOG cn456
14: REQUEST_BATCH_JOB_LAUNCH cn456
15: REQUEST_LAUNCH_PROLOG cn474
16: REQUEST_BATCH_JOB_LAUNCH cn474
17: REQUEST_LAUNCH_PROLOG cn489
18: REQUEST_BATCH_JOB_LAUNCH cn489
19: REQUEST_TERMINATE_JOB cn1401
20: REQUEST_TERMINATE_JOB cn1310
21: REQUEST_TERMINATE_JOB cn1310
22: REQUEST_TERMINATE_JOB cn1312
23: REQUEST_TERMINATE_JOB cn1312
24: REQUEST_TERMINATE_JOB cn1331
25: REQUEST_TERMINATE_JOB cn1127
By looking at the past logs I have also noticed that the following error appeared in the last month or so, and last night we got a lot of them. I do not know if it is in any way related to the issue we observed, but I thought it was worth reporting and hopefully getting some clarification on its meaning from you.

[2019-03-05T00:40:23.802] error: select/cons_res: node cn1235 memory is under-allocated (9536-65536) for JobId=6914042

(In reply to Davide Vanzo from comment #30)
> By looking at the past logs I have also noticed that the following error
> appeared in the last month or so and last night we got a lot of them. I do
> not know if it is in any way related to the issue we observed but I thought
> it was worth reporting it and hopefully get some clarification on its
> meaning from you.

I don't currently think this is the cause of your issue this morning, but we should try to work on optimizing your scheduler configuration. We should look at changing these values in SchedulerParameters:
> batch_sched_delay=10
> sched_max_job_start=200
> max_rpc_cnt=400

Can you please change them and send in a debug log?
--Nate

Can you please provide this output:
> sacctmgr show stats
$ sacctmgr show stats
Rollup statistics
Hour count:8 ave_time:4279414 max_time:5110583 total_time:34235319
Day count:8 ave_time:1329805 max_time:2724320 total_time:10638444
Month count:0 ave_time:0 max_time:0 total_time:0
Remote Procedure Call statistics by message type
DBD_STEP_START ( 1442) count:42863 ave_time:353 total_time:15154512
DBD_STEP_COMPLETE ( 1441) count:42815 ave_time:458 total_time:19634303
DBD_SEND_MULT_MSG ( 1474) count:17433 ave_time:1914 total_time:33370022
DBD_JOB_COMPLETE ( 1424) count:16948 ave_time:310 total_time:5269705
DBD_FINI ( 1401) count:14359 ave_time:138 total_time:1982553
SLURM_PERSIST_INIT ( 6500) count:11291 ave_time:505 total_time:5705789
DBD_GET_JOBS_COND ( 1444) count:8208 ave_time:2821 total_time:23158786
DBD_JOB_START ( 1425) count:6571 ave_time:346 total_time:2274441
DBD_SEND_MULT_JOB_START ( 1472) count:3479 ave_time:4614 total_time:16052534
DBD_GET_ACCOUNTS ( 1409) count:2972 ave_time:483 total_time:1435998
DBD_GET_TRES ( 1486) count:1318 ave_time:134 total_time:177159
DBD_NODE_STATE ( 1432) count:619 ave_time:195 total_time:120911
DBD_GET_ASSOCS ( 1410) count:106 ave_time:368049 total_time:39013212
DBD_GET_QOS ( 1448) count:106 ave_time:342 total_time:36297
DBD_CLUSTER_TRES ( 1407) count:99 ave_time:704 total_time:69760
DBD_GET_FEDERATIONS ( 1494) count:5 ave_time:307 total_time:1535
DBD_GET_USERS ( 1415) count:5 ave_time:75653 total_time:378268
DBD_GET_RES ( 1478) count:4 ave_time:227 total_time:910
DBD_REGISTER_CTLD ( 1434) count:4 ave_time:611 total_time:2445
Remote Procedure Call statistics by user
slurm ( 59229) count:130860 ave_time:713 total_time:93414212
cartonf ( 762584) count:30471 ave_time:685 total_time:20893673
cmspilot ( 9201) count:4470 ave_time:674 total_time:3013286
vuiis_archive_singularity( 9211) count:1548 ave_time:856 total_time:1326078
root ( 0) count:650 ave_time:58860 total_time:38259148
autocms ( 9209) count:324 ave_time:16565 total_time:5367114
lcgadmin ( 9204) count:252 ave_time:835 total_time:210654
buildbot ( 9206) count:222 ave_time:962 total_time:213779
uscmslocal ( 9203) count:144 ave_time:700 total_time:100815
vuiisccidev ( 5010) count:45 ave_time:1179 total_time:53083
boydb1 ( 177470) count:27 ave_time:855 total_time:23100
vuiis_daily_singularity( 9210) count:27 ave_time:811 total_time:21913
chenz27 ( 747109) count:20 ave_time:267 total_time:5352
hanl3 ( 664085) count:20 ave_time:250 total_time:5011
liuf8 ( 776869) count:18 ave_time:32856 total_time:591412
jashnsh ( 675240) count:16 ave_time:252 total_time:4045
ronymw ( 684864) count:16 ave_time:255 total_time:4095
labellal ( 693461) count:12 ave_time:240 total_time:2886
vuot2 ( 491383) count:12 ave_time:1072 total_time:12869
vanzod ( 389801) count:11 ave_time:14169 total_time:155865
masispiderdev ( 5009) count:9 ave_time:850 total_time:7658
hej4 ( 121574) count:8 ave_time:251 total_time:2015
mahone1 ( 754502) count:6 ave_time:958 total_time:5752
breediml ( 657618) count:4 ave_time:241 total_time:965
chakrs1 ( 338343) count:4 ave_time:259 total_time:1036
abraha1 ( 353983) count:3 ave_time:13927 total_time:41781
opokubc ( 333903) count:3 ave_time:962 total_time:2887
ahmedha ( 793676) count:3 ave_time:32885 total_time:98656
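For reference, the SchedulerParameters change requested a few comments back would look roughly like this in slurm.conf; this is a sketch only, and any options already present on that line should be preserved:
> SchedulerParameters=batch_sched_delay=10,sched_max_job_start=200,max_rpc_cnt=400
SchedulerParameters takes a single comma-separated list, so the three values are appended to it rather than set on separate lines.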
Davide,

Do you have any connection flow logs that we could use to see how many connections were being attempted to your slurmctld?

Thanks,
--Nate

Nate,

I have applied the changes you requested to the scheduler configuration. Unfortunately the frequency of the very large processing time warnings has not changed.

Do you have any insight regarding the cause of the under-allocated memory error messages?

As for the connection flow logs, how can we collect them?

Davide

(In reply to Davide Vanzo from comment #38)
> I have applied the changes you requested to the scheduler configuration.
> Unfortunately the frequency of the very large processing time warnings has
> not changed.
>
> Do you have any insight regarding the cause of the under-allocated memory
> error messages?

Can you please provide the following output along with your slurmctld logs in debug2 (or higher)?
> sdiag
> sacctmgr show stats

> As for the connection flow logs, how can we collect them?

Usually your firewall or managed switch would collect them. I have used just simple iptables log rules with conntrack to collect them. If you don't already have them, then current logs will not be helpful.
--Nate

Created attachment 9484 [details]
Comment #38 requested logs

Nate,

I have attached the log and output you requested.

As for tracking connections, I am following up with our system administrators right now. Do you have any recommendation on how to set up iptables?

Davide

(In reply to Davide Vanzo from comment #41)
> I have attached the log and output you requested.

I am reviewing them now.

> As for tracking connections, I am following up with our system
> administrators right now. Do you have any recommendation on how to set
> iptables?

This has worked before:
> iptables -I OUTPUT -m conntrack --ctstate INVALID,NEW,UNTRACKED -m limit --limit 100/sec -j LOG --log-prefix NF-OUTBOUND --log-uid
> iptables -I INPUT -m conntrack --ctstate INVALID,NEW,UNTRACKED -m limit --limit 100/sec -j LOG --log-prefix NF-INBOUND --log-uid
> iptables -I FORWARD -m conntrack --ctstate INVALID,NEW,UNTRACKED -m limit --limit 100/sec -j LOG --log-prefix NF-FWD

These logs would only be useful during the event though. I take it that the event has not happened again?

Nate,

No, we have not experienced issues with the inability of Slurm to kill slurmstepd with zombie processes since then, and the causes are still unknown to us.

I still believe it would be helpful to track the connections anyway, since it can give additional insights in the future, especially when related to RPC processing.

Davide

(In reply to Davide Vanzo from comment #43)
> No, we have not experienced issues with the inability of Slurm to kill
> slurmstepd with zombie processes since then. And the causes are still
> unknown for us.

Then I will continue looking at the logs to determine the cause.

> I still believe it would be helpful to track the connections anyway since it
> can give additional insights in the future, especially when related to RPC
> processing.

There will be a performance hit on the compute nodes for logging, but it's hard to say how much. Also, you may want to make the log-prefix end with a space; the copy and paste missed that.

Oh, so you are suggesting to collect connection logs not only on the controller, but also on the compute nodes?

DV

(In reply to Davide Vanzo from comment #45)
> Oh, so you are suggesting to collect connection logs not only on the
> controller, but also on the compute nodes?
The controller logs would be more than sufficient to see how many connections are being attempted. Logs on the compute nodes would tell you how many aren't even reaching the controller.

(In reply to Davide Vanzo from comment #38)
> Do you have any insight regarding the cause of the under-allocated memory
> error messages?

Can you please provide the output of these commands:
> sacct -lp -j 6951056_6530
> sacct -lp -j 6951056_6535
> sacct -lp -j 6951056_6543
> sacct -lp -j 6951056_6552
> sacct -lp -j 6951056_6706
> sacct -lp -j 6951056_6723
> sacct -lp -j 6965971_29

Created attachment 9491 [details]
sacct output for c49
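For reference, the conntrack logging rules quoted earlier with the trailing space in the log prefix that was lost in copy and paste; a sketch for the INPUT chain only, the OUTPUT and FORWARD rules being analogous:
> iptables -I INPUT -m conntrack --ctstate INVALID,NEW,UNTRACKED -m limit --limit 100/sec -j LOG --log-prefix "NF-INBOUND " --log-uid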
(In reply to Davide Vanzo from comment #50)
> Created attachment 9491 [details]
> sacct output for c49

Can you please provide this output?
> sacctmgr show events Nodes=cn1207,cn436,cn456,cn1388,cn1355,cn1386,cn1216

(In reply to Nate Rini from comment #55)
> (In reply to Davide Vanzo from comment #50)
> > Created attachment 9491 [details]
> > sacct output for c49
>
> Can you please provide this output?
> > sacctmgr show events Nodes=cn1207,cn436,cn456,cn1388,cn1355,cn1386,cn1216

And your qos setting:
> sacctmgr show -p qos

Here is the output you requested.

$ sacctmgr -p show events Nodes=cn1207,cn436,cn456,cn1388,cn1355,cn1386,cn1216
Cluster|NodeName|TimeStart|TimeEnd|State|Reason|User|
vampire|cn1355|2019-03-06T19:32:35|2019-03-07T13:07:01|DOWN|Node unexpectedly rebooted|slurm(59229)|

$ sacctmgr show -p qos
Name|Priority|GraceTime|Preempt|PreemptMode|Flags|UsageThres|UsageFactor|GrpTRES|GrpTRESMins|GrpTRESRunMins|GrpJobs|GrpSubmit|GrpWall|MaxTRES|MaxTRESPerNode|MaxTRESMins|MaxWall|MaxTRESPU|MaxJobsPU|MaxSubmitPU|MaxTRESPA|MaxJobsPA|MaxSubmitPA|MinTRES|
normal|0|00:00:00|lowprio|cluster|||1.000000||||||||||||||||||
accre_pascal|0|00:00:00||cluster|||1.000000|gres/gpu=4|||||||||||||||||
buildbot_debug|0|00:00:00||cluster|OverPartQOS||1.000000||||||||||1-00:00:00||100|200|||||
debug|0|00:00:00||cluster|||1.000000||||||||||00:30:00||2|5|||||
lowprio|0|00:05:00||cluster|||1.000000||||||||||||||||||
nogpfs|0|00:00:00||cluster|||1.000000|cpu=9999,mem=99999999M||cpu=99999999|||||||||||||||
csb_pascal|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=60|||||||||||||||||
cms_pascal|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=1|||||||||||||||||
fe_accre_lab_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
accre_gpu_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=48|||||||||||||||||
nbody_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=32|||||||||||||||||
h_fabbri_gpu_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
h_lasko_lab_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
lss_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=8|||||||||||||||||
mip_eecs_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
csb_gpu_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=44|||||||||||||||||
p_neuert_lab_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
grissom_lab_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
johnston_group_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
photogrammetry_gpu_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
p_malin_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
dbmi_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
math_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
capra_lab_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
lola_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
h_vuni_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
aivas_lab_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
segrest_group_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
anderson_mri_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
accre_gpu_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=40|||||||||||||||||
csb_gpu_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=60|||||||||||||||||
psychology_gpu_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=8|||||||||||||||||
p_masi_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=16|||||||||||||||||
cgg_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=8|||||||||||||||||
beam_lab_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=8|||||||||||||||||
mccabe_gpu_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=8|||||||||||||||||
biostat_gpu_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
chbe285_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
cerl_gpu_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
ipoda_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
grissom_lab_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
king_lab_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
maha_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
smith_lab_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
rokaslab_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
cs4269_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
cs3891_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
rer_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
qlab_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
jswhep_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
cummings_gpu_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
cs839502_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
cms_gpu_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
lcgadmin_debug|1000000|00:00:00||cluster|OverPartQOS||1.000000||||||||||2-00:00:00||20|100|||||
h_vuiis_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=4|||||||||||||||||
h_biostat_student_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
neurogroup_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=2|||||||||||||||||
sc3260_maxwell_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=12|||||||||||||||||
sc3260_pascal_acc|0|00:00:00||cluster|OverPartQOS||1.000000|gres/gpu=12|||||||||||||||||

(In reply to Davide Vanzo from comment #63)
> Here is the output you requested.

Is it possible to run gdb or apply a test patch to your slurmctld?

Nate,

This is running in production, so it is always tricky to apply patches. Would the patch be limited to slurmctld or also slurmd?

Also let me know how you would like me to trace Slurm with GDB.

DV

Created attachment 9520 [details]
debug patch

(In reply to Davide Vanzo from comment #69)
> This is running in production, so it is always tricky to apply patches.

I have attached a patch that we can try if you get an outage in the near future. My attempts to replicate your issue have not worked yet.

> Would the patch be limited to slurmctld or also slurmd?

The memory issue is limited to slurmctld.
The thundering herd issue of job updates is likely both, but I'm limiting this to one issue at a time.

> Also let me know how you would like me to trace Slurm with GDB.

I would rather avoid that with a production system if we can. Can you please watch the logs and then run this command against any job that causes an under-allocated or over-allocated memory error, before the job is pushed to slurmdbd?
> scontrol show job $JOBID

(In reply to Davide Vanzo from comment #2)
> Created attachment 9438 [details]
> slurmctld log

Looking over the logs from the outage, it appears your cluster has the ideal use case for aggregated messaging. I have been able to partially recreate your issue by loading a pile of array jobs and then killing slurmctld. Slurmctld tried to terminate the jobs after recovering its state, which caused a huge flood of step completion RPCs.

Please try setting the following in your slurm.conf during your next outage:
> MsgAggregationParams=WindowMsgs=10,WindowTime=100
> TreeWidth=75

We may need to tweak these numbers depending on the number of steps running at any one time. Here is a presentation about Message Aggregation:
> https://slurm.schedmd.com/SLUG15/Message_Aggregation.pdf

Nate,

I will work on patching the slurmctld as soon as possible.

As for the message aggregation, from your message I infer that this should only be an "emergency" measure for when the controller cannot manage all the incoming RPCs. Is this correct, or is this something that can be left in the configuration independently of the state of the controller? What is the downside of this last option?

DV

(In reply to Davide Vanzo from comment #77)
> As for the message aggregation, from your message I infer that this should
> be only an "emergency" measure when the controller cannot manage all the
> coming RPC.

This is a long term configuration change for your cluster. All the Slurm daemons will need to be restarted. I managed to replicate your issue by submitting a large number of array jobs with a good number of steps, then restarting slurmctld. If you kill the jobs (or if your policy auto-kills them) while it is still starting up, then the issue will most likely trigger.

> What is the downside of this last option?

Slurm will use slurmd daemons to aggregate updates to slurmctld. If you constantly lose slurmd daemons (please open a ticket if you do), then this will cause delays in updates. There will be an increase in latency for the aggregated RPCs, but we should be able to tweak the settings to make that minimal. You may want to set up RoutePlugin if your management network is not flat. I strongly suggest testing this setup before deploying to production, but several other sites are already using it in production. It can also be turned off with a config change and a daemon restart if there are issues.

Nate,

I have turned on message aggregation and it does not seem to have improved the situation. To ensure the configuration is correctly propagated, after pushing the changes to slurm.conf I executed an scontrol shutdown, restarted all slurmd daemons and then restarted the slurmctld.

The first thing to notice is the continuous appearance of many of these messages in the slurmctld log:
> error: Munge decode failed: Replayed credential
> ENCODED: Mon Mar 18 12:07:52 2019
> DECODED: Mon Mar 18 12:07:53 2019
> error: authentication: Replayed credential

However munge is correctly configured and the controller can communicate fine with the slurmd daemons. No unusual messages are present in the slurmd logs.

The following messages are present with the same frequency as before:
> Warning: Note very large processing time from _slurm_rpc_dump_jobs: usec=6323180 began=12:18:09.865
> backfill: continuing to yield locks, 54 RPCs pending

Davide

The other problem is that since 10:48am no job has been started on the cluster by Slurm, according to the controller logs. At that same time we observed an increase in GPFS disk wait times whose cause is not yet clear to us. Although the correlation seems to be pretty high, our Slurm setup is completely decoupled from the parallel filesystem. The controllers do not mount it, the database is on a dedicated hard drive, and all the Slurm components are installed on the local drives with the OS. This was done to prevent parallel filesystem performance issues from affecting Slurm, but for some reason we still see this type of event correlation.

For completeness I have attached the latest output of sdiag.

Davide

Created attachment 9597 [details]
sdiag output - Comment #80

(In reply to Davide Vanzo from comment #80)
> The other problem is that since 10:48am no job has been started on the
> cluster by Slurm according to the controller logs.

Please turn off the message aggregation and attach slurmctld logs and at least one node's logs.

Ok, another update. I have reverted the changes and the scheduler still does not start any job. It accepts new job submissions and correctly keeps track of currently running jobs, but no new job is started on the cluster.
Another thing that came up in the log is the following set of messages, related to two nodes currently drained with hardware issues:
> error: Node cn402 has low real_memory size (64216 < 128714)
> error: _slurm_rpc_node_registration node=cn402: Invalid argument
> error: Node cn463 has low real_memory size (64216 < 128714)
> error: _slurm_rpc_node_registration node=cn463: Invalid argument
I have raised the importance of this ticket since it is paramount that we understand and fix why Slurm is not starting any new job.
Davide
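Regarding the low real_memory errors above, a quick hedged way to compare what the node actually reports with what slurm.conf declares for it; 'slurmd -C' must be run on the affected node itself, and cn402 is taken from the error message:
> slurmd -C                                      # hardware slurmd detects, including RealMemory
> scontrol show node cn402 | grep RealMemory     # value the controller expects
The error means the detected value (64216 MB) is below the configured one (128714 MB), which is consistent with failed memory hardware.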
Created attachment 9599 [details] slurmctld log - Comment #82 Created attachment 9600 [details] slurmd log - Comment #82 (In reply to Davide Vanzo from comment #81) > Created attachment 9597 [details] > sdiag output - Comment #80 > Jobs submitted: 672 > Jobs started: 0 > Jobs completed: 126 > Jobs canceled: 115 Are you explicitly cancelling these jobs? > Jobs failed: 0 (In reply to Davide Vanzo from comment #83) > Ok, another update. I have reverted the changes and the scheduler still does > not start any job. It accepts new job submissions and correctly keeps track > of currently running job but no new job is started on the cluster. Did you sync slurm.conf and restart all Slurm daemons? > Another thing that came out in the log is the following messages related to > two nodes currently drained with hardware issues: > > error: Node cn402 has low real_memory size (64216 < 128714) > > error: _slurm_rpc_node_registration node=cn402: Invalid argument > > error: Node cn463 has low real_memory size (64216 < 128714) > > error: _slurm_rpc_node_registration node=cn463: Invalid argument Can you please verify all the node memory is showing on those nodes with 'slurmd -C' and 'free -h'? Some memory DIMMs may have died during a reboot. > I have raised the importance of this ticket since it is paramount that we > understand and fix why Slurm is not starting any new job. Looking at your logs now. > Are you explicitly cancelling these jobs? Not me but users might be doing it. > Did you sync slurm.conf and restart all Slurm daemons? Yes. As before I went through a full slurm.conf sync, scontrol shutdown, slurmd restart, slurmctld restart. In this order. > Can you please verify all the node memory is showing on those nodes with > 'slurmd -C' and 'free -h'? Some memory DIMMs may have died during a reboot. Yes, those nodes have been set as down last week for memory issues. So why is Slurm showing those error messages only now? (In reply to Nate Rini from comment #86) > (In reply to Davide Vanzo from comment #81) > Looking at your logs now. >[2019-03-17T05:26:56.884] [7177337.batch] error: couldn't chdir to `/var/lib/condor-ce/spool/7100/0/cluster3037100.proc0.subproc0': No such file or directory: going to /tmp instead Is this a network FS? Can you please verify your filesystem the mounted? > [2019-03-18T12:03:35.136] [7174618.batch] error: *** JOB 7174618 ON cn1424 CANCELLED AT 2019-03-18T12:03:35 DUE TO TIME LIMIT *** Can you please verify how many nodes have time limit cancel message? This may explain all the cancelled jobs. > [2019-03-18T11:57:40.274] error: Munge decode failed: Replayed credential > [2019-03-18T11:57:40.274] ENCODED: Mon Mar 18 11:57:39 2019 > [2019-03-18T11:57:40.274] DECODED: Mon Mar 18 11:57:40 2019 I suspect the slurmstepd and slurmd are out of sync causing this. Can you please call 'remunge' on all of your nodes. Can you please provide the output of the following commands: > sinfo -Nel > sinfo --list-reasons > scontrol show jobs > scontrol show partitions > scontrol show config > sacctmgr show stat > sdiag #again please Is it possible to gcore your slurmctld and attach 'apply thread all bt full' and 'apply thread all bt'? (In reply to Davide Vanzo from comment #87) > > Are you explicitly cancelling these jobs? > Not me but users might be doing it. Might be jobs dieing from wall clock as noted above. Are you seeing any filesystem hangs? > > Did you sync slurm.conf and restart all Slurm daemons? > Yes. 
As before I went through a full slurm.conf sync, scontrol shutdown, > slurmd restart, slurmctld restart. In this order. That is correct. > > Can you please verify all the node memory is showing on those nodes with > > 'slurmd -C' and 'free -h'? Some memory DIMMs may have died during a reboot. > Yes, those nodes have been set as down last week for memory issues. So why > is Slurm showing those error messages only now? The slurmd daemon may have restarted on those nodes. Stopping the daemon should stop those errors while hardware repairs are outstanding. Can you please add this to your slurm.conf:
> DebugFlags=Backfill,SelectType
> SlurmctldDebug=debug4
Please let it run for at least 10 minutes, then upload the logs and revert the settings.
> [2019-03-18T03:45:57.245] error: Node ng1003 appears to have a different slurm.conf than the slurmctld. This could cause issues with communication and functionality. Please review both files and make sure they are the same. If this is expected ignore, and set DebugFlags=NO_CONF_HASH in your slurm.conf.
Please check that slurm.conf is synced on ng1003.
> [2019-03-18T13:12:35.563] Warning: Note very large processing time from _slurm_rpc_node_registration: usec=4738232 began=13:12:30.825
It took 80 minutes to register a node. Please see if you can get a gcore from slurmctld; I'm concerned the cluster has managed to find a deadlock or real lock starvation, since this registration should be very fast.
Can you also please restart slurmdbd and attach its logs.
> Is this a network FS? Can you please verify your filesystem is mounted?

Yes, that is an NFS mount.

> Can you please verify how many nodes have time limit cancel message? This
> may explain all the cancelled jobs.

I cannot parse through all the compute node logs, but according to the Slurm database there have been 132 timeouts on 106 nodes. I do not see anything suspicious in those jobs. Do you?

> I suspect the slurmstepd and slurmd are out of sync causing this. Can you
> please call 'remunge' on all of your nodes.

After I removed the message aggregation the error disappeared.

> sinfo -Nel
> > sinfo --list-reasons
> > scontrol show jobs
> > scontrol show partitions
> > scontrol show config
> > sacctmgr show stat
> > sdiag #again please

See attached.

> Is it possible to gcore your slurmctld and attach 'apply thread all bt full'
> and 'apply thread all bt'?

> Might be jobs dieing from wall clock as noted above. Are you seeing any
> filesystem hangs?

No, the filesystem seems ok.

Created attachment 9603 [details]
Output files - Comment #88

The slurmctld logs (from before) show that the jobs are not starting due to the nodes being busy, which shouldn't happen since the machine is using max wallclocks. I see a lot of running jobs; can you please provide another sdiag output?
> ryderkh ( 483404) count:17084 ave_time:1282640 total_time:21912629526
Can you please check if this user is doing something to query Slurm a lot? Maybe calling scontrol show jobs?
Created attachment 9605 [details]
GDB slurmctld
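For reference, a hedged sketch of one way such a backtrace dump can be produced from a running slurmctld; the binary path is an assumption, and the pid file path is the one shown later in this ticket:
> gcore -o /tmp/slurmctld.core $(cat /var/run/slurmctld.pid)
> gdb -batch -ex 'thread apply all bt full' -ex 'thread apply all bt' \
>     /usr/sbin/slurmctld /tmp/slurmctld.core.$(cat /var/run/slurmctld.pid) > slurmctld-bt.txt
gcore only pauses the daemon briefly, so the dump can usually be taken without a full outage.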
(In reply to Davide Vanzo from comment #96)
> Created attachment 9605 [details]
> GDB slurmctld

If possible, can you please grab another bt dump? All the threads are waiting for something to happen. I want to see if that changes or there is just a deadlock.

(In reply to Nate Rini from comment #99)
> (In reply to Davide Vanzo from comment #96)
> > Created attachment 9605 [details]
> > GDB slurmctld
>
> If possible, can you please grab another bt dump? All the threads are
> waiting for something to happen. I want to see if that changes or there is
> just a deadlock.

Can you also please provide this output:
> slurmctld -V

The line numbers in the backtrace are not matching the lines in the source.

Created attachment 9606 [details]
slurmdbd logs
Nate,

The problem is that keepalived triggered a failover to the secondary controller, and this caused a restart of both slurmctld and slurmdbd. This is why I attached two logs for slurmdbd, one per controller.

I will generate another dump, but this is clearly not the same execution instance as the previous one.

(In reply to Davide Vanzo from comment #101)
> Created attachment 9606 [details]
> slurmdbd logs

I suggest fixing the permissions on this file to avoid running a duplicate slurmdbd:
> [2019-03-18T15:46:37.190] Unable to remove pidfile '/var/run/slurmdbd.pid': Permission denied

(In reply to Davide Vanzo from comment #101)
> Created attachment 9606 [details]
> slurmdbd logs

Can we turn up the logging on slurmdbd? It is getting a new persistent connection about every second, when it should normally be once per slurmctld startup.
> DebugFlags=DB_QUERY
> DebugLevel=debug3

(In reply to Davide Vanzo from comment #102)
> I will generate another dump, but this is clearly not the same execution
> instance as the previous one.

That will work. It will tell us if the locks change formation.

Can you also provide this from mysql (or mariadb)?
> mysql> SHOW VARIABLES LIKE "max_connections";
> mysql> show status where `variable_name` = 'Threads_connected';
> mysql> show processlist;
> mysql> SHOW ENGINE INNODB STATUS;
The owner of the pid file is slurm and the permissions seem ok to me. Can you please confirm?
> # ll /var/run/slurm*
> -rw-r--r-- 1 slurm root 7 Mar 18 15:46 /var/run/slurmctld.pid
> -rw-r--r-- 1 slurm root 7 Mar 18 15:46 /var/run/slurmdbd.pid
(In reply to Davide Vanzo from comment #107)
> The owner of the pid file is slurm and the permissions seem ok to me. Can
> you please confirm?
>
> > # ll /var/run/slurm*
> > -rw-r--r-- 1 slurm root 7 Mar 18 15:46 /var/run/slurmctld.pid
> > -rw-r--r-- 1 slurm root 7 Mar 18 15:46 /var/run/slurmdbd.pid

Slurm is calling unlink(), which requires write permissions to /var/run/, which is probably not ideal. This can be ignored, or you can put the pids in a subdir writable by slurm.

(In reply to Nate Rini from comment #104)
> Can we turn up the logging on slurmdbd? It is getting a new persistent
> connection about every second, which it should be once per slurmctld startup
> normally.
> > DebugFlags=DB_QUERY
> > DebugLevel=debug3

We will also want in the slurm.conf:
DebugFlags += Protocol

Along with the slurmctld logs requested earlier. I would really like to know why it is connecting so much.

Have any jobs started? Can we get an sdiag output too please.

(In reply to Nate Rini from comment #95)
> The slurmctld logs (from before) show that the jobs are not starting due to
> the nodes being busy which is shouldn't happen since the machine is using
> max wallclocks. I see a lot of running jobs, can you please provide another
> sdiag output?

$ sdiag
*******************************************************
sdiag output at Mon Mar 18 17:01:08 2019 (1552946468)
Data since Mon Mar 18 16:57:20 2019 (1552946240)
*******************************************************
Server thread count: 23
Agent queue size: 1
Agent count: 2
DBD Agent queue size: 0
Jobs submitted: 342
Jobs started: 3
Jobs completed: 74
Jobs canceled: 5
Jobs failed: 0
Job states ts: Mon Mar 18 17:00:54 2019 (1552946454)
Jobs pending: 11722
Jobs running: 2558
Main schedule statistics (microseconds):
Last cycle: 1810115
Max cycle: 2156795
Total cycles: 83
Mean cycle: 1731276
Mean depth cycle: 32
Cycles per minute: 27
Last queue length: 3232
Backfilling stats (WARNING: data obtained in the middle of backfilling execution.)
Total backfilled jobs (since last slurm start): 0
Total backfilled jobs (since last stats cycle start): 0
Total backfilled heterogeneous job components: 0
Total cycles: 0
Last cycle when: Mon Mar 18 16:59:18 2019 (1552946358)
Last cycle: 0
Max cycle: 0
Last depth cycle: 11
Last depth cycle (try sched): 11
Last queue length: 3173
Latency for gettimeofday() (x1000): 47 microseconds
Remote Procedure Call statistics by message type
REQUEST_PARTITION_INFO ( 2009) count:1744 ave_time:185 total_time:323016
MESSAGE_NODE_REGISTRATION_STATUS ( 1002) count:1010 ave_time:987983 total_time:997863713
REQUEST_FED_INFO ( 2049) count:438 ave_time:230 total_time:100988
REQUEST_JOB_INFO_SINGLE ( 2021) count:310 ave_time:22559035 total_time:6993301150
REQUEST_JOB_INFO ( 2003) count:294 ave_time:15851692 total_time:4660397516
REQUEST_SUBMIT_BATCH_JOB ( 4003) count:103 ave_time:18513349 total_time:1906875028
REQUEST_NODE_INFO ( 2007) count:95 ave_time:300822 total_time:28578181
MESSAGE_EPILOG_COMPLETE ( 6012) count:81 ave_time:8596564 total_time:696321709
REQUEST_COMPLETE_BATCH_SCRIPT ( 5018) count:76 ave_time:4699468 total_time:357159582
REQUEST_STEP_COMPLETE ( 5016) count:73 ave_time:3037651 total_time:221748594
REQUEST_JOB_USER_INFO ( 2039) count:45 ave_time:9203369 total_time:414151635
REQUEST_KILL_JOB ( 5032) count:15 ave_time:7983448 total_time:119751723
REQUEST_PING ( 1008) count:8 ave_time:186 total_time:1491
REQUEST_COMPLETE_PROLOG ( 6018) count:3 ave_time:740550 total_time:2221650
REQUEST_STATS_INFO ( 2035) count:0 ave_time:0 total_time:0
Remote Procedure Call statistics by user
root ( 0) count:2756 ave_time:869957 total_time:2397601783
cmspilot ( 9201) count:812 ave_time:12851420 total_time:10435353709
ronymw ( 684864) count:112 ave_time:5575520 total_time:624458280
cmslocal ( 9202) count:93 ave_time:5336246 total_time:496270965
cartonf ( 762584) count:77 ave_time:6112492 total_time:470661922
mcarthe ( 515423) count:74 ave_time:4005532 total_time:296409387
wangj57 ( 674623) count:70 ave_time:2286312 total_time:160041846
smithmb3 ( 527620) count:40 ave_time:5042051 total_time:201682040
suarezm ( 715880) count:39 ave_time:3962171 total_time:154524690
reasatt ( 765549) count:39 ave_time:4034097 total_time:157329808
nandaa ( 780085) count:34 ave_time:4680525 total_time:159137864
ahmedha ( 793676) count:28 ave_time:5623322 total_time:157453037
steenwj ( 481000) count:23 ave_time:4010442 total_time:92240180
polikoh ( 176539) count:21 ave_time:4864118 total_time:102146479
xul13 ( 764176) count:17 ave_time:12743013 total_time:216631226
boydb1 ( 177470) count:17 ave_time:1027350 total_time:17464950
zhoud2 ( 743703) count:12 ave_time:3375693 total_time:40508321
santore ( 780819) count:10 ave_time:2555900 total_time:25559007
uscmslocal ( 9203) count:9 ave_time:12382747 total_time:111444724
larsonke ( 645142) count:7 ave_time:4434266 total_time:31039863
kochenma ( 154990) count:3 ave_time:16945146 total_time:50835440
vuiis_daily_singularity( 9210) count:1 ave_time:220 total_time:220
vuiis_archive_singularity( 9211) count:1 ave_time:235 total_time:235
vanzod ( 389801) count:0 ave_time:0 total_time:0
Pending RPC statistics
REQUEST_TERMINATE_JOB ( 6011) count:1
Pending RPCs
1: REQUEST_TERMINATE_JOB cn1278

> Can you please check if this user is doing something to query slurm alot?
> Maybe calling scontrol show jobs?

The user had 52 Perl scripts that were somehow querying Slurm. We killed all the running processes related to this script.
Created attachment 9607 [details]
GD - Comment #100

> If possible, can you please grab another bt dump? All the threads are
> waiting for something to happen. I want to see if that changes or there is
> just a deadlock.

See attached.

> Can you also please provide this output:
> > slurmctld -V
> The line numbers in the backtrace are not matching the lines in the source.

# slurmctld -V
slurm 18.08.4

Created attachment 9608 [details]
MariaDB output
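For reference, the kind of thread dump quoted above can be captured non-interactively with something like this (a sketch only, assuming gdb and debug symbols are installed on the controller host; the output path is arbitrary):

# gdb -p $(pidof slurmctld) -batch -ex 'thread apply all bt full' > /tmp/slurmctld-bt.$(date +%s).txt

Note that attaching briefly stops the daemon while the backtraces are collected.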
(In reply to Davide Vanzo from comment #110)
> > Can you please check if this user is doing something to query slurm a lot?
> > Maybe calling scontrol show jobs?
>
> The user had 52 Perl scripts that were somehow querying Slurm. We killed all
> the running processes related to this script.

Each one of those RPCs grabs a read-only lock, which can slow everything down depending on what is being queried.

> We will also want in the slurm.conf:
> DebugFlags += Protocol
> Along with the slurmctld logs requested earlier. I would really like to know
> why it is connecting so much.
Can you please confirm that those are the settings you are requesting?
DebugFlags=DB_QUERY,Protocol
DebugLevel=debug3
(In reply to Davide Vanzo from comment #116)
> > We will also want in the slurm.conf:
> > DebugFlags += Protocol
> > Along with the slurmctld logs requested earlier. I would really like to know
> > why it is connecting so much.
>
> Can you please confirm that those are the settings you are requesting?
>
> DebugFlags=DB_QUERY,Protocol
> DebugLevel=debug3

Yes for slurmdbd.conf

(In reply to Nate Rini from comment #117)
> > Can you please confirm that those are the settings you are requesting?
> >
> > DebugFlags=DB_QUERY,Protocol
> > DebugLevel=debug3
>
> Yes for slurmdbd.conf

Correction:

slurmdbd.conf:
> DebugFlags=DB_QUERY
> DebugLevel=debug3

slurm.conf:
Add Protocol to DebugFlags

(In reply to Nate Rini from comment #118)
> Slurm.conf:
> Add Protocol to DebugFlags

In case it is confusing, please set this in slurm.conf:
> DebugFlags=Backfill,Protocol
> SlurmctldDebug=debug3

(In reply to Davide Vanzo from comment #110)
> The user had 52 Perl scripts that were somehow querying Slurm. We killed all
> the running processes related to this script.

Can you please check this user too?
> cartonf ( 762584) count:351399 ave_time:786 total_time:276480878

Nate,
The user only had a couple of watch processes that were pinging squeue.
Attached you can find the slurmdbd and slurmctld logs with increased verbosity.

Created attachment 9609 [details]
Slurm logs - Comment #119
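To keep the two config files straight, the debug settings requested in the exchange above amount to the following (any DebugFlags already set at the site would be kept alongside these):

slurmdbd.conf:
DebugFlags=DB_QUERY
DebugLevel=debug3

slurm.conf:
DebugFlags=Backfill,Protocol
SlurmctldDebug=debug3

These are verbose settings intended only for the duration of the debugging session.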
Good morning. Today the issue of failed attempts at terminating jobs reappeared. We had to manually kill all the stuck slurmstepd processes to get the controller back into a more responsive state.

Created attachment 9617 [details]
slurmctld log - Comment #124

*******************************************************
sdiag output at Tue Mar 19 08:45:45 2019 (1553003145)
Data since Tue Mar 19 08:41:31 2019 (1553002891)
*******************************************************
Server thread count: 125
Agent queue size: 566
Agent count: 1
DBD Agent queue size: 0
Jobs submitted: 2582
Jobs started: 67
Jobs completed: 15
Jobs canceled: 1
Jobs failed: 0
Job states ts: Tue Mar 19 08:45:43 2019 (1553003143)
Jobs pending: 21029
Jobs running: 2631
Main schedule statistics (microseconds):
Last cycle: 285357
Max cycle: 2203201
Total cycles: 584
Mean cycle: 425961
Mean depth cycle: 94
Cycles per minute: 146
Last queue length: 5124
Backfilling stats (WARNING: data obtained in the middle of backfilling execution.)
Total backfilled jobs (since last slurm start): 0
Total backfilled jobs (since last stats cycle start): 0
Total backfilled heterogeneous job components: 0
Total cycles: 0
Last cycle when: Tue Mar 19 08:43:29 2019 (1553003009)
Last cycle: 0
Max cycle: 0
Last depth cycle: 6
Last depth cycle (try sched): 6
Last queue length: 5095
Latency for gettimeofday() (x1000): 62 microseconds
Remote Procedure Call statistics by message type
REQUEST_PARTITION_INFO ( 2009) count:2997 ave_time:174 total_time:524289
MESSAGE_NODE_REGISTRATION_STATUS ( 1002) count:747 ave_time:276729 total_time:206717228
REQUEST_STEP_COMPLETE ( 5016) count:709 ave_time:10940057 total_time:7756500748
MESSAGE_EPILOG_COMPLETE ( 6012) count:677 ave_time:10063120 total_time:6812732789
REQUEST_COMPLETE_BATCH_SCRIPT ( 5018) count:624 ave_time:44500618 total_time:27768385782
REQUEST_FED_INFO ( 2049) count:324 ave_time:289 total_time:93799
REQUEST_JOB_INFO_SINGLE ( 2021) count:217 ave_time:4119863 total_time:894010430
REQUEST_JOB_INFO ( 2003) count:147 ave_time:8322547 total_time:1223414458
REQUEST_SUBMIT_BATCH_JOB ( 4003) count:84 ave_time:12051053 total_time:1012288534
REQUEST_JOB_USER_INFO ( 2039) count:22 ave_time:4103390 total_time:90274601
REQUEST_COMPLETE_PROLOG ( 6018) count:16 ave_time:337145 total_time:5394334
REQUEST_NODE_INFO ( 2007) count:13 ave_time:656006 total_time:8528081
REQUEST_KILL_JOB ( 5032) count:1 ave_time:44049166 total_time:44049166
REQUEST_SET_DEBUG_LEVEL ( 1010) count:1 ave_time:40283901 total_time:40283901
REQUEST_STATS_INFO ( 2035) count:0 ave_time:0 total_time:0
Remote Procedure Call statistics by user
root ( 0) count:5550 ave_time:7691721 total_time:42689053217
vuiisccidev ( 5010) count:262 ave_time:1943637 total_time:509233090
vuiis_archive_singularity( 9211) count:255 ave_time:862750 total_time:220001402
cmspilot ( 9201) count:144 ave_time:10075294 total_time:1450842374
ronymw ( 684864) count:132 ave_time:2639717 total_time:348442722
wangj57 ( 674623) count:82 ave_time:1780659 total_time:146014048
reasatt ( 765549) count:44 ave_time:1519411 total_time:66854125
smithmb3 ( 527620) count:44 ave_time:2562072 total_time:112731195
cmslocal ( 9202) count:30 ave_time:6093371 total_time:182801157
ozgunka ( 501421) count:10 ave_time:5713203 total_time:57132033
autocms ( 9209) count:10 ave_time:1239882 total_time:12398827
boydb1 ( 177470) count:6 ave_time:2012292 total_time:12073753
schildjs ( 68906) count:5 ave_time:10915575 total_time:54577877
masapps ( 789320) count:3 ave_time:10341 total_time:31024
lcgadmin ( 9204) count:1 ave_time:450791 total_time:450791
fengl ( 672091) count:1 ave_time:560505 total_time:560505
Pending RPC statistics
REQUEST_TERMINATE_JOB ( 6011) count:452
REQUEST_LAUNCH_PROLOG ( 6017) count:55
REQUEST_BATCH_JOB_LAUNCH ( 4005) count:55
SRUN_STEP_MISSING ( 7007) count:3
REQUEST_NODE_REGISTRATION_STATUS ( 1001) count:1
Pending RPCs
1: REQUEST_TERMINATE_JOB cn349
2: REQUEST_TERMINATE_JOB cn349
3: REQUEST_TERMINATE_JOB cn349
4: REQUEST_TERMINATE_JOB cn349
5: REQUEST_TERMINATE_JOB cn352
6: REQUEST_TERMINATE_JOB cn352
7: REQUEST_TERMINATE_JOB cn352
8: REQUEST_TERMINATE_JOB cn352
9: REQUEST_TERMINATE_JOB cn390
10: REQUEST_TERMINATE_JOB cn390
11: REQUEST_TERMINATE_JOB cn390
12: REQUEST_TERMINATE_JOB cn1092
13: REQUEST_TERMINATE_JOB cn430
14: REQUEST_TERMINATE_JOB cn913
15: REQUEST_TERMINATE_JOB cn215
16: REQUEST_TERMINATE_JOB cn1088
17: REQUEST_TERMINATE_JOB cn1087
18: REQUEST_TERMINATE_JOB cn1088
19: REQUEST_TERMINATE_JOB cn1081
20: REQUEST_TERMINATE_JOB cn1081
21: REQUEST_TERMINATE_JOB cn463
22: REQUEST_TERMINATE_JOB cn1081
23: REQUEST_TERMINATE_JOB cn205
24: REQUEST_TERMINATE_JOB cn1083
25: REQUEST_TERMINATE_JOB cn1081
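Given the REQUEST_TERMINATE_JOB RPCs piling up for a handful of nodes above, a quick way to check those nodes for stuck step daemons is something along these lines (pdsh is assumed to be available; plain ssh in a loop works just as well, and the node list is taken from the pending RPCs):

$ pdsh -w cn349,cn352,cn390 'ps -eo pid,stat,etime,cmd | grep "[s]lurmstepd"'

A step daemon stuck in uninterruptible (D) state would explain why the terminate requests never complete.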
Nate,
Could the parameters below, which we changed last week, be at play here?
> batch_sched_delay=10
> sched_max_job_start=200
> max_rpc_cnt=400
(In reply to Davide Vanzo from comment #127)
> Nate,
>
> Could the parameters below, which we changed last week, be at play here?
>
> > batch_sched_delay=10
> > sched_max_job_start=200
> > max_rpc_cnt=400

Yes, let's lower this one:
> max_rpc_cnt=150

It should make the slurmctld more responsive to the job completion RPCs.

(In reply to Nate Rini from comment #128)
> Yes, let's lower this one:
> > max_rpc_cnt=150
>
> It should make the slurmctld more responsive to the job completion RPCs.

We can also consider adding 'defer', which stops the scheduler from running immediately on every job submission and should also help with lock contention. It may make interactive jobs start slightly slower, so I suggest testing this change to make sure the interactive start latency is acceptable.

(In reply to Nate Rini from comment #128)
> Yes, let's lower this one:
> > max_rpc_cnt=150
>
> It should make the slurmctld more responsive to the job completion RPCs.

My worry is that this would worsen the responsiveness of commands like squeue and sbatch. We are still experiencing socket timeout errors with max_rpc_cnt=400. Would you still recommend lowering it to 150?

(In reply to Davide Vanzo from comment #130)
> My worry is that this would worsen the responsiveness of commands like
> squeue and sbatch. We are still experiencing socket timeout errors with
> max_rpc_cnt=400. Would you still recommend lowering it to 150?

This setting causes the scheduler to defer running while there are more than 150 active RPCs being processed. It should not cause new RPCs to be rejected.

Which host is 10.0.11.141?

(In reply to Nate Rini from comment #132)
> Which host is 10.0.11.141?

It is ce4.vampire, one of the CMS compute elements in charge of interfacing our scheduler with theirs. It takes care of scheduling and monitoring jobs on the cluster. It is not unusual to see high traffic from these hosts since CMS is the biggest user of our cluster.

(In reply to Davide Vanzo from comment #125)
> Created attachment 9617 [details]
> slurmctld log - Comment #124

Please make sure to turn your log level back down if you haven't already; that log level will slow down Slurm considerably.

Oh yeah. I turned the verbosity up only for 10 minutes until I had to restart the controller. The debug level is now back to verbose.

After reducing max_rpc_cnt to 150 the responsiveness of the controller is good (no timeouts for now) and I can see jobs being started in the log. So everything seems pretty good for now. There are still a lot of "very large processing time" warnings in the log. Ideas?

(In reply to Davide Vanzo from comment #136)
> After reducing max_rpc_cnt to 150 the responsiveness of the controller is
> good (no timeouts for now) and I can see jobs being started in the log. So
> everything seems pretty good for now. There are still a lot of "very large
> processing time" warnings in the log. Ideas?

Yes, we will work on that too. I'm still going through your logs now. Did you attach the debug logs for slurmdbd? I don't see any debug messages from the one just attached.
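As an aside, the controller's verbosity can usually be toggled without the restart mentioned above, e.g. with the levels and flags discussed in this ticket:

$ scontrol setdebug debug3
$ scontrol setdebugflags +Protocol

and back down afterwards:

$ scontrol setdebug verbose
$ scontrol setdebugflags -Protocol

This appears to be what produced the REQUEST_SET_DEBUG_LEVEL entry in the sdiag output above.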
(In reply to Nate Rini from comment #137)
> (In reply to Davide Vanzo from comment #136)
> > After reducing max_rpc_cnt to 150 the responsiveness of the controller is
> > good (no timeouts for now) and I can see jobs being started in the log. So
> > everything seems pretty good for now. There are still a lot of "very large
> > processing time" warnings in the log. Ideas?
>
> Yes, we will work on that too. I'm still going through your logs now. Did
> you attach the debug logs for slurmdbd? I don't see any debug messages from
> the one just attached.

Can you also please attach your current slurm.conf and the slurmctld logs, and another sdiag.

> Yes, we will work on that too. I'm still going through your logs now. Did
> you attach the debug logs for slurmdbd? I don't see any debug messages from
> the one just attached.
The slurmdbd logs with increased verbosity are in the "Slurm logs - Comment #119" attachment. I will attach a more recent log with standard verbosity right now.
Created attachment 9621 [details]
conf + logs - Comment #139

(In reply to Davide Vanzo from comment #141)
> Created attachment 9621 [details]
> conf + logs - Comment #139

This is looking significantly better:
> Jobs submitted: 2702
> Jobs started: 1148
> Jobs failed: 0

> [2019-03-19T13:29:59.044] debug: REQUEST_PERSIST_INIT: CLUSTER:vampire VERSION:8448 UID:762584 IP:10.0.0.76 CONN:13
> [2019-03-19T13:30:00.167] debug: REQUEST_PERSIST_INIT: CLUSTER:vampire VERSION:8448 UID:762584 IP:10.0.0.76 CONN:11
> [2019-03-19T13:30:01.288] debug: REQUEST_PERSIST_INIT: CLUSTER:vampire VERSION:8448 UID:762584 IP:10.0.0.76 CONN:13

Which host is 10.0.0.76? This host is opening a new connection to slurmdbd almost every second. Is there a user running sacctmgr or sacct on it in a loop?

Can we add/modify the following to SchedulerParameters in slurm.conf:
> defer
> bf_max_job_test=4000 -> bf_max_job_test=2000
> partition_job_depth=750

We may also need to increase bf_resolution. From the logs, it looks like it is just trying to compute too many jobs at once and the locks are starving.

> Which host is 10.0.0.76? This host is opening a new connection to slurmdbd
> almost every second. Is there a user running sacctmgr or sacct on it in a
> loop?

That is one of our cluster gateways. We do not have anything of ours querying sacct from there, so it is definitely some user process. Is there any way to pinpoint the source?

> Can we add/modify the following to SchedulerParameters in slurm.conf:
> > defer
> > bf_max_job_test=4000 -> bf_max_job_test=2000
> > partition_job_depth=750
>
> We may also need to increase bf_resolution. From the logs, it looks like it
> is just trying to compute too many jobs at once and the locks are starving.

I will go ahead and modify those parameters. What value do you think is reasonable for bf_resolution?

(In reply to Davide Vanzo from comment #143)
> That is one of our cluster gateways. We do not have anything of ours
> querying sacct from there, so it is definitely some user process. Is there
> any way to pinpoint the source?

Should be this user (UID:762584):
> getent passwd 762584

> I will go ahead and modify those parameters. What value do you think is
> reasonable for bf_resolution?

Looks like 600 is the max suggested per #6681. We can instead try to increase bf_max_time to 150.

Please attach slurmctld logs and sdiag output after the changes have been applied.

> Should be this user (UID:762584):

He has a couple of watch processes on that gateway that call sacct and squeue once every 10 seconds. Other than that he does not have much else that queries the database. I would be very surprised if this is causing Slurm to overload.

> Looks like 600 is the max suggested per #6681. We can instead try to
> increase bf_max_time to 150.
>
> Please attach slurmctld logs and sdiag output after the changes have been
> applied.

I sure will.

(In reply to Davide Vanzo from comment #145)
> He has a couple of watch processes on that gateway that call sacct and
> squeue once every 10 seconds.
> Other than that he does not have much else that queries the database. I
> would be very surprised if this is causing Slurm to overload.

I suggest decreasing the call frequency, but this should not be dragging Slurm to a halt either. Since your system is having issues, I want to make sure everything is checked.

Created attachment 9624 [details]
Comment #144 requested logs

(In reply to Davide Vanzo from comment #147)
> Created attachment 9624 [details]
> Comment #144 requested logs

> [2019-03-19T15:47:26.480] Warning: Note very large processing time from _slurmctld_background: usec=5989246 began=15:47:20.491
> [2019-03-19T15:47:26.488] Warning: Note very large processing time from _slurm_rpc_dump_job_user: usec=5841023 began=15:47:20.647
> [2019-03-19T15:47:26.488] Warning: Note very large processing time from _slurm_rpc_dump_job_user: usec=5871075 began=15:47:20.617

usec=5871075 => 5.8s

> [2019-03-19T15:46:46.899] Warning: Note very large processing time from _slurm_rpc_dump_nodes: usec=3465309 began=15:46:43.433
> [2019-03-19T15:46:46.944] Warning: Note very large processing time from _slurm_rpc_dump_jobs: usec=3229292 began=15:46:43.715

usec=3229292 => 3.2s

We can try setting these in SchedulerParameters:
> bf_yield_sleep=1500000
> bf_yield_interval=550000

Full explanation is in Bug #6688 comment 6. We may need to tweak it more. Please attach logs after it has been set.

> [2019-03-19T15:16:32.632] error: gres/gpu: job 7187275 dealloc node gpu0020 topo gres count underflow (1 2)
> [2019-03-19T15:16:32.632] error: gres/gpu: job 7187277 dealloc node gpu0016 topo gres count underflow (1 2)
> [2019-03-19T15:16:32.634] error: gres/gpu: job 7187275 dealloc node gpu0020 topo gres count underflow (1 2)
> [2019-03-19T15:16:32.634] error: gres/gpu: job 7187277 dealloc node gpu0016 topo gres count underflow (1 2)

Can you please open another ticket on this?

> Jobs started: 415
> Total backfilled jobs (since last slurm start): 349

Looks like backfill is starting the majority of jobs. Can you please provide another set of sdiag outputs to see if that is continuing.

(In reply to Davide Vanzo from comment #136)
> After reducing max_rpc_cnt to 150 the responsiveness of the controller is
> good (no timeouts for now) and I can see jobs being started in the log. So
> everything seems pretty good for now.

Can we lower this ticket to SEV3?

Created attachment 9637 [details]
Comment #149 requested logs

(In reply to Davide Vanzo from comment #150)
> Created attachment 9637 [details]
> Comment #149 requested logs

Looks like all the "very large processing time" errors are gone. The REQUEST_PARTITION_INFO RPC is back down to a sane 153ms.

Looks like the backfiller is still starting the majority of the jobs, but that could just be due to how busy your system is and the long wall clocks.

Is this issue resolved for you?

> Looks like all the "very large processing time" errors are gone. The
> REQUEST_PARTITION_INFO RPC is back down to a sane 153ms.

Agree. It looks like we got it.

> Looks like the backfiller is still starting the majority of the jobs, but
> that could just be due to how busy your system is and the long wall clocks.
>
> Is this issue resolved for you?

The cluster resource utilization is at a good 88%. Do you mind keeping it open for a few more days while I keep an eye on it? I will reduce the ticket importance.
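For anyone tuning a similar system, the SchedulerParameters pieces discussed in this ticket add up to roughly the following line. This is only a sketch: the site's other scheduler options are omitted, and the values were arrived at iteratively and may need further tweaking elsewhere.

SchedulerParameters=defer,batch_sched_delay=10,sched_max_job_start=200,max_rpc_cnt=150,bf_max_job_test=2000,partition_job_depth=750,bf_max_time=150,bf_yield_sleep=1500000,bf_yield_interval=550000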
One more question.

I started seeing a bunch of jobs pending with reason "Resources"; some of them have been pending for a couple of days now even though resources are definitely available. When I look into the slurmctld logs I see messages like:

> backfill: Failed to start JobId=7199845_* with cn[201-223,225-238,301-305,307-312,314-327,329-331,333,335,339-340,347-351,353-370,372-376,378,380-387,390-391,393-396,398-417,419-425,427-432,435-460,463-465,468,470-486,489-500,903-905,911-913,1081-1096,1121-1135,1201-1218,1220-1230,1232-1242,1257-1260,1263-1266,1268-1276,1278,1280,1282,1284-1290,1293-1300,1302,1304-1321,1323-1344,1346-1355,1357-1358,1360-1361,1364-1427,1430-1450,1452-1467,1469-1470,1472-1479,1481-1485,1490,1492-1494,1499,1501,1505-1506,1508-1509,1511,1517-1520,1523,1527-1528] avail: Requested nodes are busy

We have a couple of reservations scheduled on the cluster. Is it possible that Slurm is trying to assign the job a node inside a reservation and does not fall back to a node outside the reservation? Here are the reservations currently scheduled:

> ReservationName=sc3260 StartTime=2019-04-27T09:00:00 EndTime=2019-05-07T09:00:00 Duration=10-00:00:00
> Nodes=gpu[0032-0034] NodeCnt=3 CoreCnt=24 Features=(null) PartitionName=pascal Flags=SPEC_NODES
> TRES=cpu=24
> Users=(null) Accounts=sc3260_acc Licenses=(null) State=INACTIVE BurstBuffer=(null) Watts=n/a
> ReservationName=sbcs_res StartTime=2019-03-18T08:00:00 EndTime=2019-04-04T08:00:00 Duration=17-00:00:00
> Nodes=cn[1281,1283,1301,1322,1362-1363,1486-1489,1491,1495-1498,1502-1504,1514-1516,1521-1522,1524,1529] NodeCnt=25 CoreCnt=400 Features=(null) PartitionName=(null) Flags=SPEC_NODES
> TRES=cpu=400
> Users=(null) Accounts=sbcs Licenses=(null) State=ACTIVE BurstBuffer=(null) Watts=n/a
> ReservationName=oanh StartTime=2019-03-22T09:00:00 EndTime=2019-04-05T09:00:00 Duration=14-00:00:00
> Nodes=cn[313,328,332,334,336-338,352,377,379,388,392,418,433,461-462,466-467,469,487-488,1101,1219,1262,1356,1468,1471,1500,1507,1526] NodeCnt=30 CoreCnt=404 Features=(null) PartitionName=production Flags=SPEC_NODES
> TRES=cpu=404
> Users=vuot2 Accounts=(null) Licenses=(null) State=INACTIVE BurstBuffer=(null) Watts=n/a

And here are the pending jobs:
> 7199845_[1-94]
> 7200244_[0-7]
> 7200246_[1-7]
> 7198374_65
> 7198374_89
> 7240637_[107-437]
> 7240861_[1-409]
> 7242047
> 7242234_[424-1250]
> 7242297_[37-50]
> 7242298_[1-41]
> 7242633
> 7243470
> 7243870
> 7244590_[13-18,20]
> 7244610

I will attach a fresh slurmctld log.

Created attachment 9639 [details]
slurmctld log - Comment #153

(In reply to Davide Vanzo from comment #153)
> > Looks like all the "very large processing time" errors are gone. The
> > REQUEST_PARTITION_INFO RPC is back down to a sane 153ms.
>
> Agree. It looks like we got it.
>
> > Looks like the backfiller is still starting the majority of the jobs, but
> > that could just be due to how busy your system is and the long wall clocks.
> >
> > Is this issue resolved for you?
>
> The cluster resource utilization is at a good 88%. Do you mind keeping it
> open for a few more days while I keep an eye on it? I will reduce the ticket
> importance.

Sure, please note that responding to a closed ticket will reopen it.

> One more question. I started seeing a bunch of jobs pending with reason
> "Resources"; some of them have been pending for a couple of days now even
> though resources are definitely available. When I look into the slurmctld
> logs I see messages like:

Can you please open another two tickets for these issues?
This ticket is already hard enough for anyone else to follow.

Agree. Go ahead and close it. I will reopen it if I see Slurm not working properly.

Thank you for all the help!

Davide

(In reply to Davide Vanzo from comment #156)
> Agree. Go ahead and close it. I will reopen it if I see Slurm not working
> properly.
>
> Thank you for all the help!

Happy to help. Closing ticket per your response.

--Nate
Created attachment 9436 [details]
Slurm config

Hi guys,

We are experiencing a very strange issue. The Slurm controller is non-responsive, causing send/recv operation timeout failures. After looking into the log we noticed that it reports about 1500 unique job steps as invalid after attempting to send a TERMINATE_JOB request that apparently never gets satisfied. Looking at the slurmd log on the corresponding compute nodes, we observed that the daemon sends the SIGKILL signals requested by the controller, but the corresponding slurmstepd processes remain alive. This is what I assume is causing the RPC issue.

How would you suggest we proceed? Happy to provide any information you may need.

Davide
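For completeness, a process that survives SIGKILL is almost always blocked in uninterruptible sleep, so a first check on an affected compute node is something like the following (a sketch only; the exact columns available depend on the procps version):

$ ps -eo pid,stat,wchan:32,cmd | awk '$2 ~ /D/ && /slurmstepd/'

The wchan column can hint at what the step daemon is stuck waiting on in the kernel, which on clusters like this often points at a hung network filesystem rather than at Slurm itself.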