Ticket 7696 - assert in bit_nclear()
Summary: assert in bit_nclear()
Status: RESOLVED DUPLICATE of ticket 6976
Alias: None
Product: Slurm
Classification: Unclassified
Component: slurmd (show other tickets)
Version: 18.08.3
Hardware: Linux Linux
Severity: 4 - Minor Issue
Assignee: Director of Support
QA Contact:
URL:
Depends on:
Blocks:
 
Reported: 2019-09-05 11:52 MDT by Ben Matthews
Modified: 2019-09-05 12:02 MDT (History)
0 users

See Also:
Site: UCAR
Slinky Site: ---
Alineos Sites: ---
Atos/Eviden Sites: ---
Confidential Site: ---
Coreweave sites: ---
Cray Sites: ---
DS9 clusters: ---
Google sites: ---
HPCnow Sites: ---
HPE Sites: ---
IBM Sites: ---
NOAA Site: ---
NoveTech Sites: ---
Nvidia HWinf-CS Sites: ---
OCF Sites: ---
Recursion Pharma Sites: ---
SFW Sites: ---
SNIC sites: ---
Tzag Elita Sites: ---
Linux Distro: ---
Machine Name:
CLE Version:
Version Fixed:
Target Release: ---
DevPrio: ---
Emory-Cloud Sites: ---


Attachments

Note You need to log in before you can comment on or make changes to this ticket.
Description Ben Matthews 2019-09-05 11:52:39 MDT
It seems like this may already be fixed, but I'll let you guys be the judge:

[root@casper21 /]# gdb /usr/local/sbin/slurmd core-casper21-0-6368-6
GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-100.el7_4.1
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /usr/local/sbin/slurmd...done.
[New LWP 92584]
[New LWP 6368]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `/usr/local/sbin/slurmd -f /etc/slurm/slurm.conf -DD'.
Program terminated with signal 6, Aborted.
#0  0x00002b3acf1f61f7 in raise () from /lib64/libc.so.6
Missing separate debuginfos, use: debuginfo-install audit-libs-2.7.6-3.el7.x86_64 glibc-2.17-196.el7_4.2.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.15.1-8.el7.x86_64 libcap-ng-0.7.5-4.el7.x86_64 libcom_err-1.42.9-10.el7.x86_64 libgcc-4.8.5-16.el7_4.2.x86_64 libselinux-2.5-11.el7.x86_64 numactl-libs-2.0.9-6.el7_2.x86_64 openssl-libs-1.0.2k-8.el7.x86_64 pam-1.1.8-18.el7.x86_64 pcre-8.32-17.el7.x86_64 sssd-client-1.15.2-50.el7_4.11.x86_64 zlib-1.2.7-17.el7.x86_64
(gdb) set pagination off
(gdb) thread apply all bt full

Thread 2 (Thread 0x2b3acd7552c0 (LWP 6368)):
#0  0x00002b3acefb398d in accept () from /lib64/libpthread.so.0
No symbol table info available.
#1  0x00002b3acde55255 in slurm_accept_msg_conn (fd=<optimized out>, addr=<optimized out>) at slurm_protocol_socket_implementation.c:452
        len = 16
#2  0x000000000040f7d6 in _msg_engine () at slurmd.c:440
        cli = 0x25a4990
        sock = <optimized out>
#3  main (argc=4, argv=0x7ffeacf2ff58) at slurmd.c:376
        i = <optimized out>
        pidfd = 5
        blocked_signals = {13, 0}
        cc = <optimized out>
        oom_value = <optimized out>
        slurmd_uid = <optimized out>
        curr_uid = <optimized out>
        time_stamp = "Mon, 12 Aug 2019 14:58:44 -0600\000\377\377\377\377\377\377\377\377", '\000' <repeats 16 times>, "l\277<\253\377\037\000\000\337/1\316:+\000\000\000\000\000\000\000\000\000\000\200\376\362\254\376\177\000\000\320%U\002\000\000\000\000\260\375\362\254\376\177\000\000\065&1\316:+\000\000\003\000\000\000:+\000\000\360\001\000\000\000\000\000\000\340\001\000\000\000\000\000\000R\027$\317:+\000\000`)U\002\000\000\000\000\000\001", '\000' <repeats 14 times>, "P)U\002\000\000\000\000`\327W\317:+\000\000\260\006\002", '\000' <repeats 13 times>...
        __func__ = "main"

Thread 1 (Thread 0x2b3ad3f53700 (LWP 92584)):
#0  0x00002b3acf1f61f7 in raise () from /lib64/libc.so.6
No symbol table info available.
#1  0x00002b3acf1f78e8 in abort () from /lib64/libc.so.6
No symbol table info available.
#2  0x00002b3acf1ef266 in __assert_fail_base () from /lib64/libc.so.6
No symbol table info available.
#3  0x00002b3acf1ef312 in __assert_fail () from /lib64/libc.so.6
No symbol table info available.
#4  0x00002b3acddb0651 in bit_nclear (b=b@entry=0x2b3b640180e0, start=start@entry=0, stop=stop@entry=-1) at bitstring.c:292
        __PRETTY_FUNCTION__ = "bit_nclear"
#5  0x00002b3acddb2b4b in bit_unfmt_hexmask (bitmap=0x2b3b640180e0, str=<optimized out>) at bitstring.c:1395
        bit_index = 0
        len = <optimized out>
        rc = 0
        curpos = 0x2b3b640185b1 "x"
        current = <optimized out>
        bitsize = 0
#6  0x00002b3acddc9c87 in gres_plugin_job_state_unpack (gres_list=gres_list@entry=0x2b3b64000db0, buffer=buffer@entry=0x2b3b640028d0, job_id=3342749, protocol_version=protocol_version@entry=8448) at gres.c:4269
        tmp_str = 0x2b3b640185b0 "0x"
        _size = 0
        _tmp_uint32 = 3
        i = 0
        rc = 0
        magic = 1133130964
        plugin_id = 7696487
        utmp32 = 1
        rec_cnt = 0
        has_more = 1 '\001'
        gres_ptr = <optimized out>
        gres_job_ptr = 0x2b3b640147d0
        __func__ = "gres_plugin_job_state_unpack"
#7  0x00002b3acde10942 in slurm_cred_unpack (buffer=buffer@entry=0x2b3b640028d0, protocol_version=protocol_version@entry=8448) at slurm_cred.c:1309
        cred_uid = 24712
        cred_gid = 1000
        u32_ngids = 0
        len = 0
        cred = 0x2b3b64000d30
        bit_fmt_str = 0x0
        sigp = <optimized out>
        tot_core_cnt = 11066
        __func__ = "slurm_cred_unpack"
#8  0x00002b3acde3bc9b in _unpack_prolog_launch_msg (msg=msg@entry=0x2b3b64017ca8, buffer=0x2b3b640028d0, protocol_version=<optimized out>) at slurm_protocol_pack.c:11538
        uint32_tmp = 0
        launch_msg_ptr = 0x2b3b640011e0
        __func__ = "_unpack_prolog_launch_msg"
#9  0x00002b3acde5209b in unpack_msg (msg=msg@entry=0x2b3b64017c70, buffer=buffer@entry=0x2b3b640028d0) at slurm_protocol_pack.c:1979
        rc = 0
#10 0x00002b3acde1c4ee in slurm_receive_msg_and_forward (fd=7, orig_addr=<optimized out>, msg=msg@entry=0x2b3b64017c70, timeout=<optimized out>, timeout@entry=0) at slurm_protocol_api.c:3806
        buf = 0x2b3b640064e0 "!"
        buflen = 1031
        header = {version = 8448, flags = 0, msg_index = 0, msg_type = 6017, body_length = 842, ret_cnt = 0, forward = {cnt = 0, init = 65534, nodelist = 0x0, timeout = 0, tree_width = 0}, orig_addr = {sin_family = 2, sin_port = 24297, sin_addr = {s_addr = 3451221376}, sin_zero = "\000\000\000\000\000\000\000"}, ret_list = 0x0}
        rc = 0
        auth_cred = 0x2b3b640027a0
        buffer = 0x2b3b640028d0
        __func__ = "slurm_receive_msg_and_forward"
#11 0x000000000040c90d in _service_connection (arg=<optimized out>) at slurmd.c:535
        con = 0x256f150
        msg = 0x2b3b64017c70
        __func__ = "_service_connection"
        rc = 0
#12 0x00002b3aceface25 in start_thread () from /lib64/libpthread.so.0
No symbol table info available.
#13 0x00002b3acf2b934d in clone () from /lib64/libc.so.6
No symbol table info available.
(gdb)

Let me know if you need logs/any other diagnostics
Comment 1 Jason Booth 2019-09-05 12:02:15 MDT
Hi Ben - This looks like a duplicate of ticket 6976. The patch for this will land in 20.02. A workaround was put in place; it is mentioned in ticket 6976, which points back to ticket 6739, where the issue was originally reported.

 For now, I am marking this as a duplicate.

*** This ticket has been marked as a duplicate of ticket 6976 ***