Curious if you can prevent slurmctld from core dumping when it has problems connecting to db2. It happened only once so far and restarting the slurmctld went fine: (gdb) where #0 0x0000040000174d60 in .raise () from /lib64/libc.so.6 #1 0x0000040000176cf4 in .abort () from /lib64/libc.so.6 #2 0x0000040000ecf030 in ._ZN9__gnu_cxx27__verbose_terminate_handlerEv () from /usr/lib64/libstdc++.so.6 #3 0x0000040000ecbf54 in ?? () from /usr/lib64/libstdc++.so.6 #4 0x0000040000ecbfa0 in ._ZSt9terminatev () from /usr/lib64/libstdc++.so.6 #5 0x0000040000ecc17c in .__cxa_throw () from /usr/lib64/libstdc++.so.6 #6 0x00000400006a1380 in bgsched::Block::getUsers (blockName=<value optimized out>) at Block.cc:2132 #7 0x00000400004055e0 in bridge_block_sync_users (bg_record=0x40120040540) at bridge_linker.cc:1098 #8 0x00000400003d872c in _start_agent (bg_action_ptr=0x40080036a80) at bg_job_run.c:560 #9 0x00000400003d890c in _block_agent (args=0x40080036a80) at bg_job_run.c:593 #10 0x00000400000dc5dc in .start_thread () from /lib64/libpthread.so.0 #11 0x000004000023a8ec in .__clone () from /lib64/libc.so.6 bt full does its best but enters an endless loop halfway up... (gdb) bt full #0 0x0000040000174d60 in .raise () from /lib64/libc.so.6 No symbol table info available. #1 0x0000040000176cf4 in .abort () from /lib64/libc.so.6 No symbol table info available. #2 0x0000040000ecf030 in ._ZN9__gnu_cxx27__verbose_terminate_handlerEv () from /usr/lib64/libstdc++.so.6 No symbol table info available. #3 0x0000040000ecbf54 in ?? () from /usr/lib64/libstdc++.so.6 No symbol table info available. #4 0x0000040000ecbfa0 in ._ZSt9terminatev () from /usr/lib64/libstdc++.so.6 No symbol table info available. #5 0x0000040000ecc17c in .__cxa_throw () from /usr/lib64/libstdc++.so.6 No symbol table info available. 
#6 0x00000400006a1380 in bgsched::Block::getUsers (blockName=<value optimized out>) at Block.cc:2132 te_oss_ = <incomplete type> te_msg_ = "Communication error occurred while attempting to connect to database." dbblock = {<BGQDB::DBObj> = {_vptr.DBObj = 0x40001a22f60, _columns = 0, _ind = {0 <repeats 64 times>}}, static BLOCKID_COL = "blockid", static BLOCKID_COL_TYPE = BGQDB::DBObj::ColumnType::Char, static BLOCKID_SIZE = 32, _blockid = "\000\000\000\000\020+\241X\000\000\000\000\241\177\332`\000\000\004\000\241\177\332`\000\000\000\000\000\000\000\000", static NUMCNODES_COL = "numcnodes", static NUMCNODES_COL_TYPE = BGQDB::DBObj::ColumnType::Integer, _numcnodes = 0, static NUMIONODES_COL = "numionodes", static NUMIONODES_COL_TYPE = BGQDB::DBObj::ColumnType::Integer, _numionodes = 0, static OWNER_COL = "owner", static OWNER_COL_TYPE = BGQDB::DBObj::ColumnType::Char, static OWNER_SIZE = 32, _owner = "\000\000\000J\000\000\004\000\320\002j \000\000\004\000\241\177\333p\000\000\000\000\000\000\000\004\000\000\004\000", static USERNAME_COL = "username", static USERNAME_COL_TYPE = BGQDB::DBObj::ColumnType::Char, static USERNAME_SIZE = 32, _username = "\033\233\354\000\000\000\000\000\000\000\000\000\000\004\000\241\177\325\350\000\000\004\000\241\177\326\060\000\000\004\000\241\177", static ISTORUS_COL = "istorus", static ISTORUS_COL_TYPE = BGQDB::DBObj::ColumnType::Char, static ISTORUS_SIZE = 5, _istorus = "\333p\000\000\000", static SIZEA_COL = "sizea", static SIZEA_COL_TYPE = BGQDB::DBObj::ColumnType::Integer, _sizea = 0, static SIZEB_COL = "sizeb", static SIZEB_COL_TYPE = BGQDB::DBObj::ColumnType::Integer, _sizeb = 0, static SIZEC_COL = "sizec", static SIZEC_COL_TYPE = BGQDB::DBObj::ColumnType::Integer, _sizec = 0, static SIZED_COL = "sized", static SIZED_COL_TYPE = BGQDB::DBObj::ColumnType::Integer, _sized = 0, static SIZEE_COL = "sizee", static SIZEE_COL_TYPE = BGQDB::DBObj::ColumnType::Integer, _sizee = 0, static DESCRIPTION_COL = "description", static 
DESCRIPTION_COL_TYPE = BGQDB::DBObj::ColumnType::Varchar, static DESCRIPTION_SIZE = 1024, _description = "\000\000\004\000\241\177\326\000\000\000\004\000\241\177\325\360\000\000\004\000\241\177\326@\000\000\004\000\241\177\371@\000\000\004\000\030\003nP\000\000\000\000\020*G\262\000\000\000\000\000\000\000J\000\000\000\000\000\000\000\001\000\000\004\000\320\002j \000\000\004\000\241\177\333p\000\000\004\000\241\177\342\320\000\000\000\000\000\000\000\000\000\000\004\000\000\030\272\b\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000\a\000\000\004\000\000/2\300\000\000\004\000\200\003j\200\000\000\004\000\000D88\000\000\004\000\241\177\371\020\000\000\000\000\000\020\000\000\000\000\004\000\000\020[\260\000\000\000\000\000\001\000\000\000\000\000\000\000\000\000`\000\000\004\000\030\a\006@\000\000\004\000\241\177\343\300\000\000\000\000\000\000\000`\000\000\004\000\030\a\f\240\000\000\004\000\030\a\f\200\000\000\004\000\241"..., static OPTIONS_COL = "options", static OPTIONS_COL_TYPE = BGQDB::DBObj::ColumnType::Char, static OPTIONS_SIZE = 16, _options = "\000\000\000\020\060\066\020\000\000\004\000\241\177\337\060\000", static STATUS_COL = "status", static STATUS_COL_TYPE = BGQDB::DBObj::ColumnType::Char, static STATUS_SIZE = 1, _status = "F", static ACTION_COL = "action", static ACTION_COL_TYPE = BGQDB::DBObj::ColumnType::Char, static ACTION_SIZE = 1, _action = "\000\177", static STATUSLASTMODIFIED_COL = "statuslastmodified", static STATUSLASTMODIFIED_COL_TYPE = BGQDB::DBObj::ColumnType::Timestamp, _statuslastmodified = "\337\320\000\000\004\000\241\177\362\300\000\000\004\000\241\177\340p\000\000\004\000\000\020\002\b", static MLOADERIMG_COL = "mloaderimg", static MLOADERIMG_COL_TYPE = BGQDB::DBObj::ColumnType::Varchar, static MLOADERIMG_SIZE = 256, _mloaderimg = 
"\000\000\000\020\060\066\020\000\000\000\000\020\061\271p\000\000\004\000\241\177\337\260\000\000\000\000\020*<\310\000\000\001/\000\036\261\274\000\000\000\000\020*<\330\000\000\004\000\241\177\337\320\000\000\000\000\000\001\000\000\000\000\004\000\241\177\371\020\000\000\004\000\241\177\362\300\000\000\000\000\000=\017\000\000\000\004\000\030\006\225\222\000\000\000\000\020\022\263\374\000\000\000\000\000\000\000\000\000\000\004\000\241p\000\000\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000\a\000\000\004\001pO\334\340\000\000\004\000\200\003j\200\000\000\004\000\000D88\000\000\004\000\241\177\371\020\000\000\000\000\000\020\000\000\000\000\004\000\000\020[\260\000\000\000\000\000\001\000\000\000\000\004\000\241\177\371\020\000\000\004\000\030\000\000 \000\000\000\000\000\000\000\020\000\000\004\001 \003\344\000\000\000\004\000\000\362\244\330\000\000\000\000\000\000\000"..., static NODECONFIG_COL = "nodeconfig", static NODECONFIG_COL_TYPE = BGQDB::DBObj::ColumnType::Char, static NODECONFIG_SIZE = 32, _nodeconfig = "\004\000\000\034JL\000\000\004\000\000/\th\000\000\004\000\241\177\340\320\000\000\004\000\241\177\341 \000\000\004", static BOOTOPTIONS_COL = "bootoptions", static BOOTOPTIONS_COL_TYPE = BGQDB::DBObj::ColumnType::Varchar, static BOOTOPTIONS_SIZE = 256, _bootoptions = "\000\241\177\341 $\000$(\377\377\217\200", '\000' <repeats 15 times>, "\001\000\000\000\000\000\t\367\331\000\000\004\000\241\177\371 \000\000\004\000\241\177\341\020", '\000' <repeats 18 times>, "\004\000\241\177\362\300\000\000\000\000\000\000\000\020\000\000\004\001 \003\344\000\000\000\004\000\000\362\244\330\000\000\000\000\000\000\000)\000\000\004\000\241\177\341p\377\377\377\377\377\377\377\377\000\000\004\000\000\354\311P\000\000\004\000\241\177\362\300\000\000\000\000\000\000\000\000\000\000\004\000\000\363LH\000\000\000\000\000\000\000K", '\000' <repeats 15 times>, 
"\f\000\000\004\000\241\177\341\060\000\000\004\000\241\177\341\260\000\000\000\022\241\177\333\060\000\000\004\000\241\177\334 \000\000\004\000\241\177\334\020\000\000\004\000\241\177\341\340\000\000\004\001 \003\344\000\000\000\004\000\000\362\230\300\000\000\000\000\000\000\000\020\000\000\004", static CREATEDATE_COL = "createdate", static CREATEDATE_COL_TYPE = BGQDB::DBObj::ColumnType::Timestamp, _createdate = "\241\177\341\360\000\000\004\000\241\177\334<\000\000\004\000\000\351Y\324\000\000\004\000\241\177\371", static SECURITYKEY_COL = "securitykey", static SECURITYKEY_COL_TYPE = BGQDB::DBObj::ColumnType::Char, static SECURITYKEY_SIZE = 32, _securitykey = "\020\000\000\000\000\000\000\000\000\000\000\004\000\000\363LH\000\000\004\000\241\177\334\060\000\000\004\000\241\177\341", static ERRTEXT_COL = "errtext", static ERRTEXT_COL_TYPE = BGQDB::DBObj::ColumnType::Varchar, static ERRTEXT_SIZE = 256, _errtext = "\260\000\000\004\000\241\177\342\060\000\000\004\000\241\177\341\300\000\000\004\000\000@\331\\\000\000\004\000\n(\000\000\000\000\000\000\000\000\000K\000\000\004\000\000D\362\360\000\000\004\000\030\a\a8\000\000\004\001 \003\344\000\000\000\004\000\241\177\342\220\000\000\004\000\000D88\000\000\004\000\000\351lx\000\000\004\000\000/2\300\000\000\004\000\000\020[\260\000\000\004\000\000\363LH\000\000\000\000\000\000\t\177\000\000\004\000\241\177\342\060\000\000\004\000\241\177\342\260\000\000\000e\000\000\000\000\000\000\004\000\000@\250\334", '\000' <repeats 12 times>, "\020\022\263\374\000\000\004\000\000D\362\360\000\000\004\000\241\177\343\310\000\000\004\000\241\177\362\300\000\000\000\000\000=\017\000\000\000\004\001 \003\344\000\000\000\004\000\000D\252 \000\000\004\000\241\177\344\330\000\000\004\000\241\177\343\060\000\000\000\000\000\020\000\000\000\000"..., static SEQID_COL = "seqid", static SEQID_COL_TYPE = BGQDB::DBObj::ColumnType::Bigint, _seqid = 0, static CREATIONID_COL = "creationid", static CREATIONID_COL_TYPE = 
BGQDB::DBObj::ColumnType::Integer, _creationid = 1024, static QUALIFIER_COL = "qualifier", static QUALIFIER_COL_TYPE = BGQDB::DBObj::ColumnType::Char, static QUALIFIER_SIZE = 32, _qualifier = "\000\363LH\000\000\004\000\000@\200\b\000\000\004\000\000\020\002\b\000\000\000\000\020\060\071\060\000\000\004\000"} sqlrc = <value optimized out> __PRETTY_FUNCTION__ = "static std::vector<std::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::basic_string<char, std::char_traits<char>, std::allocator<char> > > > bgsched::Block::get"... tx = {<boost::noncopyable_::noncopyable> = {<No data fields>}, _dbcon = {px = 0x0, pn = {pi_ = 0x0}}, _hstmt = 0x0, _autocommit = true, _do_rollback = false} __FUNCTION__ = "getUsers" users = std::vector of length -549723221063, capacity 336933265 = { , , , , , Traceback (most recent call last): File "/usr/lib64/../share/gdb/python/libstdcxx/v6/printers.py", line 558, in to_string return self.val['_M_dataplus']['_M_p'].lazy_string (length = len) RuntimeError: Cannot access memory at address 0xffffffffffffffe8 , Traceback (most recent call last): File "/usr/lib64/../share/gdb/python/libstdcxx/v6/printers.py", line 558, in to_string The slurmctld's last words were: [2015-11-19T11:20:34.730] error: getJobs: Can't access to the database! [2015-11-19T11:20:53.796] error: getJobs: Can't access to the database! [2015-11-19T11:21:12.889] error: getJobs: Can't access to the database! [2015-11-19T11:21:31.956] error: getJobs: Can't access to the database! [2015-11-19T11:21:51.031] error: getJobs: Can't access to the database!
Created attachment 2441 [details] Patch to handle database throw for get users. Looks like you may be in luck, Don :). The attached patch should fix this for 14.11. It is committed in 15.08 at 2ded13c6a9c919. I also fixed the wording of the error printed in the log to use better English ;), commit 0cf8906abb6. It will probably be hard to test this, but it should fix the issue.
Please reopen if you find this doesn't handle the situation.
Thanks! We'll give it a try.