View | Details | Raw Unified | Return to ticket 7928 | Differences between
and this patch

Collapse All | Expand All

(-)a/src/common/slurm_protocol_api.c (-9 / +20 lines)
Lines 4946-4952 List slurm_send_recv_msgs(const char *nodelist, slurm_msg_t *msg, Link Here
4946
List slurm_send_addr_recv_msgs(slurm_msg_t *msg, char *name, int timeout)
4946
List slurm_send_addr_recv_msgs(slurm_msg_t *msg, char *name, int timeout)
4947
{
4947
{
4948
	static pthread_mutex_t conn_lock = PTHREAD_MUTEX_INITIALIZER;
4948
	static pthread_mutex_t conn_lock = PTHREAD_MUTEX_INITIALIZER;
4949
	static uint16_t conn_timeout = NO_VAL16;
4949
	static uint16_t conn_timeout = NO_VAL16, tcp_timeout = 2;
4950
	char addrbuf[32];
4950
	List ret_list = NULL;
4951
	List ret_list = NULL;
4951
	int fd = -1;
4952
	int fd = -1;
4952
	ret_data_info_t *ret_data_info = NULL;
4953
	ret_data_info_t *ret_data_info = NULL;
Lines 4954-4975 List slurm_send_addr_recv_msgs(slurm_msg_t *msg, char *name, int timeout) Link Here
4954
	int i;
4955
	int i;
4955
4956
4956
	slurm_mutex_lock(&conn_lock);
4957
	slurm_mutex_lock(&conn_lock);
4957
	if (conn_timeout == NO_VAL16)
4958
	if (conn_timeout == NO_VAL16) {
4958
		conn_timeout = MIN(slurm_get_msg_timeout(), 10);
4959
		conn_timeout = MIN(slurm_get_msg_timeout(), 10);
4960
		tcp_timeout = MAX(0, slurm_get_tcp_timeout() - 1);
4961
	}
4959
	slurm_mutex_unlock(&conn_lock);
4962
	slurm_mutex_unlock(&conn_lock);
4963
	slurm_print_slurm_addr(&msg->address, addrbuf, sizeof(addrbuf));
4960
4964
4961
	/* This connect retry logic permits Slurm hierarchical communications
4965
	/* This connect retry logic permits Slurm hierarchical communications
4962
	 * to better survive slurmd restarts */
4966
	 * to better survive slurmd restarts */
4963
	for (i = 0; i <= conn_timeout; i++) {
4967
	for (i = 0; i <= conn_timeout; i++) {
4964
		if (i)
4965
			sleep(1);
4966
		fd = slurm_open_msg_conn(&msg->address);
4968
		fd = slurm_open_msg_conn(&msg->address);
4967
		if ((fd >= 0) || (errno != ECONNREFUSED))
4969
		if ((fd >= 0) || (errno != ECONNREFUSED && errno != ETIMEDOUT))
4968
			break;
4970
			break;
4969
		if (i == 0)
4971
4970
			debug3("connect refused, retrying");
4972
		if (errno == ETIMEDOUT) {
4973
			if (i == 0)
4974
				verbose("Timed out connecting to %s, retrying...",
4975
					addrbuf);
4976
			i += tcp_timeout;
4977
		} else {
4978
			if (i == 0)
4979
				verbose("Connection refused by %s, retrying...",
4980
				addrbuf);
4981
			sleep(1);
4982
		}
4971
	}
4983
	}
4972
	if (fd < 0) {
4984
	if (fd < 0) {
4985
		info("Failed to connect to %s, %m", addrbuf);
4973
		mark_as_failed_forward(&ret_list, name,
4986
		mark_as_failed_forward(&ret_list, name,
4974
				       SLURM_COMMUNICATIONS_CONNECTION_ERROR);
4987
				       SLURM_COMMUNICATIONS_CONNECTION_ERROR);
4975
		errno = SLURM_COMMUNICATIONS_CONNECTION_ERROR;
4988
		errno = SLURM_COMMUNICATIONS_CONNECTION_ERROR;
4976
- 
4977
2*MessageTimeout
4989
2*MessageTimeout
4978
--
4979
src/slurmd/slurmd/req.c | 13 ++++++++-----
4990
src/slurmd/slurmd/req.c | 13 ++++++++-----
4980
1 file changed, 8 insertions(+), 5 deletions(-)
4991
1 file changed, 8 insertions(+), 5 deletions(-)
(-)a/src/slurmd/slurmd/req.c (-7 / +8 lines)
Lines 2416-2429 _rpc_batch_job(slurm_msg_t *msg, bool new_msg) Link Here
2416
			/*
2416
			/*
2417
			 * This race should only happen for at most a second as
2417
			 * This race should only happen for at most a second as
2418
			 * we are only waiting for the other rpc to get here.
2418
			 * we are only waiting for the other rpc to get here.
2419
			 * If we are waiting for more than 50 tries something bad
2419
			 * We should wait here for msg_timeout * 2: if the
2421
			 * happened.
2420
			 * REQUEST_LAUNCH_PROLOG is lost in the forwarding tree,
2421
			 * the direct retry from slurmctld will happen after
2422
			 * MessageTimeout.
2421
			 */
2423
			 */
2422
			if (retry_cnt > 50) {
2424
			if (retry_cnt > slurmctld_conf.msg_timeout * 2) {
2423
				rc = ESLURMD_PROLOG_FAILED;
2425
				rc = ESLURMD_PROLOG_FAILED;
2424
				slurm_mutex_unlock(&prolog_mutex);
2426
				slurm_mutex_unlock(&prolog_mutex);
2425
				error("Waiting for JobId=%u prolog has failed, giving up after 50 sec",
2427
				error("Waiting for JobId=%u REQUEST_LAUNCH_PROLOG notification failed, giving up after %"PRIu16" sec",
2426
				      req->job_id);
2428
				      req->job_id,
2429
				      slurmctld_conf.msg_timeout * 2);
2427
				goto done;
2430
				goto done;
2428
			}
2431
			}
2429
2432
2430
- 
2431
--
2432
src/common/slurm_protocol_api.c | 2 +-
2433
src/common/slurm_protocol_api.c | 2 +-
2433
1 file changed, 1 insertion(+), 1 deletion(-)
2434
1 file changed, 1 insertion(+), 1 deletion(-)
(-)a/src/common/slurm_protocol_api.c (-3 / +1 lines)
Lines 4956-4962 List slurm_send_addr_recv_msgs(slurm_msg_t *msg, char *name, int timeout) Link Here
4956
4956
4957
	slurm_mutex_lock(&conn_lock);
4957
	slurm_mutex_lock(&conn_lock);
4958
	if (conn_timeout == NO_VAL16) {
4958
	if (conn_timeout == NO_VAL16) {
4959
		conn_timeout = MIN(slurm_get_msg_timeout(), 10);
4959
		conn_timeout = MAX(5, slurm_get_msg_timeout() / 2);
4960
		tcp_timeout = MAX(0, slurm_get_tcp_timeout() - 1);
4960
		tcp_timeout = MAX(0, slurm_get_tcp_timeout() - 1);
4961
	}
4961
	}
4962
	slurm_mutex_unlock(&conn_lock);
4962
	slurm_mutex_unlock(&conn_lock);
4963
- 
4964
connections for >5s
4963
connections for >5s
4965
--
4966
src/slurmd/slurmd/slurmd.c | 9 ++++++++-
4964
src/slurmd/slurmd/slurmd.c | 9 ++++++++-
4967
1 file changed, 8 insertions(+), 1 deletion(-)
4965
1 file changed, 8 insertions(+), 1 deletion(-)
(-)a/src/slurmd/slurmd/slurmd.c (-2 / +8 lines)
Lines 454-468 _msg_engine(void) Link Here
454
	while (!_shutdown) {
454
	while (!_shutdown) {
455
		if (_reconfig) {
455
		if (_reconfig) {
456
			int rpc_wait = MAX(5, slurm_get_msg_timeout() / 2);
456
			int rpc_wait = MAX(5, slurm_get_msg_timeout() / 2);
457
			DEF_TIMERS;
458
			START_TIMER;
457
			verbose("got reconfigure request");
459
			verbose("got reconfigure request");
458
			/* Wait for RPCs to finish */
460
			/* Wait for RPCs to finish */
459
			_wait_for_all_threads(rpc_wait);
461
			_wait_for_all_threads(rpc_wait);
460
			if (_shutdown)
462
			if (_shutdown)
461
				break;
463
				break;
462
			_reconfigure();
464
			_reconfigure();
465
			END_TIMER3("_reconfigure request",5000000);
463
		}
466
		}
464
		if (_update_log)
467
		if (_update_log) {
468
			DEF_TIMERS;
469
			START_TIMER;
465
			_update_logging();
470
			_update_logging();
471
			END_TIMER3("_uplodate_log request",5000000);
472
		}
466
		cli = xmalloc (sizeof (slurm_addr_t));
473
		cli = xmalloc (sizeof (slurm_addr_t));
467
		if ((sock = slurm_accept_msg_conn(conf->lfd, cli)) >= 0) {
474
		if ((sock = slurm_accept_msg_conn(conf->lfd, cli)) >= 0) {
468
			_handle_connection(sock, cli);
475
			_handle_connection(sock, cli);
469
- 

Return to ticket 7928