View | Details | Raw Unified | Return to ticket 7928 | Differences between [selected patch] and this patch

Collapse All | Expand All

(-)a/src/common/slurm_protocol_api.c (+12 lines)
Lines 4952-4963 List slurm_send_addr_recv_msgs(slurm_msg_t *msg, char *name, int timeout) Link Here
4952
	ret_data_info_t *ret_data_info = NULL;
4952
	ret_data_info_t *ret_data_info = NULL;
4953
	ListIterator itr;
4953
	ListIterator itr;
4954
	int i;
4954
	int i;
4955
	char addr_str[32];
4955
4956
4956
	slurm_mutex_lock(&conn_lock);
4957
	slurm_mutex_lock(&conn_lock);
4957
	if (conn_timeout == NO_VAL16)
4958
	if (conn_timeout == NO_VAL16)
4958
		conn_timeout = MIN(slurm_get_msg_timeout(), 10);
4959
		conn_timeout = MIN(slurm_get_msg_timeout(), 10);
4959
	slurm_mutex_unlock(&conn_lock);
4960
	slurm_mutex_unlock(&conn_lock);
4960
4961
4962
	if (msg->msg_type == REQUEST_LAUNCH_PROLOG) {
4963
		slurm_print_slurm_addr(&msg->address, addr_str,
4964
				       sizeof(addr_str));
4965
		debug("BUG7928 %s: pid %d going to connect to %s to send %s",
4966
		      __func__, getpid(), addr_str,
4967
		      rpc_num2string(msg->msg_type));
4968
	}
4961
	/* This connect retry logic permits Slurm hierarchical communications
4969
	/* This connect retry logic permits Slurm hierarchical communications
4962
	 * to better survive slurmd restarts */
4970
	 * to better survive slurmd restarts */
4963
	for (i = 0; i <= conn_timeout; i++) {
4971
	for (i = 0; i <= conn_timeout; i++) {
Lines 4969-4974 List slurm_send_addr_recv_msgs(slurm_msg_t *msg, char *name, int timeout) Link Here
4969
		if (i == 0)
4977
		if (i == 0)
4970
			debug3("connect refused, retrying");
4978
			debug3("connect refused, retrying");
4971
	}
4979
	}
4980
	if (msg->msg_type == REQUEST_LAUNCH_PROLOG)
4981
		debug("BUG7928 %s:retries slurm_open_msg_conn=%d, conn_timeout=%d, fd=%d, errno: %s",
4982
		      __func__, i, conn_timeout, fd, slurm_strerror(errno));
4983
4972
	if (fd < 0) {
4984
	if (fd < 0) {
4973
		mark_as_failed_forward(&ret_list, name,
4985
		mark_as_failed_forward(&ret_list, name,
4974
				       SLURM_COMMUNICATIONS_CONNECTION_ERROR);
4986
				       SLURM_COMMUNICATIONS_CONNECTION_ERROR);

Return to ticket 7928