|
Lines 4946-4952
List slurm_send_recv_msgs(const char *nodelist, slurm_msg_t *msg,
Link Here
|
| 4946 |
List slurm_send_addr_recv_msgs(slurm_msg_t *msg, char *name, int timeout) |
4946 |
List slurm_send_addr_recv_msgs(slurm_msg_t *msg, char *name, int timeout) |
| 4947 |
{ |
4947 |
{ |
| 4948 |
static pthread_mutex_t conn_lock = PTHREAD_MUTEX_INITIALIZER; |
4948 |
static pthread_mutex_t conn_lock = PTHREAD_MUTEX_INITIALIZER; |
| 4949 |
static uint16_t conn_timeout = NO_VAL16; |
4949 |
static uint16_t conn_timeout = NO_VAL16, tcp_timeout = 2; |
|
|
4950 |
char addrbuf[32]; |
| 4950 |
List ret_list = NULL; |
4951 |
List ret_list = NULL; |
| 4951 |
int fd = -1; |
4952 |
int fd = -1; |
| 4952 |
ret_data_info_t *ret_data_info = NULL; |
4953 |
ret_data_info_t *ret_data_info = NULL; |
|
Lines 4954-4975
List slurm_send_addr_recv_msgs(slurm_msg_t *msg, char *name, int timeout)
Link Here
|
| 4954 |
int i; |
4955 |
int i; |
| 4955 |
|
4956 |
|
| 4956 |
slurm_mutex_lock(&conn_lock); |
4957 |
slurm_mutex_lock(&conn_lock); |
| 4957 |
if (conn_timeout == NO_VAL16) |
4958 |
if (conn_timeout == NO_VAL16) { |
| 4958 |
conn_timeout = MIN(slurm_get_msg_timeout(), 10); |
4959 |
conn_timeout = MIN(slurm_get_msg_timeout(), 10); |
|
|
4960 |
tcp_timeout = MAX(0, slurm_get_tcp_timeout() - 1); |
| 4961 |
} |
| 4959 |
slurm_mutex_unlock(&conn_lock); |
4962 |
slurm_mutex_unlock(&conn_lock); |
|
|
4963 |
slurm_print_slurm_addr(&msg->address, addrbuf, sizeof(addrbuf)); |
| 4960 |
|
4964 |
|
| 4961 |
/* This connect retry logic permits Slurm hierarchical communications |
4965 |
/* This connect retry logic permits Slurm hierarchical communications |
| 4962 |
* to better survive slurmd restarts */ |
4966 |
* to better survive slurmd restarts */ |
| 4963 |
for (i = 0; i <= conn_timeout; i++) { |
4967 |
for (i = 0; i <= conn_timeout; i++) { |
| 4964 |
if (i) |
|
|
| 4965 |
sleep(1); |
| 4966 |
fd = slurm_open_msg_conn(&msg->address); |
4968 |
fd = slurm_open_msg_conn(&msg->address); |
| 4967 |
if ((fd >= 0) || (errno != ECONNREFUSED)) |
4969 |
if ((fd >= 0) || (errno != ECONNREFUSED && errno != ETIMEDOUT)) |
| 4968 |
break; |
4970 |
break; |
| 4969 |
if (i == 0) |
4971 |
|
| 4970 |
debug3("connect refused, retrying"); |
4972 |
if (errno == ETIMEDOUT) { |
|
|
4973 |
if (i == 0) |
| 4974 |
verbose("Timed out connecting to %s, retrying...", |
| 4975 |
addrbuf); |
| 4976 |
i += tcp_timeout; |
| 4977 |
} else { |
| 4978 |
if (i == 0) |
| 4979 |
verbose("Connection refused by %s, retrying...", |
| 4980 |
addrbuf); |
| 4981 |
sleep(1); |
| 4982 |
} |
| 4971 |
} |
4983 |
} |
| 4972 |
if (fd < 0) { |
4984 |
if (fd < 0) { |
|
|
4985 |
info("Failed to connect to %s, %m", addrbuf); |
| 4973 |
mark_as_failed_forward(&ret_list, name, |
4986 |
mark_as_failed_forward(&ret_list, name, |
| 4974 |
SLURM_COMMUNICATIONS_CONNECTION_ERROR); |
4987 |
SLURM_COMMUNICATIONS_CONNECTION_ERROR); |
| 4975 |
errno = SLURM_COMMUNICATIONS_CONNECTION_ERROR; |
4988 |
errno = SLURM_COMMUNICATIONS_CONNECTION_ERROR; |
| 4976 |
- |
|
|
| 4977 |
2*MessageTimeout |
4989 |
2*MessageTimeout |
| 4978 |
-- |
|
|
| 4979 |
src/slurmd/slurmd/req.c | 13 ++++++++----- |
4990 |
src/slurmd/slurmd/req.c | 13 ++++++++----- |
| 4980 |
1 file changed, 8 insertions(+), 5 deletions(-) |
4991 |
1 file changed, 8 insertions(+), 5 deletions(-) |