View | Details | Raw Unified | Return to ticket 17401
Collapse All | Expand All

(-)a/src/slurmctld/job_scheduler.c (-3 / +3 lines)
Lines 1281-1292 static int _schedule(bool full_queue) Link Here
1281
	}
1281
	}
1282
	slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
1282
	slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
1283
1283
1284
	lock_slurmctld(job_write_lock);
1285
1284
	if (!fed_mgr_sibs_synced()) {
1286
	if (!fed_mgr_sibs_synced()) {
1287
		unlock_slurmctld(job_write_lock);
1285
		sched_info("schedule() returning, federation siblings not synced yet");
1288
		sched_info("schedule() returning, federation siblings not synced yet");
1286
		goto out;
1289
		goto out;
1287
	}
1290
	}
1288
1291
1289
	lock_slurmctld(job_write_lock);
1290
	now = time(NULL);
1292
	now = time(NULL);
1291
	sched_start = now;
1293
	sched_start = now;
1292
	last_job_sched_start = now;
1294
	last_job_sched_start = now;
1293
- 
1294
--
1295
src/slurmctld/fed_mgr.c | 2 ++
1295
src/slurmctld/fed_mgr.c | 2 ++
1296
1 file changed, 2 insertions(+)
1296
1 file changed, 2 insertions(+)
(-)a/src/slurmctld/fed_mgr.c (-2 / +2 lines)
Lines 6132-6137 extern bool fed_mgr_sibs_synced() Link Here
6132
	slurmdb_cluster_rec_t *sib;
6132
	slurmdb_cluster_rec_t *sib;
6133
	int dummy = 1;
6133
	int dummy = 1;
6134
6134
6135
	xassert(verify_lock(FED_LOCK, READ_LOCK));
6136
6135
	if (!fed_mgr_fed_rec)
6137
	if (!fed_mgr_fed_rec)
6136
		return true;
6138
		return true;
6137
6139
6138
- 
6139
--
6140
src/common/slurm_persist_conn.c | 5 ++++-
6140
src/common/slurm_persist_conn.c | 5 ++++-
6141
src/common/slurm_persist_conn.h | 1 +
6141
src/common/slurm_persist_conn.h | 1 +
6142
2 files changed, 5 insertions(+), 1 deletion(-)
6142
2 files changed, 5 insertions(+), 1 deletion(-)
(-)a/src/common/slurm_persist_conn.c (-1 / +4 lines)
Lines 540-545 extern int slurm_persist_conn_open_without_init( Link Here
540
		_close_fd(&persist_conn->fd);
540
		_close_fd(&persist_conn->fd);
541
	else
541
	else
542
		persist_conn->fd = -1;
542
		persist_conn->fd = -1;
543
	persist_conn->open_time = 0;
543
544
544
	if (!persist_conn->inited)
545
	if (!persist_conn->inited)
545
		persist_conn->inited = true;
546
		persist_conn->inited = true;
Lines 657-663 extern int slurm_persist_conn_open(slurm_persist_conn_t *persist_conn) Link Here
657
				      persist_conn->rem_port);
658
				      persist_conn->rem_port);
658
			}
659
			}
659
			_close_fd(&persist_conn->fd);
660
			_close_fd(&persist_conn->fd);
660
		}
661
		} else
662
			persist_conn->open_time = time(NULL);
661
	}
663
	}
662
664
663
end_it:
665
end_it:
Lines 673-678 extern void slurm_persist_conn_close(slurm_persist_conn_t *persist_conn) Link Here
673
		return;
675
		return;
674
676
675
	_close_fd(&persist_conn->fd);
677
	_close_fd(&persist_conn->fd);
678
	persist_conn->open_time = 0;
676
}
679
}
677
680
678
extern int slurm_persist_conn_reopen(slurm_persist_conn_t *persist_conn,
681
extern int slurm_persist_conn_reopen(slurm_persist_conn_t *persist_conn,
(-)a/src/common/slurm_persist_conn.h (-2 / +1 lines)
Lines 82-87 typedef struct { Link Here
82
	char *rem_host;
82
	char *rem_host;
83
	uint16_t rem_port;
83
	uint16_t rem_port;
84
	time_t *shutdown;
84
	time_t *shutdown;
85
	time_t open_time; /* Time the connection was opened. Do not pack. */
85
	pthread_t thread_id;
86
	pthread_t thread_id;
86
	int timeout;
87
	int timeout;
87
	slurm_trigger_callbacks_t trigger_callbacks;
88
	slurm_trigger_callbacks_t trigger_callbacks;
88
- 
89
--
90
src/slurmctld/fed_mgr.c | 4 ++--
89
src/slurmctld/fed_mgr.c | 4 ++--
91
1 file changed, 2 insertions(+), 2 deletions(-)
90
1 file changed, 2 insertions(+), 2 deletions(-)
(-)a/src/slurmctld/fed_mgr.c (-4 / +2 lines)
Lines 6139-6146 extern bool fed_mgr_sibs_synced() Link Here
6139
6139
6140
	if ((sib = list_find_first(fed_mgr_fed_rec->cluster_list,
6140
	if ((sib = list_find_first(fed_mgr_fed_rec->cluster_list,
6141
				   _list_find_not_synced_sib, &dummy))) {
6141
				   _list_find_not_synced_sib, &dummy))) {
6142
		debug("%s: sibling %s up but not synced yet",
6142
		info("%s: sibling %s up but not synced yet",
6143
		      __func__, sib->name);
6143
		     __func__, sib->name);
6144
		return false;
6144
		return false;
6145
	}
6145
	}
6146
6146
6147
- 
6148
for a while
6147
for a while
6149
--
6150
src/slurmctld/fed_mgr.c | 15 +++++++++++++--
6148
src/slurmctld/fed_mgr.c | 15 +++++++++++++--
6151
1 file changed, 13 insertions(+), 2 deletions(-)
6149
1 file changed, 13 insertions(+), 2 deletions(-)
(-)a/src/slurmctld/fed_mgr.c (-3 / +13 lines)
Lines 63-68 Link Here
63
#define FED_MGR_STATE_FILE       "fed_mgr_state"
63
#define FED_MGR_STATE_FILE       "fed_mgr_state"
64
#define FED_MGR_CLUSTER_ID_BEGIN 26
64
#define FED_MGR_CLUSTER_ID_BEGIN 26
65
#define TEST_REMOTE_DEP_FREQ 30 /* seconds */
65
#define TEST_REMOTE_DEP_FREQ 30 /* seconds */
66
#define FED_MGR_WAIT_SIB_SYNC 60 /* seconds */
66
67
67
#define FED_SIBLING_BIT(x) ((uint64_t)1 << (x - 1))
68
#define FED_SIBLING_BIT(x) ((uint64_t)1 << (x - 1))
68
69
Lines 6139-6146 extern bool fed_mgr_sibs_synced() Link Here
6139
6140
6140
	if ((sib = list_find_first(fed_mgr_fed_rec->cluster_list,
6141
	if ((sib = list_find_first(fed_mgr_fed_rec->cluster_list,
6141
				   _list_find_not_synced_sib, &dummy))) {
6142
				   _list_find_not_synced_sib, &dummy))) {
6142
		info("%s: sibling %s up but not synced yet",
6143
		char *close_str = "";
6143
		     __func__, sib->name);
6144
		time_t now = time(NULL);
6145
		slurm_persist_conn_t *conn = sib->fed.send;
6146
6147
		xassert(conn);
6148
6149
		if ((now - conn->open_time) >= FED_MGR_WAIT_SIB_SYNC) {
6150
			close_str = ", closed connection due to timeout.";
6151
			_close_controller_conn(sib);
6152
		}
6153
		info("%s: sibling %s up but not synced yet%s",
6154
		     __func__, sib->name, close_str);
6144
		return false;
6155
		return false;
6145
	}
6156
	}
6146
6157
6147
- 

Return to ticket 17401