View | Details | Raw Unified | Return to ticket 3318 | Differences between
and this patch

Collapse All | Expand All

(-)a/configure.ac (+1 lines)
Lines 419-424 AC_CONFIG_FILES([Makefile Link Here
419
		 contribs/Makefile
419
		 contribs/Makefile
420
		 contribs/cray/Makefile
420
		 contribs/cray/Makefile
421
		 contribs/cray/csm/Makefile
421
		 contribs/cray/csm/Makefile
422
		 contribs/cray/slurmsmwd/Makefile
422
		 contribs/lua/Makefile
423
		 contribs/lua/Makefile
423
		 contribs/mic/Makefile
424
		 contribs/mic/Makefile
424
		 contribs/pam/Makefile
425
		 contribs/pam/Makefile
(-)a/contribs/cray/Makefile.am (-2 / +3 lines)
Lines 2-10 Link Here
2
# Makefile for cray tools
2
# Makefile for cray tools
3
#
3
#
4
SUBDIRS = csm
4
SUBDIRS = csm slurmsmwd
5
AUTOMAKE_OPTIONS = foreign
5
AUTOMAKE_OPTIONS = foreign
6
sbin_PROGRAMS =
6
if HAVE_NATIVE_CRAY
7
if HAVE_NATIVE_CRAY
7
sbin_SCRIPTS = slurmconfgen.py
8
sbin_SCRIPTS = slurmconfgen.py
Lines 18-24 AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/src/common $(JSON_CPPFLAGS) Link Here
18
if WITH_JSON_PARSER
19
if WITH_JSON_PARSER
19
convenience_libs = $(top_builddir)/src/api/libslurm.o $(DL_LIBS)
20
convenience_libs = $(top_builddir)/src/api/libslurm.o $(DL_LIBS)
20
sbin_PROGRAMS = capmc_suspend capmc_resume
21
sbin_PROGRAMS += capmc_suspend capmc_resume
21
capmc_suspend_SOURCES  = capmc_suspend.c
22
capmc_suspend_SOURCES  = capmc_suspend.c
22
capmc_suspend_LDADD    = $(convenience_libs)
23
capmc_suspend_LDADD    = $(convenience_libs)
23
capmc_suspend_LDFLAGS  = -export-dynamic $(JSON_LDFLAGS)
24
capmc_suspend_LDFLAGS  = -export-dynamic $(JSON_LDFLAGS)
(-)a/contribs/cray/slurmsmwd/Makefile.am (+35 lines)
Line 0 Link Here
1
#
2
# Makefile for slurmsmwd
3
#
4
5
AUTOMAKE_OPTIONS = foreign
6
7
AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/src/common $(JSON_CPPFLAGS)
8
9
sbin_PROGRAMS = slurmsmwd
10
slurmsmwd_SOURCES	= main.c read_config.c
11
slurmsmwd_LDADD = 					\
12
	$(top_builddir)/src/common/libdaemonize.la \
13
	$(top_builddir)/src/api/libslurm.o $(DL_LIBS)
14
slurmsmwd_LDFLAGS = -export-dynamic $(CMD_LDFLAGS)
15
16
17
ETC_FILES = slurmsmwd.service
18
19
CLEANFILES = $(ETC_FILES)
20
21
edit = sed \
22
        -e 's|@bindir[@]|$(bindir)|g' \
23
        -e 's|@libdir[@]|$(libdir)|g' \
24
        -e 's|@sbindir[@]|$(sbindir)|g' \
25
        -e 's|@sysconfdir[@]|$(sysconfdir)|g' \
26
        -e 's|@BLUEGENE_LOADED_FALSE[@]|$(BLUEGENE_LOADED_FALSE)|g'
27
28
noinst_DATA = $(ETC_FILES)
29
30
slurmsmwd.service: Makefile $(srcdir)/slurmsmwd.service.in
31
	$(edit) $(srcdir)/slurmsmwd.service.in > slurmsmwd.service
32
33
force:
34
$(slurmsmwd_LDADD) : force
35
	@cd `dirname $@` && $(MAKE) `basename $@`
(-)a/contribs/cray/slurmsmwd/main.c (+616 lines)
Line 0 Link Here
1
/*****************************************************************************\
2
 *  main.c - Primary logic for slurmsmwd
3
 *****************************************************************************
4
 *  Copyright (C) 2017 Regents of the University of California
5
 *  Written by Douglas Jacobsen <dmjacobsen@lbl.gov>
6
 *
7
 *  This file is part of SLURM, a resource management program.
8
 *  For details, see <https://slurm.schedmd.com>.
9
 *  Please also read the included file: DISCLAIMER.
10
 *
11
 *  SLURM is free software; you can redistribute it and/or modify it under
12
 *  the terms of the GNU General Public License as published by the Free
13
 *  Software Foundation; either version 2 of the License, or (at your option)
14
 *  any later version.
15
 *
16
 *  In addition, as a special exception, the copyright holders give permission
17
 *  to link the code of portions of this program with the OpenSSL library under
18
 *  certain conditions as described in each individual source file, and
19
 *  distribute linked combinations including the two. You must obey the GNU
20
 *  General Public License in all respects for all of the code used other than
21
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
22
 *  exception to your version of the file(s), but you are not obligated to do
23
 *  so. If you do not wish to do so, delete this exception statement from your
24
 *  version.  If you delete this exception statement from all source files in
25
 *  the program, then also delete it here.
26
 *
27
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
28
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
29
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
30
 *  details.
31
 *
32
 *  You should have received a copy of the GNU General Public License along
33
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
34
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
35
\*****************************************************************************/
36
37
#define _GNU_SOURCE
38
#include <stdio.h>
39
#include <stdlib.h>
40
#include <string.h>
41
#include <errno.h>
42
#include <unistd.h>
43
#include <stdarg.h>
44
#include <pthread.h>
45
#include <ctype.h>
46
#include <signal.h>
47
#include <sys/time.h>
48
#include <poll.h>
49
50
#include "src/common/slurm_xlator.h"
51
#include "slurm/slurm.h"
52
#include "src/common/macros.h"
53
#include "src/common/xmalloc.h"
54
#include "src/common/xstring.h"
55
#include "src/common/slurm_protocol_defs.h"
56
#include "src/common/daemonize.h"
57
#include "src/common/xsignal.h"
58
#include "src/common/log.h"
59
#include "src/common/fd.h"
60
#include "src/common/proc_args.h"
61
62
#include "read_config.h"
63
64
#define MAX_POLL_WAIT 500
65
66
/* Local variables */
67
static int foreground = 0;
68
static log_options_t log_opts = 	/* Log to stderr & syslog */
69
	LOG_OPTS_INITIALIZER;
70
static int _sigarray[] = {	/* blocked signals for this process */
71
	SIGINT,  SIGTERM, SIGCHLD, SIGUSR1,
72
	SIGUSR2, SIGTSTP, SIGXCPU, SIGQUIT,
73
	SIGPIPE, SIGALRM, SIGABRT, SIGHUP, 0 };
74
static const char *xtconsumer_path = "/opt/cray/hss/default/bin/xtconsumer";
75
static int slurmsmwd_log_fmt = LOG_FMT_ISO8601_MS;
76
static pthread_t xtc_thread;
77
static pid_t xtc_pid = 0;
78
79
static int stop_running = 0;
80
static uint16_t debug_level = 0;
81
static pthread_mutex_t down_node_lock;
82
static int *down_node;
83
static size_t n_down_node;
84
static size_t down_node_sz;
85
static const char *event_description[] = {
86
	"Invalid Event",
87
	"ec_node_failed",
88
	"ec_node_unavailable"
89
};
90
91
typedef enum event_type {
92
	EVENT_INVALID = 0,
93
	EVENT_NODE_FAILED,
94
	EVENT_NODE_UNAVAILABLE,
95
	EVENT_LIMIT
96
} event_type_t;
97
98
static int _start_xtconsumer(char **xtc_argv, pid_t *pid);
99
100
void shutdown_threads(void) {
101
	stop_running = 1;
102
}
103
104
/*
 * getnid - convert a Cray component name into a numeric node id.
 * cname: component name of the form "c<cabinet>-<row>c<chassis>s<slot>n<node>"
 * dim:   number of cabinets in one row
 * Returns the computed node id, or 0 when cname is NULL or does not contain
 * all five numeric fields (the caller in this file skips an id of 0).
 */
int getnid(const char *cname, int dim) {
	int cabinet = 0, row = 0, chassis = 0, slot = 0, node = 0;
	int nodes_per_slot = 4;
	int nodes_per_chassis = nodes_per_slot * 16;
	int nodes_per_cabinet = nodes_per_chassis * 3;
	int nodes_per_row = nodes_per_cabinet * dim;

	/* Every field must parse, otherwise the arithmetic below would
	 * run on values that were never assigned by sscanf() */
	if (cname == NULL ||
	    sscanf(cname, "c%d-%dc%ds%dn%d",
		   &cabinet, &row, &chassis, &slot, &node) != 5)
		return 0;

	return cabinet * nodes_per_cabinet + row * nodes_per_row +
		chassis * nodes_per_chassis + slot * nodes_per_slot + node;
}
115
116
char *getnidlist() {
117
	char *ret = NULL;
118
	size_t idx = 0;
119
	int last_nid = 0;
120
	int in_range = 0;
121
	ret = xstrdup("nid[");
122
	for (idx = 0; idx < n_down_node; idx++) {
123
		int curr_nid = down_node[idx];
124
		if (last_nid == 0) {
125
			xstrfmtcat(ret, "%05d", curr_nid);
126
		} else if (curr_nid == last_nid) {
127
			continue;
128
		} else if (curr_nid - last_nid > 1) {
129
			if (in_range) {
130
				xstrfmtcat(ret, "-%05d", last_nid);
131
			}
132
			xstrfmtcat(ret, ",%05d", curr_nid);
133
			in_range = 0;
134
		} else if (idx == n_down_node - 1) {
135
			xstrfmtcat(ret, "-%05d", curr_nid);
136
		} else {
137
			in_range = 1;
138
		}
139
		last_nid = curr_nid;
140
	}
141
	xstrfmtcat(ret, "]");
142
	return ret;
143
}
144
145
int _mark_nodes_down() {
146
	/* locks are assumed to be held */
147
	int rc = 0;
148
	update_node_msg_t *update_msg = xmalloc(sizeof(update_node_msg_t));
149
150
	slurm_init_update_node_msg(update_msg);
151
152
	update_msg->node_names = getnidlist();
153
	update_msg->node_state = NODE_STATE_NO_RESPOND;
154
155
	info("setting %s to NotResponding", update_msg->node_names);
156
157
	rc = slurm_update_node(update_msg);
158
	if (rc != SLURM_SUCCESS)
159
		error("failed to set %s to NotResponding: %m", update_msg->node_names);
160
161
	slurm_free_update_node_msg(update_msg);
162
	return rc;
163
164
}
165
166
void *process_data(void *arg) {
167
	while (!stop_running) {
168
		slurm_mutex_lock(&down_node_lock);
169
		if (n_down_node > 0) {
170
			slurm_info("down node cnt: %lu", n_down_node);
171
			_mark_nodes_down();
172
			n_down_node = 0;
173
		}
174
		slurm_mutex_unlock(&down_node_lock);
175
		usleep(2000000);
176
177
	}
178
	return NULL;
179
}
180
181
event_type_t _parse_event(const char *input) {
182
	if (strstr(input, "ec_node_failed") != NULL)
183
		return EVENT_NODE_FAILED;
184
	if (strstr(input, "ec_node_unavailable") != NULL)
185
		return EVENT_NODE_UNAVAILABLE;
186
	return EVENT_INVALID;
187
}
188
189
/*
 * _cmp_nid - qsort_r() comparator ordering ints ascending.
 * a, b: pointers to the two ints being compared
 * arg:  unused
 * Returns -1, 0 or 1.  The comparison form is used instead of a
 * subtraction so that widely separated values cannot overflow.
 */
int _cmp_nid(const void *a, const void *b, void *arg) {
	int ai = * (const int *) a;
	int bi = * (const int *) b;
	return (ai > bi) - (ai < bi);
}
194
195
/*
 * _trim - strip leading and trailing whitespace from a string in place.
 * str: NUL-terminated string to trim; trailing whitespace is overwritten
 *      with NUL bytes
 * Returns a pointer to the first non-whitespace character inside str
 * (pointing at the terminating NUL if str is all whitespace), or NULL when
 * str is NULL.
 */
char *_trim(char *str) {
	char *ptr = str;
	size_t len = 0;

	if (str == NULL)
		return NULL;

	/* isspace() requires a value representable as unsigned char; a
	 * plain char may be negative.  isspace('\0') is false, so the scan
	 * stops at the end of the string. */
	while (isspace((unsigned char) *ptr))
		ptr++;
	if (*ptr == 0)
		return ptr;

	/* ptr[0] is known to be non-space, so the scan stops at index 0 */
	len = strlen(ptr) - 1;
	for ( ; len > 0 && isspace((unsigned char) ptr[len]); len--)
		ptr[len] = 0;

	return ptr;
}
209
210
void _send_failed_nodes(char *nodelist) {
211
	char *search = nodelist;
212
	char *svptr = NULL;
213
	char *ptr = NULL;
214
	int nid = 0;
215
	slurm_mutex_lock(&down_node_lock);
216
	while ((ptr = strtok_r(search, " ", &svptr)) != NULL) {
217
		search = NULL;
218
		while (*ptr == ':')
219
			ptr++;
220
		ptr = _trim(ptr);
221
		if (strlen(ptr) == 0)
222
			continue;
223
		nid = getnid(ptr, slurmsmwd_cabinets_per_row);
224
		if (nid == 0)
225
			continue;
226
		if (n_down_node + 1 >= down_node_sz) {
227
			size_t alloc_quantity = (n_down_node + 1) * 2;
228
			size_t alloc_size = sizeof(int) * alloc_quantity;
229
			down_node = xrealloc(down_node, alloc_size);
230
			down_node_sz = alloc_quantity;
231
		}
232
		down_node[n_down_node++] = nid;
233
	}
234
	qsort_r(down_node, n_down_node, sizeof(int), _cmp_nid, NULL);
235
	slurm_mutex_unlock(&down_node_lock);
236
}
237
238
/*
239
2017-05-16 07:17:12|2017-05-16 07:17:12|0x40008063 - ec_node_failed|src=:1:s0|::c4-2c0s2n0 ::c4-2c0s2n2 ::c4-2c0s2n3
240
2017-05-16 07:17:12|2017-05-16 07:17:12|0x400020e8 - ec_node_unavailable|src=:1:s0|::c4-2c0s2n2
241
2017-05-16 08:11:01|2017-05-16 08:11:01|0x400020e8 - ec_node_unavailable|src=:1:s0|::c4-2c0s2n0 ::c4-2c0s2n1 ::c4-2c0s2n2 ::c4-2c0s2n3
242
*/
243
void *xtconsumer_listen(void *arg) {
244
	int xtc_fd = 0;
245
	char *xtc_argv[] = {
246
		"xtconsumer",
247
		"-b",
248
		"ec_node_unavailable",
249
		"ec_node_failed"
250
	};
251
	char *line_ptr = NULL;
252
	char *buffer = NULL;
253
	size_t buffer_sz = 0;
254
	size_t buffer_off = 0;
255
	struct pollfd fds;
256
	int i = 0;
257
	int status = 0;
258
259
	xtc_fd = _start_xtconsumer(xtc_argv, &xtc_pid);
260
	debug2("got xtc_pid: %d", xtc_pid);
261
262
	if (xtc_fd < 0) {
263
		error("failed to open xtconsumer: %s", slurm_strerror(slurm_get_errno()));
264
		return NULL;
265
	}
266
267
	/* xtconsumer seems to flush out its stdout on newline (typical)
268
	 * so reading line-by-line seems to be functional for this need
269
	 */
270
	buffer_sz = 1024;
271
	buffer = xmalloc(buffer_sz);
272
	while (!stop_running) {
273
274
		fds.fd = xtc_fd;
275
		fds.events = POLLIN | POLLHUP | POLLRDHUP;
276
		fds.revents = 0;
277
278
		i = poll(&fds, 1, MAX_POLL_WAIT);
279
		if (i == 0) {
280
			continue;
281
		} else if (i < 0) {
282
			error("poll(): %s", slurm_strerror(slurm_get_errno()));
283
			break;
284
		}
285
		if ((fds.revents & POLLIN) == 0)
286
			break;
287
		i = read(xtc_fd, buffer + buffer_off,
288
				 buffer_sz - buffer_off);
289
290
		debug3("read %d bytes", i);
291
		if (i == 0) {
292
			break;
293
		} else if (i < 0) {
294
			if (errno == EAGAIN)
295
				continue;
296
			error("read(): %s", slurm_strerror(slurm_get_errno()));
297
			break;
298
		}
299
		buffer_off += i;
300
		if (buffer_off + 1024 >= buffer_sz) {
301
			buffer_sz *= 2;
302
			buffer = xrealloc(buffer, buffer_sz);
303
		}
304
305
		/* NUL terminate the string to allow strchr to work
306
		 * buffer was expanded above to ensure there would be space
307
		 */
308
		buffer[buffer_off + 1] = '\0';
309
		while ((line_ptr = strchr(buffer, '\n')) != NULL) {
310
			event_type_t event = EVENT_INVALID;
311
			char *node_list = NULL;
312
			char *search = NULL;
313
			char *ptr = NULL;
314
			char *svptr = NULL;
315
			int token_idx = 0;
316
			*line_ptr = '\0';
317
			if (strlen(buffer) == 0)
318
				goto advance_line;
319
			debug3("got line: %s", buffer);
320
			search = buffer;
321
			while ((ptr = strtok_r(search, "|", &svptr)) != NULL) {
322
				search = NULL;
323
				if (token_idx == 2)
324
					event = _parse_event(ptr);
325
				if (token_idx == 4)
326
					node_list = xstrdup(ptr);
327
328
				token_idx++;
329
			}
330
331
			if (event == EVENT_NODE_FAILED ||
332
			    event == EVENT_NODE_UNAVAILABLE) {
333
				info("received event: %s, nodelist: %s",
334
				     event_description[event], node_list);
335
				_send_failed_nodes(node_list);
336
			}
337
338
			xfree(node_list);
339
			node_list = NULL;
340
341
advance_line:
342
			*line_ptr = '\n';
343
			line_ptr++;
344
			for (ptr = buffer; *line_ptr; ptr++, line_ptr++)
345
				*ptr = *line_ptr;
346
			*ptr = *line_ptr;
347
			buffer_off = ptr - buffer;
348
		}
349
350
351
352
	}
353
	info("killing xtconsumer pid %d", xtc_pid);
354
	killpg(xtc_pid, SIGTERM);
355
	usleep(10000);
356
	killpg(xtc_pid, SIGKILL);
357
	waitpid(xtc_pid, &status, 0);
358
	close(xtc_fd);
359
360
361
#if 0
362
cleanup_break:
363
		if (node_list)
364
			xfree(node_list);
365
		break;
366
#endif
367
	xfree(buffer);
368
	return NULL;
369
370
}
371
372
/* _usage - write the command synopsis and the option summary to stderr */
static void _usage(char *prog_name)
{
	static const char *option_help[] = {
		"  -D         \tRun daemon in foreground.\n",
		"  -h         \tPrint this help message.\n",
		"  -v         \tVerbose mode. Multiple -v's increase verbosity.\n",
		"  -V         \tPrint version information and exit.\n",
		NULL
	};
	int i;

	fprintf(stderr, "Usage: %s [OPTIONS]\n", prog_name);
	for (i = 0; option_help[i] != NULL; i++)
		fputs(option_help[i], stderr);
}
385
386
static void _parse_commandline(int argc, char **argv)
387
{
388
	int c = 0;
389
390
	opterr = 0;
391
	while ((c = getopt(argc, argv, "DhvV")) != -1)
392
		switch (c) {
393
		case 'D':
394
			foreground = 1;
395
			break;
396
		case 'h':
397
			_usage(argv[0]);
398
			exit(0);
399
			break;
400
		case 'v':
401
			debug_level++;
402
			break;
403
		case 'V':
404
			print_slurm_version();
405
			exit(0);
406
			break;
407
		default:
408
			_usage(argv[0]);
409
			exit(1);
410
		}
411
}
412
413
/* Kill the currently running slurmsmwd */
414
static void _kill_old_slurmsmwd(void)
415
{
416
        int fd;
417
        pid_t oldpid;
418
419
        if (slurmsmwd_pid_file == NULL) {
420
                error("No PidFile configured");
421
                return;
422
        }
423
424
        oldpid = read_pidfile(slurmsmwd_pid_file, &fd);
425
        if (oldpid != (pid_t) 0) {
426
                info("Killing old slurmsmwd[%ld]", (long) oldpid);
427
                kill(oldpid, SIGTERM);
428
429
                /*
430
                 * Wait for previous daemon to terminate
431
                 */
432
                if (fd_get_readw_lock(fd) < 0)
433
                        fatal("Unable to wait for readw lock: %m");
434
                (void) close(fd); /* Ignore errors */
435
        }
436
}
437
438
static void _update_logging(void)
439
{
440
441
        /* Preserve execute line arguments (if any) */
442
        if (debug_level) {
443
                slurmsmwd_debug_level = MIN(
444
                        (LOG_LEVEL_INFO + debug_level),
445
                        (LOG_LEVEL_END - 1));
446
        }
447
448
        log_opts.stderr_level  = slurmsmwd_debug_level;
449
        log_opts.logfile_level = slurmsmwd_debug_level;
450
        log_opts.syslog_level  = slurmsmwd_debug_level;
451
452
        if (foreground)
453
                log_opts.syslog_level = LOG_LEVEL_QUIET;
454
        else {
455
                log_opts.stderr_level = LOG_LEVEL_QUIET;
456
                if (slurmsmwd_log_file)
457
                        log_opts.syslog_level = LOG_LEVEL_QUIET;
458
        }
459
460
        log_alter(log_opts, SYSLOG_FACILITY_DAEMON, slurmsmwd_log_file);
461
        log_set_timefmt(slurmsmwd_log_fmt);
462
}
463
464
/* reconfig - re-read slurmsmwd.conf, then refresh the logging setup. */
extern void reconfig(void)
{
	slurmsmwd_read_config();	/* reload the configuration values */
	_update_logging();		/* apply the resulting log levels */
}
469
470
/* Reset some signals to their default state to clear any
471
 * inherited signal states */
472
static void _default_sigaction(int sig)
473
{
474
        struct sigaction act;
475
476
        if (sigaction(sig, NULL, &act)) {
477
                error("sigaction(%d): %m", sig);
478
                return;
479
        }
480
        if (act.sa_handler != SIG_IGN)
481
                return;
482
483
        act.sa_handler = SIG_DFL;
484
        if (sigaction(sig, &act, NULL))
485
                error("sigaction(%d): %m", sig);
486
}
487
488
/* _signal_handler - Process daemon-wide signals */
489
static void *_signal_handler(void *no_data)
490
{
491
	int rc, sig;
492
	int sig_array[] = {SIGINT, SIGTERM, SIGHUP, SIGABRT, 0};
493
	sigset_t set;
494
495
	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
496
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
497
498
	/* Make sure no required signals are ignored (possibly inherited) */
499
	_default_sigaction(SIGINT);
500
	_default_sigaction(SIGTERM);
501
	_default_sigaction(SIGHUP);
502
	_default_sigaction(SIGABRT);
503
504
	while (1) {
505
		xsignal_sigset_create(sig_array, &set);
506
		rc = sigwait(&set, &sig);
507
		if (rc == EINTR)
508
			continue;
509
		switch (sig) {
510
		case SIGHUP:	/* kill -1 */
511
			info("Reconfigure signal (SIGHUP) received");
512
			reconfig();
513
			break;
514
		case SIGINT:	/* kill -2  or <CTRL-C> */
515
		case SIGTERM:	/* kill -15 */
516
			info("Terminate signal (SIGINT or SIGTERM) received");
517
			shutdown_threads();
518
			return NULL;	/* Normal termination */
519
		case SIGABRT:	/* abort */
520
			info("SIGABRT received");
521
			abort();	/* Should terminate here */
522
			shutdown_threads();
523
			return NULL;
524
		default:
525
			error("Invalid signal (%d) received", sig);
526
		}
527
	}
528
529
}
530
531
int main(int argc, char **argv) {
532
	pthread_t processing_thread, signal_handler_thread;
533
	pthread_attr_t thread_attr;
534
535
	_parse_commandline(argc, argv);
536
537
	log_init(argv[0], log_opts, LOG_DAEMON, NULL);
538
	reconfig();
539
	slurmsmwd_print_config();
540
541
	_kill_old_slurmsmwd();
542
	if (!foreground) {
543
		daemon(0, 1);
544
		log_alter(log_opts, LOG_DAEMON, slurmsmwd_log_file);
545
	}
546
	if (create_pidfile(slurmsmwd_pid_file, 0) < 0)
547
		fatal("Unable to create pidfile %s", slurmsmwd_pid_file);
548
549
	slurm_mutex_init(&down_node_lock);
550
551
        /* Create attached thread for signal handling */
552
	if (xsignal_block(_sigarray) < 0)
553
		error("Unable to block signals");
554
        slurm_attr_init(&thread_attr);
555
        if (pthread_create(&signal_handler_thread, &thread_attr,
556
			   _signal_handler, NULL))
557
                fatal("pthread_create %m");
558
        slurm_attr_destroy(&thread_attr);
559
560
	slurm_attr_init(&thread_attr);
561
	if (pthread_create(&processing_thread, &thread_attr,
562
			   &process_data, NULL))
563
		fatal("pthread_create %m");
564
	slurm_attr_destroy(&thread_attr);
565
566
	while (!stop_running) {
567
		slurm_attr_init(&thread_attr);
568
		if (pthread_create(&xtc_thread, &thread_attr,
569
				   &xtconsumer_listen, NULL))
570
			fatal("pthread_create %m");
571
		slurm_attr_destroy(&thread_attr);
572
		pthread_join(xtc_thread, NULL);
573
	}
574
575
	pthread_join(processing_thread, NULL);
576
	slurm_mutex_destroy(&down_node_lock);
577
	return 0;
578
}
579
580
static int _start_xtconsumer(char **xtc_argv, pid_t *pid)
581
{
582
	int cc, i;
583
	pid_t cpid;
584
	int pfd[2] = { -1, -1 };
585
586
	if (access(xtconsumer_path, R_OK | X_OK) < 0) {
587
		error("Can not execute: %s", xtconsumer_path);
588
		return -1;
589
	}
590
	if (pipe(pfd) != 0) {
591
		error("pipe(): %s", slurm_strerror(slurm_get_errno()));
592
		return -1;
593
	}
594
595
	if ((cpid = fork()) == 0) {
596
		cc = sysconf(_SC_OPEN_MAX);
597
		dup2(pfd[1], STDERR_FILENO);
598
		dup2(pfd[1], STDOUT_FILENO);
599
		for (i = 0; i < cc; i++) {
600
			if ((i != STDERR_FILENO) && (i != STDOUT_FILENO))
601
				close(i);
602
		}
603
		setpgid(0, 0);
604
		execv(xtconsumer_path, xtc_argv);
605
		error("execv(): %s", slurm_strerror(slurm_get_errno()));
606
		exit(127);
607
	} else if (cpid < 0) {
608
		close(pfd[0]);
609
		close(pfd[1]);
610
		error("fork(): %s", slurm_strerror(slurm_get_errno()));
611
		return -1;
612
	}
613
	*pid = cpid;
614
	close(pfd[1]);
615
	return pfd[0];
616
}
(-)a/contribs/cray/slurmsmwd/read_config.c (+110 lines)
Line 0 Link Here
1
/*****************************************************************************\
2
 *  read_config.c - Read configuration file for slurmsmwd
3
 *****************************************************************************
4
 *  Copyright (C) 2017 Regents of the University of California
5
 *  Written by Douglas Jacobsen <dmjacobsen@lbl.gov>
6
 *
7
 *  This file is part of SLURM, a resource management program.
8
 *  For details, see <https://slurm.schedmd.com>.
9
 *  Please also read the included file: DISCLAIMER.
10
 *
11
 *  SLURM is free software; you can redistribute it and/or modify it under
12
 *  the terms of the GNU General Public License as published by the Free
13
 *  Software Foundation; either version 2 of the License, or (at your option)
14
 *  any later version.
15
 *
16
 *  In addition, as a special exception, the copyright holders give permission
17
 *  to link the code of portions of this program with the OpenSSL library under
18
 *  certain conditions as described in each individual source file, and
19
 *  distribute linked combinations including the two. You must obey the GNU
20
 *  General Public License in all respects for all of the code used other than
21
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
22
 *  exception to your version of the file(s), but you are not obligated to do
23
 *  so. If you do not wish to do so, delete this exception statement from your
24
 *  version.  If you delete this exception statement from all source files in
25
 *  the program, then also delete it here.
26
 *
27
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
28
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
29
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
30
 *  details.
31
 *
32
 *  You should have received a copy of the GNU General Public License along
33
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
34
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
35
\*****************************************************************************/
36
37
#include <stdlib.h>
38
#include <string.h>
39
#include <sys/stat.h>
40
#include <sys/types.h>
41
#include <unistd.h>
42
43
#include "src/common/slurm_xlator.h"	/* Must be first */
44
#include "src/common/parse_config.h"
45
#include "read_config.h"
46
47
/* Global variables */
48
uint16_t slurmsmwd_cabinets_per_row = 0;
49
uint16_t slurmsmwd_debug_level = LOG_LEVEL_INFO;
50
char *slurmsmwd_log_file = NULL;
51
char *slurmsmwd_pid_file = NULL;
52
53
static s_p_options_t slurmsmwd_options[] = {
54
	{"CabinetsPerRow", S_P_UINT16},
55
	{"DebugLevel", S_P_STRING},
56
	{"LogFile", S_P_STRING},
57
	{"PidFile", S_P_STRING},
58
	{NULL}
59
};
60
61
extern void slurmsmwd_print_config(void)
62
{
63
	debug2("slurmsmwd configuration");
64
	debug2("CabinetsPerRow = %u", slurmsmwd_cabinets_per_row);
65
	debug2("DebugLevel     = %u", slurmsmwd_debug_level);
66
	debug2("LogFile        = %s", slurmsmwd_log_file);
67
	debug2("PidFile        = %s", slurmsmwd_pid_file);
68
}
69
70
static void _validate_config(void)
71
{
72
	if (slurmsmwd_cabinets_per_row == 0)
73
		fatal("slurmsmwd.conf: CabinetsPerRow must not be zero");
74
}
75
76
/* Load configuration file contents into global variables.
77
 * Call slurmsmwd_free_config to free memory. */
78
extern void slurmsmwd_read_config(void)
79
{
80
	char *config_file = NULL;
81
	char *temp_str = NULL;
82
	s_p_hashtbl_t *tbl = NULL;
83
	struct stat config_stat;
84
85
	config_file = get_extra_conf_path("slurmsmwd.conf");
86
	if (stat(config_file, &config_stat) < 0)
87
		fatal("Can't stat slurmsmwd.conf %s: %m", config_file);
88
	tbl = s_p_hashtbl_create(slurmsmwd_options);
89
	if (s_p_parse_file(tbl, NULL, config_file, false) == SLURM_ERROR)
90
		fatal("Can't parse slurmsmwd.conf %s: %m", config_file);
91
92
	s_p_get_uint16(&slurmsmwd_cabinets_per_row, "CabinetsPerRow", tbl);
93
	s_p_get_string(&slurmsmwd_log_file, "LogFile", tbl);
94
	if (s_p_get_string(&temp_str, "DebugLevel", tbl)) {
95
		slurmsmwd_debug_level = log_string2num(temp_str);
96
		if (slurmsmwd_debug_level == (uint16_t) NO_VAL)
97
			fatal("Invalid DebugLevel %s", temp_str);
98
		xfree(temp_str);
99
	}
100
	if (s_p_get_string(&temp_str, "PidFile", tbl)) {
101
		slurmsmwd_pid_file = temp_str;
102
	} else {
103
		slurmsmwd_pid_file = xstrdup("/var/run/slurmsmwd.pid");
104
	}
105
106
	_validate_config();
107
108
	s_p_hashtbl_destroy(tbl);
109
	xfree(config_file);
110
}
(-)a/contribs/cray/slurmsmwd/read_config.h (+56 lines)
Line 0 Link Here
1
/*****************************************************************************\
2
 *  read_config.h - Define symbols used to read configuration file for
3
 *  slurmsmwd
4
 *****************************************************************************
5
 *  Copyright (C) 2017 Regents of the University of California
6
 *  Written by Douglas Jacobsen <dmjacobsen@lbl.gov>
7
 *
8
 *  This file is part of SLURM, a resource management program.
9
 *  For details, see <https://slurm.schedmd.com>.
10
 *  Please also read the included file: DISCLAIMER.
11
 *
12
 *  SLURM is free software; you can redistribute it and/or modify it under
13
 *  the terms of the GNU General Public License as published by the Free
14
 *  Software Foundation; either version 2 of the License, or (at your option)
15
 *  any later version.
16
 *
17
 *  In addition, as a special exception, the copyright holders give permission
18
 *  to link the code of portions of this program with the OpenSSL library under
19
 *  certain conditions as described in each individual source file, and
20
 *  distribute linked combinations including the two. You must obey the GNU
21
 *  General Public License in all respects for all of the code used other than
22
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
23
 *  exception to your version of the file(s), but you are not obligated to do
24
 *  so. If you do not wish to do so, delete this exception statement from your
25
 *  version.  If you delete this exception statement from all source files in
26
 *  the program, then also delete it here.
27
 *
28
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
29
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
30
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
31
 *  details.
32
 *
33
 *  You should have received a copy of the GNU General Public License along
34
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
35
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
36
\*****************************************************************************/
37
38
#ifndef _HAVE_SLURMSMWD_READ_CONFIG_H
39
#define _HAVE_SLURMSMWD_READ_CONFIG_H
40
41
#include <inttypes.h>
42
#include <sys/types.h>
43
#include <unistd.h>
44
45
extern uint16_t slurmsmwd_cabinets_per_row;
46
extern uint16_t slurmsmwd_debug_level;
47
extern char *slurmsmwd_log_file;
48
extern char *slurmsmwd_pid_file;
49
50
/* Configuration functions */
51
52
/* Load configuration file contents into global variables. */
53
extern void slurmsmwd_read_config(void);
54
extern void slurmsmwd_print_config(void);
55
56
#endif	/* _HAVE_SLURMSMWD_READ_CONFIG_H */
(-)a/contribs/cray/slurmsmwd/slurmsmwd.service.in (+13 lines)
Line 0 Link Here
1
[Unit]
2
Description=Cray SMW xtconsumer Slurm Helper daemon
3
After=network.target munge.service
4
ConditionPathExists=@sysconfdir@/slurmsmwd.conf
5
6
[Service]
7
Type=forking
8
ExecStart=@sbindir@/slurmsmwd
9
ExecReload=/bin/kill -HUP $MAINPID
10
PIDFile=/var/run/slurmsmwd.pid
11
12
[Install]
13
WantedBy=multi-user.target
(-)a/slurm.spec (+31 lines)
Lines 371-376 running on the node, or any user who has allocated resources on the node Link Here
371
according to the Slurm
371
according to the Slurm
372
%endif
372
%endif
373
%if %{slurm_with cray}
374
%package smw
375
Summary: support daemons and software for the Cray SMW
376
Group: System Environment/Base
377
Requires: slurm
378
Obsoletes: smw
379
%description smw
380
support daemons and software for the Cray SMW.  Includes slurmsmwd which
381
notifies slurm about failed nodes.
382
%endif
383
373
#############################################################################
384
#############################################################################
374
%prep
385
%prep
Lines 402-409 according to the Slurm Link Here
402
%__make %{?_smp_mflags}
413
%__make %{?_smp_mflags}
414
%if %{slurm_with cray}
415
cd contribs/cray/slurmsmwd
416
%__make %{?_smp_mflags}
417
%endif
418
403
%install
419
%install
420
%__make %{?_smp_mflags} install
421
%if %{slurm_with cray}
# Run in a subshell so the remainder of the install section keeps the
# top-level build directory as its working directory; later steps refer
# to files by paths relative to it.
(cd contribs/cray/slurmsmwd && %__make %{?_smp_mflags} install)
%endif
404
# Strip out some dependencies
425
# Strip out some dependencies
Lines 431-436 elif [ -d /etc/init.d ]; then Link Here
431
   ln -s ../../etc/init.d/slurmdbd $RPM_BUILD_ROOT/usr/sbin/rcslurmdbd
452
   ln -s ../../etc/init.d/slurmdbd $RPM_BUILD_ROOT/usr/sbin/rcslurmdbd
432
fi
453
fi
454
%if %{slurm_with cray}
455
   install -D -m644 contribs/cray/slurmsmwd/slurmsmwd.service $RPM_BUILD_ROOT/usr/lib/systemd/system/slurmsmwd.service
456
%endif
457
433
# Do not package Slurm's version of libpmi on Cray systems.
458
# Do not package Slurm's version of libpmi on Cray systems.
434
# Cray's version of libpmi should be used.
459
# Cray's version of libpmi should be used.
435
%if %{slurm_with cray} || %{slurm_with cray_alps}
460
%if %{slurm_with cray} || %{slurm_with cray_alps}
Lines 975-980 rm -rf $RPM_BUILD_ROOT Link Here
975
%files -f pam.files pam_slurm
1000
%files -f pam.files pam_slurm
976
%defattr(-,root,root)
1001
%defattr(-,root,root)
977
%endif
1002
%endif
1003
1004
%if %{slurm_with cray}
1005
%files smw
1006
%{_sbindir}/slurmsmwd
1007
/usr/lib/systemd/system/slurmsmwd.service
1008
%endif
978
#############################################################################
1009
#############################################################################
979
%pre
1010
%pre

Return to ticket 3318