|
Line 0
Link Here
|
|
|
1 |
/*****************************************************************************\ |
| 2 |
* main.c - Primary logic for slurmsmwd |
| 3 |
***************************************************************************** |
| 4 |
* Copyright (C) 2017 Regents of the University of California |
| 5 |
* Written by Douglas Jacobsen <dmjacobsen@lbl.gov> |
| 6 |
* |
| 7 |
* This file is part of SLURM, a resource management program. |
| 8 |
* For details, see <https://slurm.schedmd.com>. |
| 9 |
* Please also read the included file: DISCLAIMER. |
| 10 |
* |
| 11 |
* SLURM is free software; you can redistribute it and/or modify it under |
| 12 |
* the terms of the GNU General Public License as published by the Free |
| 13 |
* Software Foundation; either version 2 of the License, or (at your option) |
| 14 |
* any later version. |
| 15 |
* |
| 16 |
* In addition, as a special exception, the copyright holders give permission |
| 17 |
* to link the code of portions of this program with the OpenSSL library under |
| 18 |
* certain conditions as described in each individual source file, and |
| 19 |
* distribute linked combinations including the two. You must obey the GNU |
| 20 |
* General Public License in all respects for all of the code used other than |
| 21 |
* OpenSSL. If you modify file(s) with this exception, you may extend this |
| 22 |
* exception to your version of the file(s), but you are not obligated to do |
| 23 |
* so. If you do not wish to do so, delete this exception statement from your |
| 24 |
* version. If you delete this exception statement from all source files in |
| 25 |
* the program, then also delete it here. |
| 26 |
* |
| 27 |
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| 28 |
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 29 |
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| 30 |
* details. |
| 31 |
* |
| 32 |
* You should have received a copy of the GNU General Public License along |
| 33 |
* with SLURM; if not, write to the Free Software Foundation, Inc., |
| 34 |
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| 35 |
\*****************************************************************************/ |
| 36 |
|
| 37 |
#define _GNU_SOURCE |
| 38 |
#include <stdio.h> |
| 39 |
#include <stdlib.h> |
| 40 |
#include <string.h> |
| 41 |
#include <errno.h> |
| 42 |
#include <unistd.h> |
| 43 |
#include <stdarg.h> |
| 44 |
#include <pthread.h> |
| 45 |
#include <ctype.h> |
| 46 |
#include <signal.h> |
| 47 |
#include <sys/time.h> |
| 48 |
#include <poll.h> |
| 49 |
|
| 50 |
#include "src/common/slurm_xlator.h" |
| 51 |
#include "slurm/slurm.h" |
| 52 |
#include "src/common/macros.h" |
| 53 |
#include "src/common/xmalloc.h" |
| 54 |
#include "src/common/xstring.h" |
| 55 |
#include "src/common/slurm_protocol_defs.h" |
| 56 |
#include "src/common/daemonize.h" |
| 57 |
#include "src/common/xsignal.h" |
| 58 |
#include "src/common/log.h" |
| 59 |
#include "src/common/proc_args.h" |
| 60 |
|
| 61 |
#include "read_config.h" |
| 62 |
|
| 63 |
#define MAX_POLL_WAIT 500 |
| 64 |
|
| 65 |
/* Local variables */ |
| 66 |
static int foreground = 0; |
| 67 |
static log_options_t log_opts = /* Log to stderr & syslog */ |
| 68 |
LOG_OPTS_INITIALIZER; |
| 69 |
static int _sigarray[] = { /* blocked signals for this process */ |
| 70 |
SIGINT, SIGTERM, SIGCHLD, SIGUSR1, |
| 71 |
SIGUSR2, SIGTSTP, SIGXCPU, SIGQUIT, |
| 72 |
SIGPIPE, SIGALRM, SIGABRT, SIGHUP, 0 }; |
| 73 |
static const char *xtconsumer_path = "/opt/cray/hss/default/bin/xtconsumer"; |
| 74 |
static int slurmsmwd_log_fmt = LOG_FMT_ISO8601_MS; |
| 75 |
static pthread_t xtc_thread; |
| 76 |
static pid_t xtc_pid = 0; |
| 77 |
|
| 78 |
static int stop_running = 0; |
| 79 |
static uint16_t debug_level = 0; |
| 80 |
static pthread_mutex_t down_node_lock; |
| 81 |
static int *down_node; |
| 82 |
static size_t n_down_node; |
| 83 |
static size_t down_node_sz; |
| 84 |
static const char *event_description[] = { |
| 85 |
"Invalid Event", |
| 86 |
"ec_node_failed", |
| 87 |
"ec_node_unavailable" |
| 88 |
}; |
| 89 |
|
| 90 |
typedef enum event_type { |
| 91 |
EVENT_INVALID = 0, |
| 92 |
EVENT_NODE_FAILED, |
| 93 |
EVENT_NODE_UNAVAILABLE, |
| 94 |
EVENT_LIMIT |
| 95 |
} event_type_t; |
| 96 |
|
| 97 |
static int _start_xtconsumer(char **xtc_argv, pid_t *pid); |
| 98 |
|
| 99 |
void shutdown_threads(void) { |
| 100 |
stop_running = 1; |
| 101 |
} |
| 102 |
|
| 103 |
/*
 * getnid - translate a component name into a numeric node id.
 *
 * cname: name of the form "c<cabinet>-<row>c<chassis>s<slot>n<node>",
 *        for example "c4-2c0s2n0"
 * dim:   number of cabinets per row
 *
 * The computation uses 4 nodes per slot, 16 slots per chassis and
 * 3 chassis per cabinet.
 *
 * Returns the node id, or 0 when cname is NULL or does not contain all five
 * fields. (Previously a malformed name produced a value computed from
 * uninitialized variables.)
 */
int getnid(const char *cname, int dim) {
	int cabinet = 0, row = 0, chassis = 0, slot = 0, node = 0;
	int nodes_per_slot = 4;
	int nodes_per_chassis = nodes_per_slot * 16;
	int nodes_per_cabinet = nodes_per_chassis * 3;
	int nodes_per_row = nodes_per_cabinet * dim;

	if (cname == NULL)
		return 0;

	/* every one of the five fields must be converted */
	if (sscanf(cname, "c%d-%dc%ds%dn%d",
		   &cabinet, &row, &chassis, &slot, &node) != 5)
		return 0;

	return cabinet * nodes_per_cabinet + row * nodes_per_row +
	       chassis * nodes_per_chassis + slot * nodes_per_slot + node;
}
| 114 |
|
| 115 |
char *getnidlist() { |
| 116 |
char *ret = NULL; |
| 117 |
size_t idx = 0; |
| 118 |
int last_nid = 0; |
| 119 |
int in_range = 0; |
| 120 |
ret = xstrdup("nid["); |
| 121 |
for (idx = 0; idx < n_down_node; idx++) { |
| 122 |
int curr_nid = down_node[idx]; |
| 123 |
if (last_nid == 0) { |
| 124 |
xstrfmtcat(ret, "%05d", curr_nid); |
| 125 |
} else if (curr_nid == last_nid) { |
| 126 |
continue; |
| 127 |
} else if (curr_nid - last_nid > 1) { |
| 128 |
if (in_range) { |
| 129 |
xstrfmtcat(ret, "-%05d", last_nid); |
| 130 |
} |
| 131 |
xstrfmtcat(ret, ",%05d", curr_nid); |
| 132 |
in_range = 0; |
| 133 |
} else if (idx == n_down_node - 1) { |
| 134 |
xstrfmtcat(ret, "-%05d", curr_nid); |
| 135 |
} else { |
| 136 |
in_range = 1; |
| 137 |
} |
| 138 |
last_nid = curr_nid; |
| 139 |
} |
| 140 |
xstrfmtcat(ret, "]"); |
| 141 |
return ret; |
| 142 |
} |
| 143 |
|
| 144 |
int _mark_nodes_down() { |
| 145 |
/* locks are assumed to be held */ |
| 146 |
int rc = 0; |
| 147 |
update_node_msg_t *update_msg = xmalloc(sizeof(update_node_msg_t)); |
| 148 |
|
| 149 |
slurm_init_update_node_msg(update_msg); |
| 150 |
|
| 151 |
update_msg->node_names = getnidlist(); |
| 152 |
update_msg->node_state = NODE_STATE_NO_RESPOND; |
| 153 |
|
| 154 |
info("setting %s to NotResponding", update_msg->node_names); |
| 155 |
|
| 156 |
rc = slurm_update_node(update_msg); |
| 157 |
if (rc != SLURM_SUCCESS) |
| 158 |
error("failed to set %s to NotResponding: %m", update_msg->node_names); |
| 159 |
|
| 160 |
slurm_free_update_node_msg(update_msg); |
| 161 |
return rc; |
| 162 |
|
| 163 |
} |
| 164 |
|
| 165 |
void *process_data(void *arg) { |
| 166 |
while (!stop_running) { |
| 167 |
slurm_mutex_lock(&down_node_lock); |
| 168 |
if (n_down_node > 0) { |
| 169 |
slurm_info("down node cnt: %lu", n_down_node); |
| 170 |
_mark_nodes_down(); |
| 171 |
n_down_node = 0; |
| 172 |
} |
| 173 |
slurm_mutex_unlock(&down_node_lock); |
| 174 |
usleep(2000000); |
| 175 |
|
| 176 |
} |
| 177 |
return NULL; |
| 178 |
} |
| 179 |
|
| 180 |
event_type_t _parse_event(const char *input) { |
| 181 |
if (strstr(input, "ec_node_failed") != NULL) |
| 182 |
return EVENT_NODE_FAILED; |
| 183 |
if (strstr(input, "ec_node_unavailable") != NULL) |
| 184 |
return EVENT_NODE_UNAVAILABLE; |
| 185 |
return EVENT_INVALID; |
| 186 |
} |
| 187 |
|
| 188 |
/*
 * _cmp_nid - qsort_r() comparison callback ordering ints ascending.
 *
 * a, b: pointers to the two ints to compare
 * arg:  unused
 *
 * Returns a negative, zero or positive value when *a is less than, equal
 * to or greater than *b. The values are compared rather than subtracted,
 * so the result is correct even where a - b would overflow.
 */
int _cmp_nid(const void *a, const void *b, void *arg) {
	int ai = *(const int *) a;
	int bi = *(const int *) b;

	(void) arg;
	return (ai > bi) - (ai < bi);
}
| 193 |
|
| 194 |
/*
 * _trim - strip leading and trailing white space in place.
 *
 * str: NUL terminated string to trim; may be NULL
 *
 * Trailing white space is overwritten with a NUL. Leading white space is
 * skipped by returning a pointer into str, so the result must not be freed
 * in place of str.
 *
 * Returns NULL when str is NULL, otherwise a pointer to the first
 * character that is not white space (the terminating NUL if there is none).
 */
char *_trim(char *str) {
	char *ptr = str;
	size_t len = 0;

	if (str == NULL)
		return NULL;

	/* <ctype.h> functions require a value representable as unsigned char */
	while (isspace((unsigned char) *ptr))
		ptr++;

	len = strlen(ptr);
	while (len > 0 && isspace((unsigned char) ptr[len - 1]))
		len--;
	ptr[len] = '\0';

	return ptr;
}
| 208 |
|
| 209 |
void _send_failed_nodes(char *nodelist) { |
| 210 |
char *search = nodelist; |
| 211 |
char *svptr = NULL; |
| 212 |
char *ptr = NULL; |
| 213 |
int nid = 0; |
| 214 |
slurm_mutex_lock(&down_node_lock); |
| 215 |
while ((ptr = strtok_r(search, " ", &svptr)) != NULL) { |
| 216 |
search = NULL; |
| 217 |
while (*ptr == ':') |
| 218 |
ptr++; |
| 219 |
ptr = _trim(ptr); |
| 220 |
if (strlen(ptr) == 0) |
| 221 |
continue; |
| 222 |
nid = getnid(ptr, slurmsmwd_cabinets_per_row); |
| 223 |
if (nid == 0) |
| 224 |
continue; |
| 225 |
if (n_down_node + 1 >= down_node_sz) { |
| 226 |
size_t alloc_quantity = (n_down_node + 1) * 2; |
| 227 |
size_t alloc_size = sizeof(int) * alloc_quantity; |
| 228 |
down_node = xrealloc(down_node, alloc_size); |
| 229 |
down_node_sz = alloc_quantity; |
| 230 |
} |
| 231 |
down_node[n_down_node++] = nid; |
| 232 |
} |
| 233 |
qsort_r(down_node, n_down_node, sizeof(int), _cmp_nid, NULL); |
| 234 |
slurm_mutex_unlock(&down_node_lock); |
| 235 |
} |
| 236 |
|
| 237 |
/* |
| 238 |
2017-05-16 07:17:12|2017-05-16 07:17:12|0x40008063 - ec_node_failed|src=:1:s0|::c4-2c0s2n0 ::c4-2c0s2n2 ::c4-2c0s2n3 |
| 239 |
2017-05-16 07:17:12|2017-05-16 07:17:12|0x400020e8 - ec_node_unavailable|src=:1:s0|::c4-2c0s2n2 |
| 240 |
2017-05-16 08:11:01|2017-05-16 08:11:01|0x400020e8 - ec_node_unavailable|src=:1:s0|::c4-2c0s2n0 ::c4-2c0s2n1 ::c4-2c0s2n2 ::c4-2c0s2n3 |
| 241 |
*/ |
| 242 |
void *xtconsumer_listen(void *arg) { |
| 243 |
int xtc_fd = 0; |
| 244 |
char *xtc_argv[] = { |
| 245 |
"xtconsumer", |
| 246 |
"-b", |
| 247 |
"ec_node_unavailable", |
| 248 |
"ec_node_failed" |
| 249 |
}; |
| 250 |
char *line_ptr = NULL; |
| 251 |
char *buffer = NULL; |
| 252 |
size_t buffer_sz = 0; |
| 253 |
size_t buffer_off = 0; |
| 254 |
struct pollfd fds; |
| 255 |
int i = 0; |
| 256 |
int status = 0; |
| 257 |
|
| 258 |
xtc_fd = _start_xtconsumer(xtc_argv, &xtc_pid); |
| 259 |
debug2("got xtc_pid: %d", xtc_pid); |
| 260 |
|
| 261 |
if (xtc_fd < 0) { |
| 262 |
error("failed to open xtconsumer: %s", slurm_strerror(slurm_get_errno())); |
| 263 |
return NULL; |
| 264 |
} |
| 265 |
|
| 266 |
/* xtconsumer seems to flush out its stdout on newline (typical) |
| 267 |
* so reading line-by-line seems to be functional for this need |
| 268 |
*/ |
| 269 |
buffer_sz = 1024; |
| 270 |
buffer = xmalloc(buffer_sz); |
| 271 |
while (!stop_running) { |
| 272 |
|
| 273 |
fds.fd = xtc_fd; |
| 274 |
fds.events = POLLIN | POLLHUP | POLLRDHUP; |
| 275 |
fds.revents = 0; |
| 276 |
|
| 277 |
i = poll(&fds, 1, MAX_POLL_WAIT); |
| 278 |
if (i == 0) { |
| 279 |
continue; |
| 280 |
} else if (i < 0) { |
| 281 |
error("poll(): %s", slurm_strerror(slurm_get_errno())); |
| 282 |
break; |
| 283 |
} |
| 284 |
if ((fds.revents & POLLIN) == 0) |
| 285 |
break; |
| 286 |
i = read(xtc_fd, buffer + buffer_off, |
| 287 |
buffer_sz - buffer_off); |
| 288 |
|
| 289 |
debug3("read %d bytes", i); |
| 290 |
if (i == 0) { |
| 291 |
break; |
| 292 |
} else if (i < 0) { |
| 293 |
if (errno == EAGAIN) |
| 294 |
continue; |
| 295 |
error("read(): %s", slurm_strerror(slurm_get_errno())); |
| 296 |
break; |
| 297 |
} |
| 298 |
buffer_off += i; |
| 299 |
if (buffer_off + 1024 >= buffer_sz) { |
| 300 |
buffer_sz *= 2; |
| 301 |
buffer = xrealloc(buffer, buffer_sz); |
| 302 |
} |
| 303 |
|
| 304 |
/* NUL terminate the string to allow strchr to work |
| 305 |
* buffer was expanded above to ensure there would be space |
| 306 |
*/ |
| 307 |
buffer[buffer_off + 1] = '\0'; |
| 308 |
while ((line_ptr = strchr(buffer, '\n')) != NULL) { |
| 309 |
event_type_t event = EVENT_INVALID; |
| 310 |
char *node_list = NULL; |
| 311 |
char *search = NULL; |
| 312 |
char *ptr = NULL; |
| 313 |
char *svptr = NULL; |
| 314 |
int token_idx = 0; |
| 315 |
*line_ptr = '\0'; |
| 316 |
if (strlen(buffer) == 0) |
| 317 |
goto advance_line; |
| 318 |
debug3("got line: %s", buffer); |
| 319 |
search = buffer; |
| 320 |
while ((ptr = strtok_r(search, "|", &svptr)) != NULL) { |
| 321 |
search = NULL; |
| 322 |
if (token_idx == 2) |
| 323 |
event = _parse_event(ptr); |
| 324 |
if (token_idx == 4) |
| 325 |
node_list = xstrdup(ptr); |
| 326 |
|
| 327 |
token_idx++; |
| 328 |
} |
| 329 |
|
| 330 |
if (event == EVENT_NODE_FAILED || |
| 331 |
event == EVENT_NODE_UNAVAILABLE) { |
| 332 |
info("received event: %s, nodelist: %s", |
| 333 |
event_description[event], node_list); |
| 334 |
_send_failed_nodes(node_list); |
| 335 |
} |
| 336 |
|
| 337 |
xfree(node_list); |
| 338 |
node_list = NULL; |
| 339 |
|
| 340 |
advance_line: |
| 341 |
*line_ptr = '\n'; |
| 342 |
line_ptr++; |
| 343 |
for (ptr = buffer; *line_ptr; ptr++, line_ptr++) |
| 344 |
*ptr = *line_ptr; |
| 345 |
*ptr = *line_ptr; |
| 346 |
buffer_off = ptr - buffer; |
| 347 |
} |
| 348 |
|
| 349 |
|
| 350 |
|
| 351 |
} |
| 352 |
info("killing xtconsumer pid %d", xtc_pid); |
| 353 |
killpg(xtc_pid, SIGTERM); |
| 354 |
usleep(10000); |
| 355 |
killpg(xtc_pid, SIGKILL); |
| 356 |
waitpid(xtc_pid, &status, 0); |
| 357 |
close(xtc_fd); |
| 358 |
|
| 359 |
|
| 360 |
#if 0 |
| 361 |
cleanup_break: |
| 362 |
if (node_list) |
| 363 |
xfree(node_list); |
| 364 |
break; |
| 365 |
#endif |
| 366 |
xfree(buffer); |
| 367 |
return NULL; |
| 368 |
|
| 369 |
} |
| 370 |
|
| 371 |
/* _usage - print a message describing the command line arguments
 * prog_name IN - name shown in the "Usage:" line
 * All output goes to stderr. */
static void _usage(char *prog_name)
{
	/* one entry per supported option, printed in this order */
	static const char *const option_help[] = {
		" -D \t" "Run daemon in foreground.\n",
		" -h \t" "Print this help message.\n",
		" -v \t" "Verbose mode. Multiple -v's increase verbosity.\n",
		" -V \t" "Print version information and exit.\n",
	};
	size_t i;

	fprintf(stderr, "Usage: %s [OPTIONS]\n", prog_name);
	for (i = 0; i < sizeof(option_help) / sizeof(option_help[0]); i++)
		fputs(option_help[i], stderr);
}
| 384 |
|
| 385 |
static void _parse_commandline(int argc, char **argv) |
| 386 |
{ |
| 387 |
int c = 0; |
| 388 |
|
| 389 |
opterr = 0; |
| 390 |
while ((c = getopt(argc, argv, "DhvV")) != -1) |
| 391 |
switch (c) { |
| 392 |
case 'D': |
| 393 |
foreground = 1; |
| 394 |
break; |
| 395 |
case 'h': |
| 396 |
_usage(argv[0]); |
| 397 |
exit(0); |
| 398 |
break; |
| 399 |
case 'v': |
| 400 |
debug_level++; |
| 401 |
break; |
| 402 |
case 'V': |
| 403 |
print_slurm_version(); |
| 404 |
exit(0); |
| 405 |
break; |
| 406 |
default: |
| 407 |
_usage(argv[0]); |
| 408 |
exit(1); |
| 409 |
} |
| 410 |
} |
| 411 |
|
| 412 |
static void _update_logging(void) |
| 413 |
{ |
| 414 |
|
| 415 |
/* Preserve execute line arguments (if any) */ |
| 416 |
if (debug_level) { |
| 417 |
slurmsmwd_debug_level = MIN( |
| 418 |
(LOG_LEVEL_INFO + debug_level), |
| 419 |
(LOG_LEVEL_END - 1)); |
| 420 |
} |
| 421 |
|
| 422 |
log_opts.stderr_level = slurmsmwd_debug_level; |
| 423 |
log_opts.logfile_level = slurmsmwd_debug_level; |
| 424 |
log_opts.syslog_level = slurmsmwd_debug_level; |
| 425 |
|
| 426 |
if (foreground) |
| 427 |
log_opts.syslog_level = LOG_LEVEL_QUIET; |
| 428 |
else { |
| 429 |
log_opts.stderr_level = LOG_LEVEL_QUIET; |
| 430 |
if (slurmsmwd_log_file) |
| 431 |
log_opts.syslog_level = LOG_LEVEL_QUIET; |
| 432 |
} |
| 433 |
|
| 434 |
log_alter(log_opts, SYSLOG_FACILITY_DAEMON, slurmsmwd_log_file); |
| 435 |
log_set_timefmt(slurmsmwd_log_fmt); |
| 436 |
} |
| 437 |
|
| 438 |
/*
 * reconfig - re-read the configuration and re-apply the logging setup.
 *
 * Called once at start-up and again whenever SIGHUP is received.
 */
extern void reconfig(void)
{
	slurmsmwd_read_config();
	_update_logging();
}
| 443 |
|
| 444 |
/* Reset some signals to their default state to clear any |
| 445 |
* inherited signal states */ |
| 446 |
static void _default_sigaction(int sig) |
| 447 |
{ |
| 448 |
struct sigaction act; |
| 449 |
|
| 450 |
if (sigaction(sig, NULL, &act)) { |
| 451 |
error("sigaction(%d): %m", sig); |
| 452 |
return; |
| 453 |
} |
| 454 |
if (act.sa_handler != SIG_IGN) |
| 455 |
return; |
| 456 |
|
| 457 |
act.sa_handler = SIG_DFL; |
| 458 |
if (sigaction(sig, &act, NULL)) |
| 459 |
error("sigaction(%d): %m", sig); |
| 460 |
} |
| 461 |
|
| 462 |
/* _signal_handler - Process daemon-wide signals */ |
| 463 |
static void *_signal_handler(void *no_data) |
| 464 |
{ |
| 465 |
int rc, sig; |
| 466 |
int sig_array[] = {SIGINT, SIGTERM, SIGHUP, SIGABRT, 0}; |
| 467 |
sigset_t set; |
| 468 |
|
| 469 |
(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); |
| 470 |
(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); |
| 471 |
|
| 472 |
/* Make sure no required signals are ignored (possibly inherited) */ |
| 473 |
_default_sigaction(SIGINT); |
| 474 |
_default_sigaction(SIGTERM); |
| 475 |
_default_sigaction(SIGHUP); |
| 476 |
_default_sigaction(SIGABRT); |
| 477 |
|
| 478 |
while (1) { |
| 479 |
xsignal_sigset_create(sig_array, &set); |
| 480 |
rc = sigwait(&set, &sig); |
| 481 |
if (rc == EINTR) |
| 482 |
continue; |
| 483 |
switch (sig) { |
| 484 |
case SIGHUP: /* kill -1 */ |
| 485 |
info("Reconfigure signal (SIGHUP) received"); |
| 486 |
reconfig(); |
| 487 |
break; |
| 488 |
case SIGINT: /* kill -2 or <CTRL-C> */ |
| 489 |
case SIGTERM: /* kill -15 */ |
| 490 |
info("Terminate signal (SIGINT or SIGTERM) received"); |
| 491 |
shutdown_threads(); |
| 492 |
return NULL; /* Normal termination */ |
| 493 |
case SIGABRT: /* abort */ |
| 494 |
info("SIGABRT received"); |
| 495 |
abort(); /* Should terminate here */ |
| 496 |
shutdown_threads(); |
| 497 |
return NULL; |
| 498 |
default: |
| 499 |
error("Invalid signal (%d) received", sig); |
| 500 |
} |
| 501 |
} |
| 502 |
|
| 503 |
} |
| 504 |
|
| 505 |
int main(int argc, char **argv) { |
| 506 |
pthread_t processing_thread, signal_handler_thread; |
| 507 |
pthread_attr_t thread_attr; |
| 508 |
|
| 509 |
_parse_commandline(argc, argv); |
| 510 |
|
| 511 |
log_init(argv[0], log_opts, LOG_DAEMON, NULL); |
| 512 |
reconfig(); |
| 513 |
slurmsmwd_print_config(); |
| 514 |
|
| 515 |
if (!foreground) { |
| 516 |
daemon(0, 0); |
| 517 |
} |
| 518 |
if (create_pidfile("/var/run/slurmsmwd.pid", 0) < 0) |
| 519 |
fatal("Unable to create pidfile /var/run/slurmswmd.pid"); |
| 520 |
|
| 521 |
slurm_mutex_init(&down_node_lock); |
| 522 |
|
| 523 |
/* Create attached thread for signal handling */ |
| 524 |
if (xsignal_block(_sigarray) < 0) |
| 525 |
error("Unable to block signals"); |
| 526 |
slurm_attr_init(&thread_attr); |
| 527 |
if (pthread_create(&signal_handler_thread, &thread_attr, |
| 528 |
_signal_handler, NULL)) |
| 529 |
fatal("pthread_create %m"); |
| 530 |
slurm_attr_destroy(&thread_attr); |
| 531 |
|
| 532 |
slurm_attr_init(&thread_attr); |
| 533 |
if (pthread_create(&processing_thread, &thread_attr, |
| 534 |
&process_data, NULL)) |
| 535 |
fatal("pthread_create %m"); |
| 536 |
slurm_attr_destroy(&thread_attr); |
| 537 |
|
| 538 |
while (!stop_running) { |
| 539 |
slurm_attr_init(&thread_attr); |
| 540 |
if (pthread_create(&xtc_thread, &thread_attr, |
| 541 |
&xtconsumer_listen, NULL)) |
| 542 |
fatal("pthread_create %m"); |
| 543 |
slurm_attr_destroy(&thread_attr); |
| 544 |
pthread_join(xtc_thread, NULL); |
| 545 |
} |
| 546 |
|
| 547 |
pthread_join(processing_thread, NULL); |
| 548 |
slurm_mutex_destroy(&down_node_lock); |
| 549 |
return 0; |
| 550 |
} |
| 551 |
|
| 552 |
static int _start_xtconsumer(char **xtc_argv, pid_t *pid) |
| 553 |
{ |
| 554 |
int cc, i; |
| 555 |
pid_t cpid; |
| 556 |
int pfd[2] = { -1, -1 }; |
| 557 |
|
| 558 |
if (access(xtconsumer_path, R_OK | X_OK) < 0) { |
| 559 |
error("Can not execute: %s", xtconsumer_path); |
| 560 |
return -1; |
| 561 |
} |
| 562 |
if (pipe(pfd) != 0) { |
| 563 |
error("pipe(): %s", slurm_strerror(slurm_get_errno())); |
| 564 |
return -1; |
| 565 |
} |
| 566 |
|
| 567 |
if ((cpid = fork()) == 0) { |
| 568 |
cc = sysconf(_SC_OPEN_MAX); |
| 569 |
dup2(pfd[1], STDERR_FILENO); |
| 570 |
dup2(pfd[1], STDOUT_FILENO); |
| 571 |
for (i = 0; i < cc; i++) { |
| 572 |
if ((i != STDERR_FILENO) && (i != STDOUT_FILENO)) |
| 573 |
close(i); |
| 574 |
} |
| 575 |
setpgid(0, 0); |
| 576 |
execv(xtconsumer_path, xtc_argv); |
| 577 |
error("execv(): %s", slurm_strerror(slurm_get_errno())); |
| 578 |
exit(127); |
| 579 |
} else if (cpid < 0) { |
| 580 |
close(pfd[0]); |
| 581 |
close(pfd[1]); |
| 582 |
error("fork(): %s", slurm_strerror(slurm_get_errno())); |
| 583 |
return -1; |
| 584 |
} |
| 585 |
*pid = cpid; |
| 586 |
close(pfd[1]); |
| 587 |
return pfd[0]; |
| 588 |
} |