View | Details | Raw Unified | Return to ticket 1001 | Differences between
and this patch

Collapse All | Expand All

(-)a/slurm/slurm.h.in (+2 lines)
Lines 2624-2631 extern void slurm_free_license_info_msg PARAMS((license_info_msg_t *)); Link Here
2624
 *	JOB/STEP SIGNALING FUNCTIONS
2624
 *	JOB/STEP SIGNALING FUNCTIONS
2625
\*****************************************************************************/
2625
\*****************************************************************************/
2626
2626
2627
/* NOTE: Only 4 bits available in current data structure */
2627
#define KILL_JOB_BATCH	0x0001	/* signal batch shell only */
2628
#define KILL_JOB_BATCH	0x0001	/* signal batch shell only */
2628
#define KILL_JOB_ARRAY	0x0002	/* kill all elements of a job array */
2629
#define KILL_JOB_ARRAY	0x0002	/* kill all elements of a job array */
2630
#define KILL_STEPS_ONLY	0x0004	/* Do not signal batch script */
2629
2631
2630
/*
2632
/*
2631
 * slurm_kill_job - send the specified signal to all steps of an existing job
2633
 * slurm_kill_job - send the specified signal to all steps of an existing job
(-)a/src/slurmctld/job_mgr.c (-17 / +46 lines)
Lines 195-201 static void _reset_step_bitmaps(struct job_record *job_ptr); Link Here
195
static int  _resume_job_nodes(struct job_record *job_ptr, bool indf_susp);
195
static int  _resume_job_nodes(struct job_record *job_ptr, bool indf_susp);
196
static void _send_job_kill(struct job_record *job_ptr);
196
static void _send_job_kill(struct job_record *job_ptr);
197
static int  _set_job_id(struct job_record *job_ptr);
197
static int  _set_job_id(struct job_record *job_ptr);
198
static void _signal_batch_job(struct job_record *job_ptr, uint16_t signal);
198
static void _signal_batch_job(struct job_record *job_ptr,
199
			      uint16_t signal,
200
			      uint16_t flags);
199
static void _signal_job(struct job_record *job_ptr, int signal);
201
static void _signal_job(struct job_record *job_ptr, int signal);
200
static void _suspend_job(struct job_record *job_ptr, uint16_t op,
202
static void _suspend_job(struct job_record *job_ptr, uint16_t op,
201
			 bool indf_susp);
203
			 bool indf_susp);
Lines 3680-3687 extern int job_signal(uint32_t job_id, uint16_t signal, uint16_t flags, Link Here
3680
	}
3682
	}
3681
3683
3682
	if (IS_JOB_RUNNING(job_ptr)) {
3684
	if (IS_JOB_RUNNING(job_ptr)) {
3683
		if (signal == SIGKILL) {
3685
		if ((signal == SIGKILL)
3684
			/* No need to signal steps, deallocate kills them */
3686
		    && !(flags & KILL_STEPS_ONLY)
3687
		    && !(flags & KILL_JOB_BATCH)) {
3688
			/* No need to signal steps, deallocate kills them
3689
			 */
3685
			job_ptr->time_last_active	= now;
3690
			job_ptr->time_last_active	= now;
3686
			job_ptr->end_time		= now;
3691
			job_ptr->end_time		= now;
3687
			last_job_update			= now;
3692
			last_job_update			= now;
Lines 3689-3699 extern int job_signal(uint32_t job_id, uint16_t signal, uint16_t flags, Link Here
3689
			build_cg_bitmap(job_ptr);
3694
			build_cg_bitmap(job_ptr);
3690
			job_completion_logger(job_ptr, false);
3695
			job_completion_logger(job_ptr, false);
3691
			deallocate_nodes(job_ptr, false, false, preempt);
3696
			deallocate_nodes(job_ptr, false, false, preempt);
3692
		} else if (flags & KILL_JOB_BATCH) {//
3697
		} else if (job_ptr->batch_flag
3693
			if (job_ptr->batch_flag)
3698
			   && (flags & KILL_STEPS_ONLY
3694
				_signal_batch_job(job_ptr, signal);
3699
			       || flags & KILL_JOB_BATCH)) {
3695
			else
3700
			_signal_batch_job(job_ptr, signal, flags);
3696
				return ESLURM_JOB_SCRIPT_MISSING;
3701
		} else if ((flags & KILL_JOB_BATCH) && !job_ptr->batch_flag) {
3702
			return ESLURM_JOB_SCRIPT_MISSING;
3697
		} else {
3703
		} else {
3698
			_signal_job(job_ptr, signal);
3704
			_signal_job(job_ptr, signal);
3699
		}
3705
		}
Lines 3708-3714 extern int job_signal(uint32_t job_id, uint16_t signal, uint16_t flags, Link Here
3708
}
3714
}
3709
3715
3710
static void
3716
static void
3711
_signal_batch_job(struct job_record *job_ptr, uint16_t signal)
3717
_signal_batch_job(struct job_record *job_ptr, uint16_t signal, uint16_t flags)
3712
{
3718
{
3713
	bitoff_t i;
3719
	bitoff_t i;
3714
	kill_tasks_msg_t *kill_tasks_msg = NULL;
3720
	kill_tasks_msg_t *kill_tasks_msg = NULL;
Lines 3719-3726 _signal_batch_job(struct job_record *job_ptr, uint16_t signal) Link Here
3719
	xassert(job_ptr->batch_host);
3725
	xassert(job_ptr->batch_host);
3720
	i = bit_ffs(job_ptr->node_bitmap);
3726
	i = bit_ffs(job_ptr->node_bitmap);
3721
	if (i < 0) {
3727
	if (i < 0) {
3722
		error("_signal_batch_job JobId=%u lacks assigned nodes",
3728
		error("%s: JobId=%u lacks assigned nodes",
3723
		      job_ptr->job_id);
3729
		      __func__, job_ptr->job_id);
3730
		return;
3731
	}
3732
	if (flags > 0xf) {	/* Top 4 bits used for KILL_* flags */
3733
		error("%s: signal flags %u for job %u exceed limit",
3734
		      __func__, flags, job_ptr->job_id);
3735
		return;
3736
	}
3737
	if (signal > 0xfff) {	/* Top 4 bits used for KILL_* flags */
3738
		error("%s: signal value %u for job %u exceed limit",
3739
		      __func__, signal, job_ptr->job_id);
3724
		return;
3740
		return;
3725
	}
3741
	}
3726
3742
Lines 3741-3753 _signal_batch_job(struct job_record *job_ptr, uint16_t signal) Link Here
3741
	kill_tasks_msg = xmalloc(sizeof(kill_tasks_msg_t));
3757
	kill_tasks_msg = xmalloc(sizeof(kill_tasks_msg_t));
3742
	kill_tasks_msg->job_id      = job_ptr->job_id;
3758
	kill_tasks_msg->job_id      = job_ptr->job_id;
3743
	kill_tasks_msg->job_step_id = NO_VAL;
3759
	kill_tasks_msg->job_step_id = NO_VAL;
3744
	/* Encode the KILL_JOB_BATCH flag for
3760
3745
	 * stepd to know if it has to signal only
3761
	/* Encode the KILL_JOB_BATCH|KILL_STEPS_ONLY flags for stepd to know if
3746
	 * the batch script. The job was submitted
3762
	 * it has to signal only the batch script or only the steps.
3747
	 * using the --signal=B:sig sbatch option.
3763
	 * The job was submitted using the --signal=B:sig
3764
	 * sbatch option, or without the B prefix.
3748
	 */
3765
	 */
3749
	z = KILL_JOB_BATCH << 24;
3766
	if (flags == KILL_JOB_BATCH)
3750
	kill_tasks_msg->signal = z|signal;
3767
		z = KILL_JOB_BATCH << 24;
3768
	else if (flags == KILL_STEPS_ONLY)
3769
		z = KILL_STEPS_ONLY << 24;
3770
3771
	kill_tasks_msg->signal = z | signal;
3751
3772
3752
	agent_args->msg_args = kill_tasks_msg;
3773
	agent_args->msg_args = kill_tasks_msg;
3753
	agent_args->node_count = 1;/* slurm/477 be sure to update node_count */
3774
	agent_args->node_count = 1;/* slurm/477 be sure to update node_count */
Lines 5969-5974 void job_time_limit(void) Link Here
5969
			if ((job_ptr->warn_time) &&
5990
			if ((job_ptr->warn_time) &&
5970
			    (job_ptr->warn_time + PERIODIC_TIMEOUT + now >=
5991
			    (job_ptr->warn_time + PERIODIC_TIMEOUT + now >=
5971
			     job_ptr->end_time)) {
5992
			     job_ptr->end_time)) {
5993
5994
				/* If --signal B option was not specified,
5995
				 * signal only the steps but not the batch step.
5996
				 */
5997
				if (job_ptr->warn_flags == 0)
5998
					job_ptr->warn_flags = KILL_STEPS_ONLY;
5999
6000
5972
				debug("Warning signal %u to job %u ",
6001
				debug("Warning signal %u to job %u ",
5973
				      job_ptr->warn_signal, job_ptr->job_id);
6002
				      job_ptr->warn_signal, job_ptr->job_id);
5974
				(void) job_signal(job_ptr->job_id,
6003
				(void) job_signal(job_ptr->job_id,
(-)a/src/slurmd/slurmd/req.c (-6 / +17 lines)
Lines 2386-2391 _rpc_signal_tasks(slurm_msg_t *msg) Link Here
2386
	int               rc = SLURM_SUCCESS;
2386
	int               rc = SLURM_SUCCESS;
2387
	uid_t             req_uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
2387
	uid_t             req_uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
2388
	kill_tasks_msg_t *req = (kill_tasks_msg_t *) msg->data;
2388
	kill_tasks_msg_t *req = (kill_tasks_msg_t *) msg->data;
2389
	uint32_t flag;
2390
	uint32_t sig;
2389
2391
2390
#ifdef HAVE_XCPU
2392
#ifdef HAVE_XCPU
2391
	if (!_slurm_authorized_user(req_uid)) {
2393
	if (!_slurm_authorized_user(req_uid)) {
Lines 2394-2403 _rpc_signal_tasks(slurm_msg_t *msg) Link Here
2394
	}
2396
	}
2395
#endif
2397
#endif
2396
2398
2397
	debug("Sending signal %u to step %u.%u", req->signal, req->job_id,
2399
	flag = req->signal >> 24;
2398
	      req->job_step_id);
2400
	sig  = req->signal & 0xfff;
2399
	rc = _signal_jobstep(req->job_id, req->job_step_id, req_uid,
2401
2400
			     req->signal);
2402
	if (flag & KILL_STEPS_ONLY) {
2403
		debug("%s: sending signal %u to all steps job %u flag %u",
2404
		      __func__, sig, req->job_id, flag);
2405
		_kill_all_active_steps(req->job_id, sig, false);
2406
	} else {
2407
		debug("%s: sending signal %u to step %u.%u", __func__,
2408
		      req->signal, req->job_id, req->job_step_id);
2409
		rc = _signal_jobstep(req->job_id, req->job_step_id, req_uid,
2410
				     req->signal);
2411
	}
2401
	slurm_send_rc_msg(msg, rc);
2412
	slurm_send_rc_msg(msg, rc);
2402
}
2413
}
2403
2414
Lines 5261-5267 _rpc_forward_data(slurm_msg_t *msg) Link Here
5261
		rc = EINVAL;
5272
		rc = EINVAL;
5262
		goto done;
5273
		goto done;
5263
	}
5274
	}
5264
	
5275
5265
	/* connect to specified address */
5276
	/* connect to specified address */
5266
	fd = socket(AF_UNIX, SOCK_STREAM, 0);
5277
	fd = socket(AF_UNIX, SOCK_STREAM, 0);
5267
	if (fd < 0) {
5278
	if (fd < 0) {
Lines 5289-5295 _rpc_forward_data(slurm_msg_t *msg) Link Here
5289
	req_uid = htonl(req->len);
5300
	req_uid = htonl(req->len);
5290
	safe_write(fd, &req_uid, sizeof(uint32_t));
5301
	safe_write(fd, &req_uid, sizeof(uint32_t));
5291
	safe_write(fd, req->data, req->len);
5302
	safe_write(fd, req->data, req->len);
5292
	
5303
5293
rwfail:
5304
rwfail:
5294
done:
5305
done:
5295
	if (fd >= 0)
5306
	if (fd >= 0)

Return to ticket 1001