|
Lines 129-134
typedef struct {
Link Here
|
| 129 |
List job_list = NULL; /* job_record list */ |
129 |
List job_list = NULL; /* job_record list */ |
| 130 |
time_t last_job_update; /* time of last update to job records */ |
130 |
time_t last_job_update; /* time of last update to job records */ |
| 131 |
|
131 |
|
|
|
132 |
List purge_list = NULL; /* job files to delete */ |
| 133 |
pthread_t purge_thread; |
| 134 |
pthread_mutex_t purge_lock = PTHREAD_MUTEX_INITIALIZER; |
| 135 |
pthread_cond_t purge_cond = PTHREAD_COND_INITIALIZER; |
| 136 |
|
| 132 |
/* Local variables */ |
137 |
/* Local variables */ |
| 133 |
static int bf_min_age_reserve = 0; |
138 |
static int bf_min_age_reserve = 0; |
| 134 |
static uint32_t delay_boot = 0; |
139 |
static uint32_t delay_boot = 0; |
|
Lines 143-150
static struct job_record **job_array_hash_t = NULL;
Link Here
|
| 143 |
static bool kill_invalid_dep; |
148 |
static bool kill_invalid_dep; |
| 144 |
static time_t last_file_write_time = (time_t) 0; |
149 |
static time_t last_file_write_time = (time_t) 0; |
| 145 |
static uint32_t max_array_size = NO_VAL; |
150 |
static uint32_t max_array_size = NO_VAL; |
| 146 |
static bool purge_quit = false; |
|
|
| 147 |
static struct timeval purge_start_time = {0, 0}; |
| 148 |
static bitstr_t *requeue_exit = NULL; |
151 |
static bitstr_t *requeue_exit = NULL; |
| 149 |
static bitstr_t *requeue_exit_hold = NULL; |
152 |
static bitstr_t *requeue_exit_hold = NULL; |
| 150 |
static int select_serial = -1; |
153 |
static int select_serial = -1; |
|
Lines 165-170
static int _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
Link Here
|
| 165 |
static char *_copy_nodelist_no_dup(char *node_list); |
168 |
static char *_copy_nodelist_no_dup(char *node_list); |
| 166 |
static struct job_record *_create_job_record(int *error_code, |
169 |
static struct job_record *_create_job_record(int *error_code, |
| 167 |
uint32_t num_jobs); |
170 |
uint32_t num_jobs); |
|
|
171 |
static void _delete_job_details(struct job_record *job_entry); |
| 168 |
static void _del_batch_list_rec(void *x); |
172 |
static void _del_batch_list_rec(void *x); |
| 169 |
static void _delete_job_desc_files(uint32_t job_id); |
173 |
static void _delete_job_desc_files(uint32_t job_id); |
| 170 |
static slurmdb_qos_rec_t *_determine_and_validate_qos( |
174 |
static slurmdb_qos_rec_t *_determine_and_validate_qos( |
|
Lines 181-187
static void _job_array_comp(struct job_record *job_ptr, bool was_running);
Link Here
|
| 181 |
static int _job_create(job_desc_msg_t * job_specs, int allocate, int will_run, |
185 |
static int _job_create(job_desc_msg_t * job_specs, int allocate, int will_run, |
| 182 |
struct job_record **job_rec_ptr, uid_t submit_uid, |
186 |
struct job_record **job_rec_ptr, uid_t submit_uid, |
| 183 |
char **err_msg, uint16_t protocol_version); |
187 |
char **err_msg, uint16_t protocol_version); |
| 184 |
static void _job_purge_start(void); |
|
|
| 185 |
static void _job_timed_out(struct job_record *job_ptr); |
188 |
static void _job_timed_out(struct job_record *job_ptr); |
| 186 |
static void _kill_dependent(struct job_record *job_ptr); |
189 |
static void _kill_dependent(struct job_record *job_ptr); |
| 187 |
static void _list_delete_job(void *job_entry); |
190 |
static void _list_delete_job(void *job_entry); |
|
Lines 205-210
static void _pack_pending_job_details(struct job_details *detail_ptr,
Link Here
|
| 205 |
uint16_t protocol_version); |
208 |
uint16_t protocol_version); |
| 206 |
static bool _parse_array_tok(char *tok, bitstr_t *array_bitmap, uint32_t max); |
209 |
static bool _parse_array_tok(char *tok, bitstr_t *array_bitmap, uint32_t max); |
| 207 |
static void _purge_missing_jobs(int node_inx, time_t now); |
210 |
static void _purge_missing_jobs(int node_inx, time_t now); |
|
|
211 |
static void *_purge_thread(void *no_data); |
| 208 |
static int _read_data_array_from_file(int fd, char *file_name, char ***data, |
212 |
static int _read_data_array_from_file(int fd, char *file_name, char ***data, |
| 209 |
uint32_t * size, |
213 |
uint32_t * size, |
| 210 |
struct job_record *job_ptr); |
214 |
struct job_record *job_ptr); |
|
Lines 480-491
static struct job_record *_create_job_record(int *error_code, uint32_t num_jobs)
Link Here
|
| 480 |
|
484 |
|
| 481 |
|
485 |
|
| 482 |
/* |
486 |
/* |
| 483 |
* delete_job_details - delete a job's detail record and clear its pointer |
487 |
* _delete_job_details - delete a job's detail record and clear its pointer |
| 484 |
* this information can be deleted as soon as the job is allocated |
|
|
| 485 |
* resources and running (could need to restart batch job) |
| 486 |
* IN job_entry - pointer to job_record to clear the record of |
488 |
* IN job_entry - pointer to job_record to clear the record of |
| 487 |
*/ |
489 |
*/ |
| 488 |
void delete_job_details(struct job_record *job_entry) |
490 |
static void _delete_job_details(struct job_record *job_entry) |
| 489 |
{ |
491 |
{ |
| 490 |
int i; |
492 |
int i; |
| 491 |
|
493 |
|
|
Lines 493-500
void delete_job_details(struct job_record *job_entry)
Link Here
|
| 493 |
return; |
495 |
return; |
| 494 |
|
496 |
|
| 495 |
xassert (job_entry->details->magic == DETAILS_MAGIC); |
497 |
xassert (job_entry->details->magic == DETAILS_MAGIC); |
| 496 |
if (IS_JOB_FINISHED(job_entry)) |
498 |
|
| 497 |
_delete_job_desc_files(job_entry->job_id); |
499 |
/* |
|
|
500 |
* Queue up job to have the batch script and environment deleted. |
| 501 |
* This is handled by a separate thread to limit the amount of |
| 502 |
* time purge_old_job needs to spend holding locks. |
| 503 |
*/ |
| 504 |
if (IS_JOB_FINISHED(job_entry)) { |
| 505 |
int *job_id = xmalloc(sizeof(uint32_t)); |
| 506 |
*job_id = job_entry->job_id; |
| 507 |
list_enqueue(purge_list, job_id); |
| 508 |
} |
| 498 |
|
509 |
|
| 499 |
xfree(job_entry->details->acctg_freq); |
510 |
xfree(job_entry->details->acctg_freq); |
| 500 |
for (i=0; i<job_entry->details->argc; i++) |
511 |
for (i=0; i<job_entry->details->argc; i++) |
|
Lines 3761-3767
void dump_job_desc(job_desc_msg_t * job_specs)
Link Here
|
| 3761 |
debug3(" %s", buf); |
3772 |
debug3(" %s", buf); |
| 3762 |
} |
3773 |
} |
| 3763 |
|
3774 |
|
| 3764 |
|
|
|
| 3765 |
/* |
3775 |
/* |
| 3766 |
* init_job_conf - initialize the job configuration tables and values. |
3776 |
* init_job_conf - initialize the job configuration tables and values. |
| 3767 |
* this should be called after creating node information, but |
3777 |
* this should be called after creating node information, but |
|
Lines 3774-3785
void dump_job_desc(job_desc_msg_t * job_specs)
Link Here
|
| 3774 |
*/ |
3784 |
*/ |
| 3775 |
int init_job_conf(void) |
3785 |
int init_job_conf(void) |
| 3776 |
{ |
3786 |
{ |
|
|
3787 |
pthread_attr_t thread_attr; |
| 3788 |
|
| 3777 |
if (job_list == NULL) { |
3789 |
if (job_list == NULL) { |
| 3778 |
job_count = 0; |
3790 |
job_count = 0; |
| 3779 |
job_list = list_create(_list_delete_job); |
3791 |
job_list = list_create(_list_delete_job); |
| 3780 |
} |
3792 |
} |
| 3781 |
|
3793 |
|
| 3782 |
last_job_update = time(NULL); |
3794 |
last_job_update = time(NULL); |
|
|
3795 |
|
| 3796 |
if (!purge_list) { |
| 3797 |
purge_list = list_create(NULL); |
| 3798 |
|
| 3799 |
slurm_attr_init(&thread_attr); |
| 3800 |
if (pthread_attr_setdetachstate(&thread_attr, |
| 3801 |
PTHREAD_CREATE_DETACHED)) |
| 3802 |
error("pthread_attr_setdetachstate error %m"); |
| 3803 |
while (pthread_create(&purge_thread, &thread_attr, |
| 3804 |
_purge_thread, NULL)) { |
| 3805 |
error("pthread_create error %m"); |
| 3806 |
sleep(1); |
| 3807 |
} |
| 3808 |
slurm_attr_destroy(&thread_attr); |
| 3809 |
} |
| 3810 |
|
| 3783 |
return SLURM_SUCCESS; |
3811 |
return SLURM_SUCCESS; |
| 3784 |
} |
3812 |
} |
| 3785 |
|
3813 |
|
|
Lines 8354-8360
static void _list_delete_job(void *job_entry)
Link Here
|
| 8354 |
*job_pptr = job_ptr->job_array_next_t; |
8382 |
*job_pptr = job_ptr->job_array_next_t; |
| 8355 |
} |
8383 |
} |
| 8356 |
|
8384 |
|
| 8357 |
delete_job_details(job_ptr); |
8385 |
_delete_job_details(job_ptr); |
| 8358 |
xfree(job_ptr->account); |
8386 |
xfree(job_ptr->account); |
| 8359 |
xfree(job_ptr->admin_comment); |
8387 |
xfree(job_ptr->admin_comment); |
| 8360 |
xfree(job_ptr->alias_list); |
8388 |
xfree(job_ptr->alias_list); |
|
Lines 8435-8446
extern int list_find_job_id(void *job_entry, void *key)
Link Here
|
| 8435 |
return 0; |
8463 |
return 0; |
| 8436 |
} |
8464 |
} |
| 8437 |
|
8465 |
|
| 8438 |
static void _job_purge_start(void) |
|
|
| 8439 |
{ |
| 8440 |
purge_quit = false; |
| 8441 |
gettimeofday(&purge_start_time, NULL); |
| 8442 |
} |
| 8443 |
|
| 8444 |
/* |
8466 |
/* |
| 8445 |
* _list_find_job_old - find old entries in the job list, |
8467 |
* _list_find_job_old - find old entries in the job list, |
| 8446 |
* see common/list.h for documentation, key is ignored |
8468 |
* see common/list.h for documentation, key is ignored |
|
Lines 8451-8469
static int _list_find_job_old(void *job_entry, void *key)
Link Here
|
| 8451 |
time_t kill_age, min_age, now = time(NULL);; |
8473 |
time_t kill_age, min_age, now = time(NULL);; |
| 8452 |
struct job_record *job_ptr = (struct job_record *)job_entry; |
8474 |
struct job_record *job_ptr = (struct job_record *)job_entry; |
| 8453 |
uint16_t cleaning = 0; |
8475 |
uint16_t cleaning = 0; |
| 8454 |
struct timeval tv_now = {0, 0}; |
|
|
| 8455 |
long delta_t; |
| 8456 |
|
| 8457 |
if (purge_quit) |
| 8458 |
return 0; |
| 8459 |
gettimeofday(&tv_now, NULL); |
| 8460 |
delta_t = (tv_now.tv_sec - purge_start_time.tv_sec) * 1000000; |
| 8461 |
delta_t += tv_now.tv_usec; |
| 8462 |
delta_t -= purge_start_time.tv_usec; |
| 8463 |
if (delta_t > 1000000) { |
| 8464 |
purge_quit = true; |
| 8465 |
return 0; |
| 8466 |
} |
| 8467 |
|
8476 |
|
| 8468 |
if (IS_JOB_COMPLETING(job_ptr) && !LOTS_OF_AGENTS) { |
8477 |
if (IS_JOB_COMPLETING(job_ptr) && !LOTS_OF_AGENTS) { |
| 8469 |
kill_age = now - (slurmctld_conf.kill_wait + |
8478 |
kill_age = now - (slurmctld_conf.kill_wait + |
|
Lines 9750-9756
void purge_old_job(void)
Link Here
|
| 9750 |
{ |
9759 |
{ |
| 9751 |
ListIterator job_iterator; |
9760 |
ListIterator job_iterator; |
| 9752 |
struct job_record *job_ptr; |
9761 |
struct job_record *job_ptr; |
| 9753 |
int i; |
9762 |
int i, purge_job_count; |
|
|
9763 |
|
| 9764 |
if ((purge_job_count = list_count(purge_list))) |
| 9765 |
error("%s: job file deletion is falling behind, " |
| 9766 |
"%d left to remove", __func__, purge_job_count); |
| 9754 |
|
9767 |
|
| 9755 |
job_iterator = list_iterator_create(job_list); |
9768 |
job_iterator = list_iterator_create(job_list); |
| 9756 |
while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
9769 |
while ((job_ptr = (struct job_record *) list_next(job_iterator))) { |
|
Lines 9796-9806
void purge_old_job(void)
Link Here
|
| 9796 |
} |
9809 |
} |
| 9797 |
list_iterator_destroy(job_iterator); |
9810 |
list_iterator_destroy(job_iterator); |
| 9798 |
|
9811 |
|
| 9799 |
_job_purge_start(); |
|
|
| 9800 |
i = list_delete_all(job_list, &_list_find_job_old, ""); |
9812 |
i = list_delete_all(job_list, &_list_find_job_old, ""); |
| 9801 |
if (i) { |
9813 |
if (i) { |
| 9802 |
debug2("purge_old_job: purged %d old job records", i); |
9814 |
debug2("purge_old_job: purged %d old job records", i); |
| 9803 |
last_job_update = time(NULL); |
9815 |
last_job_update = time(NULL); |
|
|
9816 |
slurm_cond_signal(&purge_cond); |
| 9804 |
} |
9817 |
} |
| 9805 |
} |
9818 |
} |
| 9806 |
|
9819 |
|
|
Lines 9819-9824
extern int purge_job_record(uint32_t job_id)
Link Here
|
| 9819 |
return list_delete_all(job_list, &list_find_job_id, (void *) &job_id); |
9832 |
return list_delete_all(job_list, &list_find_job_id, (void *) &job_id); |
| 9820 |
} |
9833 |
} |
| 9821 |
|
9834 |
|
|
|
9835 |
/* |
| 9836 |
* _purge_thread - separate thread to remove job batch/environ files |
| 9837 |
* from the state directory. Runs async from purge_old_job() to avoid |
| 9838 |
* holding locks while the files are removed, which can cause performance |
| 9839 |
* problems under high throughput conditions. |
| 9840 |
* |
| 9841 |
* Uses the purge_cond to wakeup on demand, then works through the global |
| 9842 |
* purge_list of job_ids and removes their files. |
| 9843 |
*/ |
| 9844 |
static void *_purge_thread(void *no_data) |
| 9845 |
{ |
| 9846 |
int *job_id; |
| 9847 |
|
| 9848 |
/* |
| 9849 |
* Use the purge_list as a queue. _delete_job_details() always |
| 9850 |
* enqueues (at the end), while _purge_thread consumes off the |
| 9851 |
* front. |
| 9852 |
* |
| 9853 |
* Note that, due to the cleanup in sync_job_files on slurmctld |
| 9854 |
* restart, it is okay if the purge_list is not completely |
| 9855 |
* empty when slurmctld is stopped. |
| 9856 |
* |
| 9857 |
* There is a potential race condition if the job numbers have |
| 9858 |
* wrapped between _purge_thread removing the state files and |
| 9859 |
* get_next_job_id trying to re-assign it. This is mitigated |
| 9860 |
* by the call to _dup_job_file_test() ensuring the state |
| 9861 |
* directory has been removed. |
| 9862 |
*/ |
| 9863 |
|
| 9864 |
slurm_mutex_lock(&purge_lock); |
| 9865 |
while (true) { |
| 9866 |
slurm_cond_wait(&purge_cond, &purge_lock); |
| 9867 |
debug2("%s: starting, %d jobs to purge", __func__, |
| 9868 |
list_count(purge_list)); |
| 9869 |
|
| 9870 |
while ((job_id = list_dequeue(purge_list))) { |
| 9871 |
debug2("%s: purging files from job %d", |
| 9872 |
__func__, *job_id); |
| 9873 |
_delete_job_desc_files(*job_id); |
| 9874 |
xfree(job_id); |
| 9875 |
} |
| 9876 |
} |
| 9877 |
|
| 9878 |
slurm_mutex_unlock(&purge_lock); |
| 9879 |
return NULL; |
| 9880 |
} |
| 9881 |
|
| 9882 |
|
| 9822 |
|
9883 |
|
| 9823 |
/* |
9884 |
/* |
| 9824 |
* reset_job_bitmaps - reestablish bitmaps for existing jobs. |
9885 |
* reset_job_bitmaps - reestablish bitmaps for existing jobs. |