|
Lines 94-100
int build_alpsc_pe_info(stepd_step_rec_t *job,
Link Here
|
| 94 |
} |
94 |
} |
| 95 |
|
95 |
|
| 96 |
// Fill in the structure |
96 |
// Fill in the structure |
| 97 |
alpsc_pe_info->totalPEs = job->ntasks; |
97 |
if (job->pack_jobid != NO_VAL) { |
|
|
98 |
alpsc_pe_info->totalPEs = job->pack_ntasks; |
| 99 |
} else { |
| 100 |
alpsc_pe_info->totalPEs = job->ntasks; |
| 101 |
} |
| 98 |
alpsc_pe_info->firstPeHere = _get_first_pe(job); |
102 |
alpsc_pe_info->firstPeHere = _get_first_pe(job); |
| 99 |
alpsc_pe_info->pesHere = job->node_tasks; |
103 |
alpsc_pe_info->pesHere = job->node_tasks; |
| 100 |
alpsc_pe_info->peDepth = job->cpus_per_task; |
104 |
alpsc_pe_info->peDepth = job->cpus_per_task; |
|
Lines 146-152
static int *_get_cmd_map(stepd_step_rec_t *job)
Link Here
|
| 146 |
int cmd_index, i, pe; |
150 |
int cmd_index, i, pe; |
| 147 |
int *cmd_map = NULL; |
151 |
int *cmd_map = NULL; |
| 148 |
|
152 |
|
| 149 |
size = job->ntasks * sizeof(int); |
153 |
int nnodes = job->nnodes; |
|
|
154 |
int ntasks = job->ntasks; |
| 155 |
char *complete_nodelist = job->msg->complete_nodelist; |
| 156 |
uint16_t *tasks_to_launch = job->msg->tasks_to_launch; |
| 157 |
if (job->pack_jobid != NO_VAL) { |
| 158 |
nnodes = job->pack_nnodes; |
| 159 |
ntasks = job->pack_ntasks; |
| 160 |
complete_nodelist = job->pack_node_list; |
| 161 |
tasks_to_launch = job->pack_task_cnts; |
| 162 |
} |
| 163 |
|
| 164 |
size = ntasks * sizeof(int); |
| 150 |
cmd_map = xmalloc(size); |
165 |
cmd_map = xmalloc(size); |
| 151 |
if (job->mpmd_set) { |
166 |
if (job->mpmd_set) { |
| 152 |
// Multiple programs, fill in from mpmd_set information |
167 |
// Multiple programs, fill in from mpmd_set information |
|
Lines 198-223
static int *_get_pe_nid_map(stepd_step_rec_t *job)
Link Here
|
| 198 |
int32_t *nodes = NULL; |
213 |
int32_t *nodes = NULL; |
| 199 |
int tasks_to_launch_sum, nid; |
214 |
int tasks_to_launch_sum, nid; |
| 200 |
|
215 |
|
| 201 |
size = job->ntasks * sizeof(int); |
216 |
int nnodes = job->nnodes; |
|
|
217 |
int ntasks = job->ntasks; |
| 218 |
char *complete_nodelist = job->msg->complete_nodelist; |
| 219 |
uint16_t *tasks_to_launch = job->msg->tasks_to_launch; |
| 220 |
if (job->pack_jobid != NO_VAL) { |
| 221 |
nnodes = job->pack_nnodes; |
| 222 |
ntasks = job->pack_ntasks; |
| 223 |
complete_nodelist = job->pack_node_list; |
| 224 |
tasks_to_launch = job->pack_task_cnts; |
| 225 |
} |
| 226 |
|
| 227 |
size = ntasks * sizeof(int); |
| 202 |
pe_nid_map = xmalloc(size); |
228 |
pe_nid_map = xmalloc(size); |
| 203 |
|
229 |
|
| 204 |
// If we have it, just copy the mpmd set information |
230 |
// If we have it, just copy the mpmd set information |
|
|
231 |
/* TODO: this is not configured for hetjob yet */ |
| 205 |
if (job->mpmd_set && job->mpmd_set->placement) { |
232 |
if (job->mpmd_set && job->mpmd_set->placement) { |
| 206 |
// mpmd_set->placement is an int * too so this works |
233 |
// mpmd_set->placement is an int * too so this works |
| 207 |
memcpy(pe_nid_map, job->mpmd_set->placement, size); |
234 |
memcpy(pe_nid_map, job->mpmd_set->placement, size); |
| 208 |
} else { |
235 |
} else { |
| 209 |
// Initialize to -1 so we can tell if we missed any |
236 |
// Initialize to -1 so we can tell if we missed any |
| 210 |
for (i = 0; i < job->ntasks; i++) { |
237 |
for (i = 0; i < ntasks; i++) { |
| 211 |
pe_nid_map[i] = -1; |
238 |
pe_nid_map[i] = -1; |
| 212 |
} |
239 |
} |
| 213 |
|
240 |
|
| 214 |
// Convert the node list to an array of nids |
241 |
// Convert the node list to an array of nids |
| 215 |
rc = list_str_to_array(job->msg->complete_nodelist, &cnt, |
242 |
rc = list_str_to_array(complete_nodelist, &cnt, |
| 216 |
&nodes); |
243 |
&nodes); |
| 217 |
if (rc < 0) { |
244 |
if (rc < 0) { |
| 218 |
xfree(pe_nid_map); |
245 |
xfree(pe_nid_map); |
| 219 |
return NULL; |
246 |
return NULL; |
| 220 |
} else if (job->nnodes != cnt) { |
247 |
} else if (nnodes != cnt) { |
| 221 |
CRAY_ERR("list_str_to_array cnt %d expected %u", |
248 |
CRAY_ERR("list_str_to_array cnt %d expected %u", |
| 222 |
cnt, job->nnodes); |
249 |
cnt, job->nnodes); |
| 223 |
xfree(pe_nid_map); |
250 |
xfree(pe_nid_map); |
|
Lines 227-247
static int *_get_pe_nid_map(stepd_step_rec_t *job)
Link Here
|
| 227 |
|
254 |
|
| 228 |
// Search the task id map for the values we need |
255 |
// Search the task id map for the values we need |
| 229 |
tasks_to_launch_sum = 0; |
256 |
tasks_to_launch_sum = 0; |
| 230 |
for (i = 0; i < job->nnodes; i++) { |
257 |
for (i = 0; i < nnodes; i++) { |
| 231 |
tasks_to_launch_sum += job->msg->tasks_to_launch[i]; |
258 |
tasks_to_launch_sum += tasks_to_launch[i]; |
| 232 |
for (j = 0; j < job->msg->tasks_to_launch[i]; j++) { |
259 |
info("DMJ: tasks_to_launch[%d]: %d, tasks_to_launch_sum: %d", i, tasks_to_launch[i], tasks_to_launch_sum); |
| 233 |
task = job->msg->global_task_ids[i][j]; |
260 |
for (j = 0; j < tasks_to_launch[i]; j++) { |
|
|
261 |
/* inappropriate hack */ |
| 262 |
//task = job->msg->global_task_ids[i][j]; |
| 263 |
task = i * (nnodes - 1) + j; |
| 234 |
pe_nid_map[task] = nodes[i]; |
264 |
pe_nid_map[task] = nodes[i]; |
|
|
265 |
info("DMJ: setting pe_nid_map[%d] = %d, i=%d, j=%d, nnodes=%d", task, nodes[i], i, j, nnodes); |
| 235 |
} |
266 |
} |
| 236 |
} |
267 |
} |
| 237 |
|
268 |
|
| 238 |
// If this is LAM/MPI only one task per node is launched, |
269 |
// If this is LAM/MPI only one task per node is launched, |
| 239 |
// NOT job->ntasks. So fill in the rest of the tasks |
270 |
// NOT job->ntasks. So fill in the rest of the tasks |
| 240 |
// assuming a block distribution |
271 |
// assuming a block distribution |
| 241 |
if (tasks_to_launch_sum == job->nnodes |
272 |
if (tasks_to_launch_sum == nnodes |
| 242 |
&& job->nnodes < job->ntasks) { |
273 |
&& nnodes < ntasks) { |
| 243 |
nid = nodes[0]; // failsafe value |
274 |
nid = nodes[0]; // failsafe value |
| 244 |
for (i = 0; i < job->ntasks; i++) { |
275 |
for (i = 0; i < ntasks; i++) { |
| 245 |
if (pe_nid_map[i] > -1) { |
276 |
if (pe_nid_map[i] > -1) { |
| 246 |
nid = pe_nid_map[i]; |
277 |
nid = pe_nid_map[i]; |
| 247 |
} else { |
278 |
} else { |
|
Lines 252-258
static int *_get_pe_nid_map(stepd_step_rec_t *job)
Link Here
|
| 252 |
xfree(nodes); |
283 |
xfree(nodes); |
| 253 |
|
284 |
|
| 254 |
// Make sure we didn't miss any tasks |
285 |
// Make sure we didn't miss any tasks |
| 255 |
for (i = 0; i < job->ntasks; i++) { |
286 |
for (i = 0; i < ntasks; i++) { |
| 256 |
if (pe_nid_map[i] == -1) { |
287 |
if (pe_nid_map[i] == -1) { |
| 257 |
CRAY_ERR("No NID for PE index %d", i); |
288 |
CRAY_ERR("No NID for PE index %d", i); |
| 258 |
xfree(pe_nid_map); |
289 |
xfree(pe_nid_map); |
|
Lines 271-279
static int *_get_node_cpu_map(stepd_step_rec_t *job)
Link Here
|
| 271 |
int *node_cpu_map; |
302 |
int *node_cpu_map; |
| 272 |
int nodeid; |
303 |
int nodeid; |
| 273 |
|
304 |
|
|
|
305 |
int nnodes = job->nnodes; |
| 306 |
int ntasks = job->ntasks; |
| 307 |
char *complete_nodelist = job->msg->complete_nodelist; |
| 308 |
uint16_t *tasks_to_launch = job->msg->tasks_to_launch; |
| 309 |
if (job->pack_jobid != NO_VAL) { |
| 310 |
nnodes = job->pack_nnodes; |
| 311 |
ntasks = job->pack_ntasks; |
| 312 |
complete_nodelist = job->pack_node_list; |
| 313 |
tasks_to_launch = job->pack_task_cnts; |
| 314 |
} |
| 315 |
|
| 274 |
node_cpu_map = xmalloc(job->nnodes * sizeof(int)); |
316 |
node_cpu_map = xmalloc(job->nnodes * sizeof(int)); |
| 275 |
for (nodeid = 0; nodeid < job->nnodes; nodeid++) { |
317 |
for (nodeid = 0; nodeid < nnodes; nodeid++) { |
| 276 |
node_cpu_map[nodeid] = (job->msg->tasks_to_launch[nodeid] |
318 |
node_cpu_map[nodeid] = (tasks_to_launch[nodeid] |
| 277 |
* job->cpus_per_task); |
319 |
* job->cpus_per_task); |
| 278 |
} |
320 |
} |
| 279 |
|
321 |
|