Large jobs (32000 MPI processes, 64 MPI processes per node) are failing after an upgrade from 17.11.6 to 18.08.6, with slurmd messages such as:

error: unpackmem_xmalloc: Buffer to be unpacked is too large (14778976 > 10000000)

Comparing the old and new versions, it seems the size test done in src/common/pack.c in the functions unpackmem_ptr, unpackmem, unpackmem_xmalloc and unpackmem_malloc has changed: the compared size MAX_PACK_MEM_LEN (1024*1024*1024) has been replaced by MAX_ARRAY_LEN_LARGE (10000000). This is appropriate for arrays, but not for memory buffers, since in packmem the maximum size of a buffer to be packed is MAX_PACK_MEM_LEN.

We are currently trying this patch in src/common/pack.c:

diff -rau pack.c.18.08.6 pack.c
--- pack.c.18.08.6      2019-09-13 14:36:05.398824000 +0200
+++ pack.c      2019-09-13 14:38:55.229869000 +0200
@@ -782,7 +782,7 @@
        *size_valp = ntohl(ns);
        buffer->processed += sizeof(ns);

-       if (*size_valp > MAX_ARRAY_LEN_LARGE) {
+       if (*size_valp > MAX_PACK_MEM_LEN) {
                error("%s: Buffer to be unpacked is too large (%u > %u)",
                      __func__, *size_valp, MAX_ARRAY_LEN_LARGE);
                return SLURM_ERROR;
@@ -817,7 +817,7 @@
        *size_valp = ntohl(ns);
        buffer->processed += sizeof(ns);

-       if (*size_valp > MAX_ARRAY_LEN_LARGE) {
+       if (*size_valp > MAX_PACK_MEM_LEN) {
                error("%s: Buffer to be unpacked is too large (%u > %u)",
                      __func__, *size_valp, MAX_ARRAY_LEN_LARGE);
                return SLURM_ERROR;
@@ -852,7 +852,7 @@
        *size_valp = ntohl(ns);
        buffer->processed += sizeof(ns);

-       if (*size_valp > MAX_ARRAY_LEN_LARGE) {
+       if (*size_valp > MAX_PACK_MEM_LEN) {
                error("%s: Buffer to be unpacked is too large (%u > %u)",
                      __func__, *size_valp, MAX_ARRAY_LEN_LARGE);
                return SLURM_ERROR;
@@ -888,7 +888,7 @@
        memcpy(&ns, &buffer->head[buffer->processed], sizeof(ns));
        *size_valp = ntohl(ns);
        buffer->processed += sizeof(ns);
-       if (*size_valp > MAX_ARRAY_LEN_SMALL) {
+       if (*size_valp > MAX_PACK_MEM_LEN) {
                error("%s: Buffer to be unpacked is too large (%u > %u)",
                      __func__, *size_valp, MAX_ARRAY_LEN_SMALL);
                return SLURM_ERROR;

Regards,
Regine Gaudin
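For context on the failing check: packmem serializes a blob as a network-order uint32 length followed by the raw bytes, and the unpack side validates that length before allocating. Below is a minimal, self-contained sketch of that pattern, assuming an illustrative buffer struct and limit name (not the actual Slurm Buf type or constants):

#include <arpa/inet.h>  /* ntohl */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_UNPACK_LEN 10000000 /* stand-in for MAX_ARRAY_LEN_LARGE */

/* Illustrative stand-in for Slurm's Buf: raw bytes plus a read cursor. */
struct buf {
        const unsigned char *head;
        size_t size;
        size_t processed;
};

/* Unpack one length-prefixed blob, refusing sizes above the limit.
 * Mirrors the shape of unpackmem_xmalloc: read the uint32 prefix,
 * convert from network order, bound-check, then allocate and copy. */
static int unpack_mem(char **valp, uint32_t *size_valp, struct buf *b)
{
        uint32_t ns;

        if (b->size - b->processed < sizeof(ns))
                return -1;                      /* truncated prefix */
        memcpy(&ns, b->head + b->processed, sizeof(ns));
        *size_valp = ntohl(ns);
        b->processed += sizeof(ns);

        if (*size_valp > MAX_UNPACK_LEN) {      /* the check firing here */
                fprintf(stderr, "Buffer to be unpacked is too large (%u > %u)\n",
                        *size_valp, MAX_UNPACK_LEN);
                return -1;
        }
        if (*size_valp == 0) {
                *valp = NULL;
                return 0;
        }
        if (b->size - b->processed < *size_valp)
                return -1;                      /* truncated payload */
        *valp = malloc(*size_valp);
        if (!*valp)
                return -1;
        memcpy(*valp, b->head + b->processed, *size_valp);
        b->processed += *size_valp;
        return 0;
}

At 32000 ranks with 64 tasks per node, 14778976 bytes appears to be a legitimate payload size, which is why a well-formed message now trips the 10000000-byte limit.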
Hi Regine,

We have another related issue open for the same problem. The reason we are not simply using MAX_PACK_MEM_LEN as-is is that a malloc that large can fail. A temporary fix is just to increase MAX_ARRAY_LEN_LARGE by one order of magnitude. I am re-evaluating which is better: being consistent with MAX_PACK_MEM_LEN, or limiting the malloc to some other value. I will let you know when it is fixed. Thanks for reporting.

Besides this, your patch missed some parts (the error messages still print the old constants):

diff --git a/src/common/pack.c b/src/common/pack.c
index b995d77a0c..c92d1df757 100644
--- a/src/common/pack.c
+++ b/src/common/pack.c
@@ -860,9 +860,9 @@ int unpackmem_ptr(char **valp, uint32_t * size_valp, Buf buffer)
        *size_valp = ntohl(ns);
        buffer->processed += sizeof(ns);

-       if (*size_valp > MAX_ARRAY_LEN_LARGE) {
+       if (*size_valp > MAX_PACK_MEM_LEN) {
                error("%s: Buffer to be unpacked is too large (%u > %u)",
-                     __func__, *size_valp, MAX_ARRAY_LEN_LARGE);
+                     __func__, *size_valp, MAX_PACK_MEM_LEN);
                return SLURM_ERROR;

        } else if (*size_valp > 0) {
@@ -895,9 +895,9 @@ int unpackmem(char *valp, uint32_t * size_valp, Buf buffer)
        *size_valp = ntohl(ns);
        buffer->processed += sizeof(ns);

-       if (*size_valp > MAX_ARRAY_LEN_LARGE) {
+       if (*size_valp > MAX_PACK_MEM_LEN) {
                error("%s: Buffer to be unpacked is too large (%u > %u)",
-                     __func__, *size_valp, MAX_ARRAY_LEN_LARGE);
+                     __func__, *size_valp, MAX_PACK_MEM_LEN);
                return SLURM_ERROR;

        } else if (*size_valp > 0) {
@@ -930,9 +930,9 @@ int unpackmem_xmalloc(char **valp, uint32_t * size_valp, Buf buffer)
        *size_valp = ntohl(ns);
        buffer->processed += sizeof(ns);

-       if (*size_valp > MAX_ARRAY_LEN_LARGE) {
+       if (*size_valp > MAX_PACK_MEM_LEN) {
                error("%s: Buffer to be unpacked is too large (%u > %u)",
-                     __func__, *size_valp, MAX_ARRAY_LEN_LARGE);
+                     __func__, *size_valp, MAX_PACK_MEM_LEN);
                return SLURM_ERROR;

        } else if (*size_valp > 0) {
@@ -966,9 +966,9 @@ int unpackmem_malloc(char **valp, uint32_t * size_valp, Buf buffer)
        memcpy(&ns, &buffer->head[buffer->processed], sizeof(ns));
        *size_valp = ntohl(ns);
        buffer->processed += sizeof(ns);
-       if (*size_valp > MAX_ARRAY_LEN_SMALL) {
+       if (*size_valp > MAX_PACK_MEM_LEN) {
                error("%s: Buffer to be unpacked is too large (%u > %u)",
-                     __func__, *size_valp, MAX_ARRAY_LEN_SMALL);
+                     __func__, *size_valp, MAX_PACK_MEM_LEN);
                return SLURM_ERROR;

        } else if (*size_valp > 0) {
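One way to frame the trade-off described above: a fixed cap limits how much memory a forged length can demand, but also rejects legitimate large payloads. A hypothetical helper (not Slurm's API) sketching an alternative guard that additionally bounds the claimed size by the bytes actually received:

/* Hypothetical helper, not part of Slurm: a length prefix can never
 * legitimately exceed the bytes remaining in the received message,
 * so checking against the actual message size rejects forged huge
 * prefixes without capping genuine large payloads at a constant. */
static int claimed_size_is_plausible(uint32_t claimed, size_t bytes_remaining,
                                     uint32_t hard_cap)
{
        return (size_t)claimed <= bytes_remaining && claimed <= hard_cap;
}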
We definitely want to limit the maximum unpacked size. Someone with bad intentions could write code that sends a message to slurmctld with a huge length in the packet and try to crash the controller; the way to protect against this is to enforce the limit in the unpack functions (a concrete sketch of the threat follows below). I have a patch pending review that increases MAX_ARRAY_LEN_LARGE by one order of magnitude.

I am now marking this issue as a duplicate of a private one we already have; you should receive a notification when it is fixed. For the moment, and to be safe, you can increase MAX_ARRAY_LEN_LARGE:

-#define MAX_ARRAY_LEN_LARGE 10000000
+#define MAX_ARRAY_LEN_LARGE 100000000

*** This ticket has been marked as a duplicate of ticket 7495 ***
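As referenced above, a standalone sketch of the threat model: the 4-byte length prefix is entirely attacker-controlled, so without an unpack-side limit a tiny packet can request an enormous allocation on the controller. The packet layout and values here are illustrative, not the real Slurm protocol:

#include <arpa/inet.h>  /* htonl, ntohl */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Demonstration: a 5-byte packet whose 4-byte length prefix claims a
 * ~4 GiB payload.  A sane unpacker must reject the prefix up front. */
int main(void)
{
        unsigned char packet[5];
        uint32_t forged = htonl(0xFFFFFFF0u);   /* claimed payload size */
        uint32_t claimed;

        memcpy(packet, &forged, sizeof(forged));
        packet[4] = 'x';                        /* actual payload: 1 byte */

        memcpy(&claimed, packet, sizeof(claimed));
        claimed = ntohl(claimed);
        if (claimed > sizeof(packet) - sizeof(claimed))
                fprintf(stderr, "rejected: prefix claims %u bytes, only %zu follow\n",
                        claimed, sizeof(packet) - sizeof(claimed));
        /* Without a limit, the unpacker would attempt a ~4 GiB allocation
         * on behalf of a 5-byte message -- the DoS vector described above. */
        return 0;
}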