halo! When trying to open too many threads in a process, pthread_create() normally returns EAGAIN. When the process is being run with valgrind, it returns EINVAL. example session: lemonlime@debian:~$ uname -a Linux debian 2.6.26-2-686 #1 SMP Sun Jun 21 04:57:38 UTC 2009 i686 GNU/Linux lemonlime@debian:~$ cat test2.c #include <stdio.h> #include <pthread.h> #define NTHREADS 500 pthread_t threads [NTHREADS]; void *func (void *__unused) { return NULL; } int main () { int i; for (i = 0; i < NTHREADS; i++) { int err = pthread_create (&threads [i], NULL, func, NULL); if (err) { printf ("*** %d: %s\n", i, strerror (err)); break; } } for (i--; i >= 0; i--) pthread_join (threads [i], NULL); return 0; } lemonlime@debian:~$ gcc -o test2 test2.c -pthread lemonlime@debian:~$ ./test2 *** 381: Resource temporarily unavailable lemonlime@debian:~$ valgrind -v ./test2 ==29487== Memcheck, a memory error detector ==29487== Copyright (C) 2002-2009, and GNU GPL'd, by Julian Seward et al. ==29487== Using Valgrind-3.5.0-Debian and LibVEX; rerun with -h for copyright info ==29487== Command: ./test2 ==29487== --29487-- Valgrind options: --29487-- -v --29487-- Contents of /proc/version: --29487-- Linux version 2.6.26-2-686 (Debian 2.6.26-17) (dannf@debian.org) (gcc version 4.1.3 20080704 (prerelease) (Debian 4.1.2-25)) #1 SMP Sun Jun 21 04:57:38 UTC 2009 --29487-- Arch and hwcaps: X86, x86-sse1-sse2 --29487-- Page sizes: currently 4096, max supported 4096 --29487-- Valgrind library directory: /usr/lib/valgrind --29487-- Reading syms from /lib/ld-2.10.1.so (0x4000000) --29487-- Reading debug info from /lib/ld-2.10.1.so .. --29487-- .. CRC mismatch (computed 6511a23d wanted f9532b0c) --29487-- object doesn't have a symbol table --29487-- Reading syms from /home/lemonlime/test2 (0x8048000) --29487-- Reading syms from /usr/lib/valgrind/memcheck-x86-linux (0x38000000) --29487-- object doesn't have a dynamic symbol table --29487-- Reading suppressions file: /usr/lib/valgrind/default.supp --29487-- Reading syms from /usr/lib/valgrind/vgpreload_core-x86-linux.so (0x4020000) --29487-- Reading syms from /usr/lib/valgrind/vgpreload_memcheck-x86-linux.so (0x4022000) --29487-- Reading syms from /lib/i686/cmov/libpthread-2.10.1.so (0x4042000) --29487-- Reading debug info from /lib/i686/cmov/libpthread-2.10.1.so .. --29487-- .. CRC mismatch (computed 3fa10c68 wanted d9815502) --29487-- Reading syms from /lib/i686/cmov/libc-2.10.1.so (0x405b000) --29487-- Reading debug info from /lib/i686/cmov/libc-2.10.1.so .. --29487-- .. CRC mismatch (computed df32c295 wanted a08f1cf2) --29487-- object doesn't have a symbol table ==29487== Conditional jump or move depends on uninitialised value(s) ==29487== at 0x400BF88: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x40033F2: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4014980: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000C7F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000856: ??? (in /lib/ld-2.10.1.so) ==29487== ==29487== Conditional jump or move depends on uninitialised value(s) ==29487== at 0x400A99C: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x40033F2: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4014980: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000C7F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000856: ??? (in /lib/ld-2.10.1.so) ==29487== ==29487== Conditional jump or move depends on uninitialised value(s) ==29487== at 0x400B83F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x40033F2: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4014980: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000C7F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000856: ??? (in /lib/ld-2.10.1.so) ==29487== ==29487== Conditional jump or move depends on uninitialised value(s) ==29487== at 0x400A66E: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x40032DC: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4014980: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000C7F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000856: ??? (in /lib/ld-2.10.1.so) ==29487== ==29487== Conditional jump or move depends on uninitialised value(s) ==29487== at 0x400A676: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x40032DC: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4014980: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000C7F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000856: ??? (in /lib/ld-2.10.1.so) ==29487== ==29487== Conditional jump or move depends on uninitialised value(s) ==29487== at 0x400A99C: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x40032DC: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4014980: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000C7F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000856: ??? (in /lib/ld-2.10.1.so) ==29487== --29487-- REDIR: 0x40ce160 (index) redirected to 0x4025890 (index) --29487-- REDIR: 0x40cfed0 (memchr) redirected to 0x4025fb0 (memchr) --29487-- REDIR: 0x40cec80 (rindex) redirected to 0x4025800 (rindex) --29487-- REDIR: 0x40ca750 (calloc) redirected to 0x4023ed7 (calloc) --29487-- REDIR: 0x40caf40 (free) redirected to 0x40247e1 (free) --29487-- REDIR: 0x40ce880 (strlen) redirected to 0x4025bb0 (strlen) --29487-- REDIR: 0x40d08e0 (memcpy) redirected to 0x4025ff0 (memcpy) --29487-- REDIR: 0x40ceaa0 (strncmp) redirected to 0x4025e20 (strncmp) --29487-- REDIR: 0x40d05c0 (stpcpy) redirected to 0x4026820 (stpcpy) --29487-- REDIR: 0x40d0430 (mempcpy) redirected to 0x4026ba0 (mempcpy) --29487-- REDIR: 0x40d32e0 (strchrnul) redirected to 0x4026b50 (strchrnul) *** 361: Invalid argument ==29487== ==29487== HEAP SUMMARY: ==29487== in use at exit: 0 bytes in 0 blocks ==29487== total heap usage: 361 allocs, 361 frees, 49,096 bytes allocated ==29487== ==29487== All heap blocks were freed -- no leaks are possible ==29487== ==29487== Use --track-origins=yes to see where uninitialised values come from ==29487== ERROR SUMMARY: 13 errors from 6 contexts (suppressed: 0 from 0) ==29487== ==29487== 1 errors in context 1 of 6: ==29487== Conditional jump or move depends on uninitialised value(s) ==29487== at 0x400A99C: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x40032DC: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4014980: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000C7F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000856: ??? (in /lib/ld-2.10.1.so) ==29487== ==29487== ==29487== 1 errors in context 2 of 6: ==29487== Conditional jump or move depends on uninitialised value(s) ==29487== at 0x400A676: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x40032DC: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4014980: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000C7F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000856: ??? (in /lib/ld-2.10.1.so) ==29487== ==29487== ==29487== 1 errors in context 3 of 6: ==29487== Conditional jump or move depends on uninitialised value(s) ==29487== at 0x400A66E: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x40032DC: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4014980: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000C7F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000856: ??? (in /lib/ld-2.10.1.so) ==29487== ==29487== ==29487== 2 errors in context 4 of 6: ==29487== Conditional jump or move depends on uninitialised value(s) ==29487== at 0x400B83F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x40033F2: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4014980: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000C7F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000856: ??? (in /lib/ld-2.10.1.so) ==29487== ==29487== ==29487== 3 errors in context 5 of 6: ==29487== Conditional jump or move depends on uninitialised value(s) ==29487== at 0x400A99C: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x40033F2: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4014980: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000C7F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000856: ??? (in /lib/ld-2.10.1.so) ==29487== ==29487== ==29487== 5 errors in context 6 of 6: ==29487== Conditional jump or move depends on uninitialised value(s) ==29487== at 0x400BF88: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x40033F2: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4014980: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000C7F: ??? (in /lib/ld-2.10.1.so) ==29487== by 0x4000856: ??? (in /lib/ld-2.10.1.so) ==29487== ==29487== ERROR SUMMARY: 13 errors from 6 contexts (suppressed: 0 from 0) To be sure that the problem is with valgrind and not in debian's distribution of it, I downloaded and compiled the latest official release (using default configuration) and the problem is still there: lemonlime@debian:~$ ./testroot/bin/valgrind -v ./test2 ==29987== Memcheck, a memory error detector ==29987== Copyright (C) 2002-2009, and GNU GPL'd, by Julian Seward et al. ==29987== Using Valgrind-3.5.0 and LibVEX; rerun with -h for copyright info ==29987== Command: ./test2 ==29987== --29987-- Valgrind options: --29987-- -v --29987-- Contents of /proc/version: --29987-- Linux version 2.6.26-2-686 (Debian 2.6.26-17) (dannf@debian.org) (gcc version 4.1.3 20080704 (prerelease) (Debian 4.1.2-25)) #1 SMP Sun Jun 21 04:57:38 UTC 2009 --29987-- Arch and hwcaps: X86, x86-sse1-sse2 --29987-- Page sizes: currently 4096, max supported 4096 --29987-- Valgrind library directory: /home/lemonlime/testroot//lib/valgrind --29987-- Reading syms from /lib/ld-2.10.1.so (0x4000000) --29987-- Reading debug info from /lib/ld-2.10.1.so .. --29987-- .. CRC mismatch (computed 6511a23d wanted f9532b0c) --29987-- object doesn't have a symbol table --29987-- Reading syms from /home/lemonlime/test2 (0x8048000) --29987-- Reading syms from /home/lemonlime/testroot/lib/valgrind/memcheck-x86-linux (0x38000000) --29987-- object doesn't have a dynamic symbol table --29987-- Reading suppressions file: /home/lemonlime/testroot//lib/valgrind/default.supp --29987-- Reading syms from /home/lemonlime/testroot/lib/valgrind/vgpreload_core-x86-linux.so (0x4020000) --29987-- Reading syms from /home/lemonlime/testroot/lib/valgrind/vgpreload_memcheck-x86-linux.so (0x4022000) --29987-- Reading syms from /lib/i686/cmov/libpthread-2.10.1.so (0x4042000) --29987-- Reading debug info from /lib/i686/cmov/libpthread-2.10.1.so .. --29987-- .. CRC mismatch (computed 3fa10c68 wanted d9815502) --29987-- Reading syms from /lib/i686/cmov/libc-2.10.1.so (0x405b000) --29987-- Reading debug info from /lib/i686/cmov/libc-2.10.1.so .. --29987-- .. CRC mismatch (computed df32c295 wanted a08f1cf2) --29987-- object doesn't have a symbol table --29987-- REDIR: 0x40ce160 (index) redirected to 0x4025890 (index) --29987-- REDIR: 0x40cfed0 (memchr) redirected to 0x4025fb0 (memchr) --29987-- REDIR: 0x40cec80 (rindex) redirected to 0x4025800 (rindex) --29987-- REDIR: 0x40ca750 (calloc) redirected to 0x4023ed7 (calloc) --29987-- REDIR: 0x40caf40 (free) redirected to 0x40247e1 (free) --29987-- REDIR: 0x40ce880 (strlen) redirected to 0x4025bb0 (strlen) --29987-- REDIR: 0x40d08e0 (memcpy) redirected to 0x4025ff0 (memcpy) --29987-- REDIR: 0x40ceaa0 (strncmp) redirected to 0x4025e20 (strncmp) --29987-- REDIR: 0x40d05c0 (stpcpy) redirected to 0x4026820 (stpcpy) --29987-- REDIR: 0x40d0430 (mempcpy) redirected to 0x4026ba0 (mempcpy) --29987-- REDIR: 0x40d32e0 (strchrnul) redirected to 0x4026b50 (strchrnul) *** 361: Invalid argument ==29987== ==29987== HEAP SUMMARY: ==29987== in use at exit: 0 bytes in 0 blocks ==29987== total heap usage: 361 allocs, 361 frees, 49,096 bytes allocated ==29987== ==29987== All heap blocks were freed -- no leaks are possible ==29987== ==29987== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 18 from 11) --29987-- --29987-- used_suppression: 18 dl-hack3-cond-1 ==29987== ==29987== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 18 from 11)
The sys_clone wrappers do some pretty serious magic in order to retain control of the child thread. I wonder if we are somehow mishandling the return value from sys_clone when it fails.
hi... I'm not sure how much it helps, but I did some more testing, and the failure seems to come from sys_mmap2, and not sys_clone. This is the output of strace of a normal run (without valgrind): (...) clone(child_stack=0xbe727494, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0xbe727bd8, {entry_number:6, base_addr:0xbe727b70, limit:1048575, seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1, seg_not_present:0, useable:1}, child_tidptr=0xbe727bd8) = 23291 mmap2(NULL, 8392704, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0xbe728000 mprotect(0xbe728000, 4096, PROT_NONE) = 0 clone(child_stack=0xbef28494, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0xbef28bd8, {entry_number:6, base_addr:0xbef28b70, limit:1048575, seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1, seg_not_present:0, useable:1}, child_tidptr=0xbef28bd8) = 23292 mmap2(NULL, 8392704, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0xbef29000 mprotect(0xbef29000, 4096, PROT_NONE) = 0 clone(child_stack=0xbf729494, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0xbf729bd8, {entry_number:6, base_addr:0xbf729b70, limit:1048575, seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1, seg_not_present:0, useable:1}, child_tidptr=0xbf729bd8) = 23293 mmap2(NULL, 8392704, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = -1 ENOMEM (Cannot allocate memory) munmap(0xbef29000, 8392704) = 0 munmap(0xbe728000, 8392704) = 0 munmap(0xbdf27000, 8392704) = 0 (...) And this is the output of valgrind --trace-syscalls=yes: (...) SYSCALL[26205,1](120) sys_clone ( 3d0f00, 0xbd753494, 0xbd753bd8, 0xbee4638c, 0xbd753bd8 ) --> [pre-success] Success(0x0:0x67c9) SYSCALL[26205,2](311) sys_set_robust_list ( 0xbd753be0, 12 )[sync] --> Success(0x0:0x0) SYSCALL[26205,2]( 1) exit( 0 ) --> [pre-success] Success(0x0:0x0) SYSCALL[26205,1](192) sys_mmap2 ( 0x0, 8392704, 3, 131106, -1, 0 ) --> [pre-success] Success(0x0:0xbd754000) SYSCALL[26205,1](125) sys_mprotect ( 0xbd754000, 4096, 0 )[sync] --> Success(0x0:0x0) SYSCALL[26205,1](120) sys_clone ( 3d0f00, 0xbdf54494, 0xbdf54bd8, 0xbee4638c, 0xbdf54bd8 ) --> [pre-success] Success(0x0:0x67ca) SYSCALL[26205,2](311) sys_set_robust_list ( 0xbdf54be0, 12 )[sync] --> Success(0x0:0x0) SYSCALL[26205,2]( 1) exit( 0 ) --> [pre-success] Success(0x0:0x0) SYSCALL[26205,1](192) sys_mmap2 ( 0x0, 8392704, 3, 131106, -1, 0 ) --> [pre-success] Success(0x0:0xbee47000) SYSCALL[26205,1](125) sys_mprotect ( 0xbee47000, 4096, 0 )[sync] --> Success(0x0:0x0) SYSCALL[26205,1](120) sys_clone ( 3d0f00, 0xbf647494, 0xbf647bd8, 0xbee4638c, 0xbf647bd8 ) --> [pre-success] Success(0x0:0x67cb) SYSCALL[26205,2](311) sys_set_robust_list ( 0xbf647be0, 12 )[sync] --> Success(0x0:0x0) SYSCALL[26205,2]( 1) exit( 0 ) --> [pre-success] Success(0x0:0x0) SYSCALL[26205,1](192) sys_mmap2 ( 0x0, 8392704, 3, 131106, -1, 0 ) --> [pre-fail] Failure(0x16) SYSCALL[26205,1]( 4) sys_write ( 1, 0x4029000, 26 ) --> [async] ... *** 361: Invalid argument SYSCALL[26205,1]( 4) ... [async] --> Success(0x0:0x1a) SYSCALL[26205,1]( 91) sys_munmap ( 0xbee47000, 8392704 )[sync] --> Success(0x0:0x0) SYSCALL[26205,1]( 91) sys_munmap ( 0xbd754000, 8392704 )[sync] --> Success(0x0:0x0) (...) In both cases, clone() succeeds but the following mmap2() fails.
hi... The problem is that ML_(generic_PRE_sys_mmap) returns EINVAL when VG_(am_get_advisory) fails. My understanding is that if VG_(am_get_advisory) fails, this means we are out of memory and therefore the caller should return ENOMEM. If I am correct, this patch should fix the problem: diff -rupN src/valgrind-3.5.0/coregrind/m_syswrap/syswrap-generic.c testsrc/valgrind-3.5.0/coregrind/m_syswrap/syswrap-generic.c --- src/valgrind-3.5.0/coregrind/m_syswrap/syswrap-generic.c 2009-08-19 16:37:42.000000000 +0300 +++ testsrc/valgrind-3.5.0/coregrind/m_syswrap/syswrap-generic.c 2009-11-28 11:53:46.000000000 +0200 @@ -1994,7 +1994,7 @@ ML_(generic_PRE_sys_mmap) ( ThreadId tid advised = VG_(am_get_advisory)( &mreq, True/*client*/, &mreq_ok ); if (!mreq_ok) { /* Our request was bounced, so we'd better fail. */ - return VG_(mk_SysRes_Error)( VKI_EINVAL ); + return VG_(mk_SysRes_Error)( VKI_ENOMEM ); } /* Otherwise we're OK (so far). Install aspacem's choice of When applying this and recompiling, the testcase above runs correctly and outputs "Resource temporarily unavailable" as it should.