Memcheck crashes on the Cortex A9 Armv7 processor before executing IR-to-host emitted code because valgrind executes opcodes unsupported by this processor. The ARM core used does not have SIMD (neon) nor VFP extensions. From the arm reference manual: "Advanced SIMD and VFP are two optional extensions to ARMv7". The core I tested doesnt have those options, just the embedded FPU. After posting to users list, Julian told me to post the bug and my patch here. A debug session as shown in note (2) below shows that the problem for my board occurs in coregrind/m_dispatch/dispatch-arm-linux.S: #if defined(VGP_arm_linux) .fpu vfp .... VG_(run_innerloop): push {r0, r1, r4, r5, r6, r7, r8, r9, fp, lr} /* set FPSCR to vex-required default value */ mov r4, #0 fmxr fpscr, r4 <== illegal instruction here as reported by GDB @ line 61 The failure makes sense - the fpscr register is a SIMD/vfp extension register and fmxr (now replaced by 'vmsr') is invalid without those extensions. Following the gory details in Note 1 and Note 2 is the patch that resolves the problem by using the same SIGILL structures for ARM in place, extended to the VMSR (fmxr) instruction. Note 1) Failure with the Cortex A9: # valgrind /usr/bin/find ==753== Memcheck, a memory error detector ==753== Copyright (C) 2002-2010, and GNU GPL'd, by Julian Seward et al. ==753== Using Valgrind-3.6.1 and LibVEX; rerun with -h for copyright info ==753== Command: /usr/bin/find ==753== ==753== ==753== Process terminating with default action of signal 4 (SIGILL) ==753== Illegal opcode at address 0x38143358 ==753== at 0x4000790: ??? (in /lib/ld-2.11.1.so) ==753== ==753== HEAP SUMMARY: ==753== in use at exit: 0 bytes in 0 blocks ==753== total heap usage: 0 allocs, 0 frees, 0 bytes allocated ==753== ==753== All heap blocks were freed -- no leaks are possible ==753== ==753== For counts of detected and suppressed errors, rerun with: -v ==753== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0) Illegal instruction Comments: The application address (never actually emitted/executes) 0x4000790 is at _start in ld.so objdump -d ld-2.11.1.so .... Disassembly of section .text: 00000790 <_start>: 790: f8df a068 ldr.w sl, [pc, #104] ; 7fc <_dl_start_user+0x5e> 794: f8df 4068 ldr.w r4, [pc, #104] ; 800 <_dl_start_user+0x62> 798: 4668 mov r0, sp 79a: f000 f9fd bl b98 <_dl_start> Note 2) Isolating failure in gdb session term1> valgrind --wait-for-gdb=yes /usr/bin/find term2> handle SIGILL SIGSEGV nostop noprint >> .gdbinit gdb /usr/lib/valgrind/memcheck-ppc32-linux 231 ... (gdb) c Continuing. Program received signal SIGILL, Illegal instruction. vgPlain_run_innerloop () at m_dispatch/dispatch-arm-linux.S:61 61 m_dispatch/dispatch-arm-linux.S: No such file or directory. in m_dispatch/dispatch-arm-linux.S Comments: the location is the unimplemented opcode on the unimplemented register. Patch =============================== Don't try to cleanup the fpscr register that is not available and has no opcode support for armv7 processors that are without the SIMD and VFP extensions, since the fmxr/fmrx instructions will throw a SIGILL, illegal opcode, killing valgrind before it executes the first instruction of the application under test. Signed-off-by: Dave Lerner <dave.lerner@windriver.com> --- VEX/pub/libvex.h | 1 coregrind/m_dispatch/dispatch-arm-linux.S | 11 +++++ coregrind/m_machine.c | 19 ++++++++-- 3 files changed, 28 insertions(+), 3 deletions(-) --- a/VEX/pub/libvex.h +++ b/VEX/pub/libvex.h @@ -111,6 +111,7 @@ typedef #define VEX_HWCAPS_ARM_VFP (1<<6) /* VFP extension */ #define VEX_HWCAPS_ARM_VFP2 (1<<7) /* VFPv2 */ #define VEX_HWCAPS_ARM_VFP3 (1<<8) /* VFPv3 */ +#define VEX_HWCAPS_ARM_FPSCR (1<<9) /* FPSCR register (VFPv2,v3 or NEON) */ /* Bits 15:10 reserved for (possible) future VFP revisions */ #define VEX_HWCAPS_ARM_NEON (1<<16) /* Advanced SIMD also known as NEON */ --- a/coregrind/m_dispatch/dispatch-arm-linux.S +++ b/coregrind/m_dispatch/dispatch-arm-linux.S @@ -56,10 +56,16 @@ UWord VG_(run_innerloop) ( void* guest_s VG_(run_innerloop): push {r0, r1, r4, r5, r6, r7, r8, r9, fp, lr} + ldr r4, =VG_(machine_arm_has_fpscr) + ldr r4, [r4] + cmp r4, #0 + beq no_fpscr_setup + /* set FPSCR to vex-required default value */ mov r4, #0 fmxr fpscr, r4 +no_fpscr_setup: /* r0 (hence also [sp,#0]) holds guest_state */ /* r1 holds do_profiling */ mov r8, r0 @@ -212,6 +218,11 @@ fast_lookup_failed: the return value. */ run_innerloop_exit: + ldr r4, =VG_(machine_arm_has_fpscr) + ldr r4, [r4] + cmp r4, #0 + beq run_innerloop_exit_REALLY + /* We're leaving. Check that nobody messed with FPSCR in ways we don't expect. */ fmrx r4, fpscr --- a/coregrind/m_machine.c +++ b/coregrind/m_machine.c @@ -413,6 +413,7 @@ ULong VG_(machine_ppc64_has_VMX) = 0; #endif #if defined(VGA_arm) Int VG_(machine_arm_archlevel) = 4; +UInt VG_(machine_arm_has_fpscr) = 0; #endif /* fixs390: anything for s390x here ? */ @@ -965,7 +966,7 @@ Bool VG_(machine_get_hwcaps)( void ) vki_sigaction_fromK_t saved_sigill_act, saved_sigfpe_act; vki_sigaction_toK_t tmp_sigill_act, tmp_sigfpe_act; - volatile Bool have_VFP, have_VFP2, have_VFP3, have_NEON; + volatile Bool have_VFP, have_VFP2, have_VFP3, have_NEON, have_FPSCR; volatile Int archlevel; Int r; @@ -1025,6 +1026,14 @@ Bool VG_(machine_get_hwcaps)( void ) __asm__ __volatile__(".word 0xF2244154"); /* VMOV q2, q2 */ } + /* VFP2, VFP3, or SIMD Extension FPSCR register */ + have_FPSCR = True; + if (__builtin_setjmp(env_unsup_insn)) { + have_FPSCR = False; + } else { + __asm__ __volatile__(".word 0xEEE12C10"); /* VMSR FPSCR, r2 */ + } + /* ARM architecture level */ archlevel = 5; /* v5 will be base level */ if (archlevel < 7) { @@ -1050,11 +1059,13 @@ Bool VG_(machine_get_hwcaps)( void ) VG_(sigaction)(VKI_SIGFPE, &tmp_sigfpe_act, NULL); VG_(sigprocmask)(VKI_SIG_SETMASK, &saved_set, NULL); - VG_(debugLog)(1, "machine", "ARMv%d VFP %d VFP2 %d VFP3 %d NEON %d\n", + VG_(debugLog)(1, "machine", + "ARMv%d VFP %d VFP2 %d VFP3 %d NEON %d FPSCR %d\n", archlevel, (Int)have_VFP, (Int)have_VFP2, (Int)have_VFP3, - (Int)have_NEON); + (Int)have_NEON, (Int) have_FPSCR); VG_(machine_arm_archlevel) = archlevel; + VG_(machine_arm_has_fpscr) = have_FPSCR; va = VexArchARM; @@ -1063,6 +1074,8 @@ Bool VG_(machine_get_hwcaps)( void ) if (have_VFP2) vai.hwcaps |= VEX_HWCAPS_ARM_VFP2; if (have_VFP) vai.hwcaps |= VEX_HWCAPS_ARM_VFP; if (have_NEON) vai.hwcaps |= VEX_HWCAPS_ARM_NEON; + if (have_FPSCR) vai.hwcaps |= VEX_HWCAPS_ARM_FPSCR; + return True; }
Hmm, an ARM core with neither VFP nor Neon. Not sure it's worth the added complexity and verification hassle to support. What system/SOC does this core appear in?
(In reply to Julian Seward from comment #1) > Hmm, an ARM core with neither VFP nor Neon. Not sure it's worth > the added complexity and verification hassle to support. What > system/SOC does this core appear in? I apologize that I didn't respond to your comment sooner, either here or on the valgrind users list (http://thread.gmane.org/gmane.comp.debugging.valgrind/11511/focus=11519). I have just received another instance of the crash and a request for this patch, and note that others have seen the problem in the thread above. I had to resolve the problem for the Wind River Linux 4.3 release was for the stm spear13xx family of processors. The GCC developers told me that a gcc compile time test was NOT appropriate, only a run-time test could establish whether or not the FPSCR register was implemented. This patch was originally written for valgrind 3.6.1 and the patch was not implemented as of val 3.9.0. Wind River carries the patch for versions 4.3 through 5.1. Is there a possibility that you can add this patch to future releases?
Created attachment 167661 [details] Support for ARMv7 w/o NEON and VFP Hi, I've cleaned up the patch a bit so it applies cleanly on latest Valgrind. I've also removed the dynamic detection of the FPSCR since it might create issues on cores with functional capabilities, instead we check if NEON or VFP is supported. BR, Markus
Wouldn't it be better to do this check at compile time?
(In reply to Paul Floyd from comment #4) > Wouldn't it be better to do this check at compile time? Interesting idea... I guess it could be done, but since there already exist dynamic detection of both VFP and NEON capabilities we can just piggyback on them and make the code less intrusive. I have seen some asymmetrical systems with multiple cores, where only one core has the NEON extension. I'll give it some testing. BR, Markus
(In reply to Paul Floyd from comment #4) > Wouldn't it be better to do this check at compile time? The reason why we need run-time detection is that openssl and other code uses run-time detection of VFP/NEON extensions. I've an AArch64 setup which runs AArch64 in kernel-space and EABI5 / AArch32 in user-space. Two different toolchains, the SoC supports both VFP and NEON and this leads to unwanted consequences. Hence we will need to follow the rest of the code base to guarantee the code will work as expected. BR, Markus
Comment on attachment 167661 [details] Support for ARMv7 w/o NEON and VFP From 8fbd691e7f8a79983b8dbec1ee89cdf32d6b3c41 Mon Sep 17 00:00:00 2001 From: Dave Lerner <dave.lerner@windriver.com> Date: Fri, 22 Mar 2024 17:34:15 +0100 Subject: [PATCH] Support ARMv7 w/o NEON and VFP Don't try to cleanup the fpscr register that is not available and has no opcode support for armv7 processors that are without the SIMD and VFP extensions, since the fmxr/fmrx instructions will throw a SIGILL, illegal opcode, killing valgrind before it executes the first instruction of the application under test. Make it possible to disable NEON and VFP auto-detection on ARMv7. There are asymmetrical devices where one or more cores doesn't support the NEON and VFP extensions. These devices needs a special workaround. Signed-off-by: Dave Lerner <dave.lerner@windriver.com> Co-authored-by: Markus Gothe <markus.gothe@genexis.eu> --- configure.ac | 9 +++++++++ coregrind/m_dispatch/dispatch-arm-linux.S | 11 +++++++++++ coregrind/m_machine.c | 13 ++++++++++--- coregrind/pub_core_machine.h | 1 + 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index 8279683ac..0d0a35089 100755 --- a/configure.ac +++ b/configure.ac @@ -3789,6 +3789,15 @@ CFLAGS="$save_CFLAGS" AM_CONDITIONAL(BUILD_ARMV82_DOTPROD_TESTS, test x$ac_have_armv82_dotprod_feature = xyes) +# Enable VFP and NEON auto-detection on ARM +AC_CACHE_CHECK([for VFP auto-detection on ARM], vg_cv_arm_vfp_autodetection, [vg_cv_arm_vfp_autodetection=yes]) +AC_CACHE_CHECK([for NEON auto-detection on ARM], vg_cv_arm_neon_autodetection, [vg_cv_arm_neon_autodetection=yes]) +if test "x${vg_cv_arm_vfp_autodetection}" = "xyes"; then + AC_DEFINE(ENABLE_ARM_VFP, 1, [Define to 1 if auto-detection of ARM VFP capabilities should be used.]) +fi +if test "x${vg_cv_arm_neon_autodetection}" = "xyes"; then + AC_DEFINE(ENABLE_ARM_NEON, 1, [Define to 1 if auto-detection of ARM NEON capabilities should be used.]) +fi # XXX JRS 2010 Oct 13: what is this for? For sure, we don't need this # when building the tool executables. I think we should get rid of it. diff --git a/coregrind/m_dispatch/dispatch-arm-linux.S b/coregrind/m_dispatch/dispatch-arm-linux.S index 40629017c..284df0513 100644 --- a/coregrind/m_dispatch/dispatch-arm-linux.S +++ b/coregrind/m_dispatch/dispatch-arm-linux.S @@ -65,10 +65,16 @@ VG_(disp_run_translations): order to keep the stack 8-aligned. */ push {r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + ldr r4, =VG_(machine_arm_has_fpscr) + ldr r4, [r4] + cmp r4, #0 + beq no_fpscr_setup + /* set FPSCR to vex-required default value */ mov r4, #0 fmxr fpscr, r4 +no_fpscr_setup: /* Set up the guest state pointer */ mov r8, r1 @@ -93,6 +99,11 @@ postamble: /* We're leaving. Check that nobody messed with FPSCR in ways we don't expect. */ + ldr r4, =VG_(machine_arm_has_fpscr) + ldr r4, [r4] + cmp r4, #0 + beq remove_frame + fmrx r4, fpscr bic r4, #0xF8000000 /* mask out NZCV and QC */ bic r4, #0x0000009F /* mask out IDC,IXC,UFC,OFC,DZC,IOC */ diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c index a4c2218bf..8fc82eb81 100644 --- a/coregrind/m_machine.c +++ b/coregrind/m_machine.c @@ -472,6 +472,7 @@ ULong VG_(machine_ppc64_has_VMX) = 0; #endif #if defined(VGA_arm) Int VG_(machine_arm_archlevel) = 4; +UInt VG_(machine_arm_has_fpscr) = 0; #endif @@ -1673,7 +1674,7 @@ Bool VG_(machine_get_hwcaps)( void ) tmp_sigfpe_act.sa_flags |= VKI_SA_NODEFER; tmp_sigfpe_act.ksa_handler = handler_unsup_insn; VG_(sigaction)(VKI_SIGFPE, &tmp_sigfpe_act, NULL); - +#ifdef ENABLE_ARM_VFP /* VFP insns */ have_VFP = True; if (VG_MINIMAL_SETJMP(env_unsup_insn)) { @@ -1681,11 +1682,14 @@ Bool VG_(machine_get_hwcaps)( void ) } else { __asm__ __volatile__(".word 0xEEB02B42"); /* VMOV.F64 d2, d2 */ } +#else + have_VFP = False; +#endif /* There are several generation of VFP extension but they differs very little so for now we will not distinguish them. */ have_VFP2 = have_VFP; have_VFP3 = have_VFP; - +#ifdef ENABLE_ARM_NEON /* NEON insns */ have_NEON = True; if (VG_MINIMAL_SETJMP(env_unsup_insn)) { @@ -1693,7 +1697,9 @@ Bool VG_(machine_get_hwcaps)( void ) } else { __asm__ __volatile__(".word 0xF2244154"); /* VMOV q2, q2 */ } - +#else + have_NEON = False; +#endif /* ARM architecture level */ archlevel = 5; /* v5 will be base level */ if (archlevel < 7) { @@ -1737,6 +1743,7 @@ Bool VG_(machine_get_hwcaps)( void ) (Int)have_NEON); VG_(machine_arm_archlevel) = archlevel; + VG_(machine_arm_has_fpscr) = (have_VFP || have_NEON) ? 1 : 0; va = VexArchARM; vai.endness = VexEndnessLE; diff --git a/coregrind/pub_core_machine.h b/coregrind/pub_core_machine.h index a9b7dd8b1..c8085df86 100644 --- a/coregrind/pub_core_machine.h +++ b/coregrind/pub_core_machine.h @@ -293,6 +293,7 @@ extern ULong VG_(machine_ppc64_has_VMX); #if defined(VGA_arm) extern Int VG_(machine_arm_archlevel); +extern UInt VG_(machine_arm_has_fpscr); #endif #endif // __PUB_CORE_MACHINE_H -- 2.43.2
Created attachment 167713 [details] Updated attachement Updated attachment so the end-user can override configure variables to disable the ARM NEON and VFP checks, for asymmetrical systems.
I had a look at other platforms and PPC does something similar to handle older and newer hardware. So perhaps it's not so bad. I'll try to run the perf suite to compare with and without (on an RPi 5).