Power 9 application is receiving the error Pool = TEMP, start 0x5861af28 curr 0x58adfa48 end 0x58adfa67 (size 5000000) vex: the `impossible' happened: VEX temporary storage exhausted. Increase N_{TEMPORARY,PERMANENT}_BYTES and recompile. I increased the N_TEMPORARY_BYTES and N_PERMANENT_BYTES #defines as given below. --- a/VEX/priv/main_util.c +++ b/VEX/priv/main_util.c @@ -55,10 +55,10 @@ #if defined(ENABLE_INNER) /* 5 times more memory to be on the safe side: consider each allocation is 8 bytes, and we need 16 bytes redzone before and after. */ -#define N_TEMPORARY_BYTES (5*5000000) +#define N_TEMPORARY_BYTES (5*2000000000) static Bool mempools_created = False; #else -#define N_TEMPORARY_BYTES 5000000 +#define N_TEMPORARY_BYTES 2000000000 #endif static HChar temporary[N_TEMPORARY_BYTES] __attribute__((aligned(REQ_ALIGN))); @@ -70,9 +70,9 @@ static ULong temporary_bytes_allocd_TOT = 0; #if defined(ENABLE_INNER) /* See N_TEMPORARY_BYTES */ -#define N_PERMANENT_BYTES (5*10000) +#define N_PERMANENT_BYTES (5*100000) #else -#define N_PERMANENT_BYTES 10000 +#define N_PERMANENT_BYTES 100000 Once these were increased, both workloads then hit the error: x264 [info]: profile High, level 3.1 vex: priv/host_generic_reg_alloc3.c:470 (doRegisterAllocation_v3): Assertion `instrs_in->arr_used <= 15000' failed. vex storage: T total 373013384 bytes allocated vex storage: P total 192 bytes allocated The issue appears to be the same issue seen on arm64 in bugzilla https://bugs.kde.org/show_bug.cgi?id=375839 Per this bugzilla, it was found that using the command line option --vex-guest-max-insns=2 allowed the application to run, but it does appear to run with a significant performance hit.
The first workload is a video workload. When Run with the trace-flags, the issue seems to be with the expansion of the xxperm instruction. I added vex_printf("CARLL, set dres->hint\n"); dres->hint = Dis_HintVerbose; to the code for the xxperm instruction. I verified that the hint was being set. The expansion. With the hint, we get a basic block with 32 instructions. There are four xxperm instructions in the basic block. The expansion is: 0x10076664: xxperm v38,v32,v37 CARLL, set dres->hint ------ IMark(0x10076664, 4, 0) ------ t66 = GET:V128(784) t67 = GET:V128(864) t65 = GET:V128(880) t70 = 64HLtoV128(0x0:I64,0x0:I64) t68 = 64HLtoV128(0x0:I64,0x1F:I64) t69 = 64HLtoV128(0x0:I64,0xFF:I64) t72 = ShrV128(AndV128(t67,ShlV128(t68,0x78:I8)),0x78:I8) t71 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t72,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t72,0x4:I8))))) t73 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t72),0xF:I64))),0x8:I32)) t74 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t71),ShlV128(t69,t73))),AndV128(t65,AndV128(t71,ShlV128(t69,t73)))),t73),0x78:I8) t75 = OrV128(t74,t70) t77 = ShrV128(AndV128(t67,ShlV128(t68,0x70:I8)),0x70:I8) t76 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t77,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t77,0x4:I8))))) t78 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t77),0xF:I64))),0x8:I32)) t79 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t76),ShlV128(t69,t78))),AndV128(t65,AndV128(t76,ShlV128(t69,t78)))),t78),0x70:I8) t80 = OrV128(t79,t75) t82 = ShrV128(AndV128(t67,ShlV128(t68,0x68:I8)),0x68:I8) t81 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t82,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t82,0x4:I8))))) t83 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t82),0xF:I64))),0x8:I32)) t84 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t81),ShlV128(t69,t83))),AndV128(t65,AndV128(t81,ShlV128(t69,t83)))),t83),0x68:I8) t85 = OrV128(t84,t80) t87 = ShrV128(AndV128(t67,ShlV128(t68,0x60:I8)),0x60:I8) t86 = 
64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t87,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t87,0x4:I8))))) t88 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t87),0xF:I64))),0x8:I32)) t89 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t86),ShlV128(t69,t88))),AndV128(t65,AndV128(t86,ShlV128(t69,t88)))),t88),0x60:I8) t90 = OrV128(t89,t85) t92 = ShrV128(AndV128(t67,ShlV128(t68,0x58:I8)),0x58:I8) t91 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t92,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t92,0x4:I8))))) t93 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t92),0xF:I64))),0x8:I32)) t94 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t91),ShlV128(t69,t93))),AndV128(t65,AndV128(t91,ShlV128(t69,t93)))),t93),0x58:I8) t95 = OrV128(t94,t90) t97 = ShrV128(AndV128(t67,ShlV128(t68,0x50:I8)),0x50:I8) t96 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t97,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t97,0x4:I8))))) t98 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t97),0xF:I64))),0x8:I32)) t99 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t96),ShlV128(t69,t98))),AndV128(t65,AndV128(t96,ShlV128(t69,t98)))),t98),0x50:I8) t100 = OrV128(t99,t95) t102 = ShrV128(AndV128(t67,ShlV128(t68,0x48:I8)),0x48:I8) t101 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t102,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t102,0x4:I8))))) t103 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t102),0xF:I64))),0x8:I32)) t104 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t101),ShlV128(t69,t103))),AndV128(t65,AndV128(t101,ShlV128(t69,t103)))),t103),0x48:I8) t105 = OrV128(t104,t100) t107 = ShrV128(AndV128(t67,ShlV128(t68,0x40:I8)),0x40:I8) t106 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t107,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t107,0x4:I8))))) t108 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t107),0xF:I64))),0x8:I32)) t109 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t106),ShlV128(t69,t108))),AndV128(t65,AndV128(t106,ShlV128(t69,t108)))),t108),0x40:I8) t110 = OrV128(t109,t105) 
t112 = ShrV128(AndV128(t67,ShlV128(t68,0x38:I8)),0x38:I8) t111 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t112,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t112,0x4:I8))))) t113 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t112),0xF:I64))),0x8:I32)) t114 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t111),ShlV128(t69,t113))),AndV128(t65,AndV128(t111,ShlV128(t69,t113)))),t113),0x38:I8) t115 = OrV128(t114,t110) t117 = ShrV128(AndV128(t67,ShlV128(t68,0x30:I8)),0x30:I8) t116 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t117,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t117,0x4:I8))))) t118 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t117),0xF:I64))),0x8:I32)) t119 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t116),ShlV128(t69,t118))),AndV128(t65,AndV128(t116,ShlV128(t69,t118)))),t118),0x30:I8) t120 = OrV128(t119,t115) t122 = ShrV128(AndV128(t67,ShlV128(t68,0x28:I8)),0x28:I8) t121 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t122,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t122,0x4:I8))))) t123 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t122),0xF:I64))),0x8:I32)) t124 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t121),ShlV128(t69,t123))),AndV128(t65,AndV128(t121,ShlV128(t69,t123)))),t123),0x28:I8) t125 = OrV128(t124,t120) t127 = ShrV128(AndV128(t67,ShlV128(t68,0x20:I8)),0x20:I8) t126 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t127,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t127,0x4:I8))))) t128 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t127),0xF:I64))),0x8:I32)) t129 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t126),ShlV128(t69,t128))),AndV128(t65,AndV128(t126,ShlV128(t69,t128)))),t128),0x20:I8) t130 = OrV128(t129,t125) t132 = ShrV128(AndV128(t67,ShlV128(t68,0x18:I8)),0x18:I8) t131 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t132,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t132,0x4:I8))))) t133 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t132),0xF:I64))),0x8:I32)) t134 = 
ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t131),ShlV128(t69,t133))),AndV128(t65,AndV128(t131,ShlV128(t69,t133)))),t133),0x18:I8) t135 = OrV128(t134,t130) t137 = ShrV128(AndV128(t67,ShlV128(t68,0x10:I8)),0x10:I8) t136 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t137,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t137,0x4:I8))))) t138 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t137),0xF:I64))),0x8:I32)) t139 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t136),ShlV128(t69,t138))),AndV128(t65,AndV128(t136,ShlV128(t69,t138)))),t138),0x10:I8) t140 = OrV128(t139,t135) t142 = ShrV128(AndV128(t67,ShlV128(t68,0x8:I8)),0x8:I8) t141 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t142,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t142,0x4:I8))))) t143 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t142),0xF:I64))),0x8:I32)) t144 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t141),ShlV128(t69,t143))),AndV128(t65,AndV128(t141,ShlV128(t69,t143)))),t143),0x8:I8) t145 = OrV128(t144,t140) t147 = ShrV128(AndV128(t67,ShlV128(t68,0x0:I8)),0x0:I8) t146 = 64HLtoV128(1Sto64(64to1(V128to64(ShrV128(t147,0x4:I8)))),1Sto64(64to1(V128to64(ShrV128(t147,0x4:I8))))) t148 = 32to8(Mul32(Sub32(0xF:I32,64to32(And64(V128to64(t147),0xF:I64))),0x8:I32)) t149 = ShlV128(ShrV128(OrV128(AndV128(t66,AndV128(NotV128(t146),ShlV128(t69,t148))),AndV128(t65,AndV128(t146,ShlV128(t69,t148)))),t148),0x0:I8) t150 = OrV128(t149,t145) PUT(880) = t150 PUT(1296) = 0x10076668:I64 The block has 1312 temporaries before instrumentation. With instrumentation we have 4920 temporaries. So, even with the hint, there are too many instructions in the BB.
The second workload that hits the same issue is a GCC instruction test. The test runs a whole bunch of float 128-bit instructions to test the code generation. In this case, there are several instructions: xsmulqp, xsmaddqp, xssubqp, xsdivqp, xsaddqp which all call the function generate_store_FPRF( Ity_F128, vT ); to set the condition code for the instruction. The function expands into: t125 = Or32(8Uto32(GET:I8(1342)),Shl32(8Uto32(GET:I8(1344)),0xC:I8)) t124 = And32(t125,0x3:I32) t122 = GET:F128(896) t121 = GET:F128(912) t123 = AddF128(Xor32(t124,And32(Shl32(t124,0x1:I8),0x2:I32)),t121,t122) t134 = 64HLtoV128(ReinterpF64asI64(F128HItoF64(t123)),ReinterpF64asI64(F128LOtoF64(t123))) t133 = 64to1(And64(Shr64(V128HIto64(t134),0x3F:I8),0x1:I64)) t136 = 0x7FFF000000000000:I64 t135 = 0xFFFFFFFFFFFF:I64 t137 = 0x0:I64 t126 = 32to1(And32(1Uto32(CmpEQ64(And64(V128HIto64(t134),t136),t136)),1Uto32(Not1(CmpEQ64(Or64(And64(V128HIto64(t134),t135),V128to64(t134)),t137))))) t139 = 0x7FFF000000000000:I64 t138 = 0xFFFFFFFFFFFF:I64 t140 = 0x0:I64 t127 = 32to1(And32(1Uto32(CmpEQ64(And64(V128HIto64(t134),t139),t139)),1Uto32(CmpEQ64(Or64(And64(V128HIto64(t134),t138),V128to64(t134)),t140)))) t142 = 0x7FFF000000000000:I64 t141 = 0xFFFFFFFFFFFF:I64 t143 = 0x0:I64 t132 = 32to1(And32(1Uto32(CmpEQ64(And64(V128HIto64(t134),t142),t143)),1Uto32(CmpEQ64(Or64(And64(V128HIto64(t134),t141),V128to64(t134)),t143)))) t144 = 0x7FFF000000000000:I64 t145 = 0x0:I64 t129 = 32to1(And32(1Uto32(Not1(CmpEQ64(And64(V128HIto64(t134),t144),t145))),1Uto32(Not1(CmpEQ64(And64(V128HIto64(t134),t144),t144))))) t147 = 0x7FFF000000000000:I64 t146 = 0xFFFFFFFFFFFF:I64 t148 = 0x0:I64 t128 = 32to1(And32(1Uto32(CmpEQ64(And64(V128HIto64(t134),t147),t148)),1Uto32(Not1(CmpEQ64(Or64(And64(V128HIto64(t134),t146),V128to64(t134)),t148))))) t130 = 32to1(And32(1Uto32(32to1(Not32(1Uto32(t133)))),1Uto32(1:I1))) t131 = 32to1(And32(1Uto32(t133),1Uto32(1:I1))) PUT(1344) = 
32to8(Or32(And32(0xF:I32,8Uto32(GET:I8(1344))),Shl32(And32(0x1:I32,1Uto32(32to1(Or32(1Uto32(32to1(Or32(1Uto32(t126),1Uto32(32to1(And32(1Uto32(t131)\ ,1Uto32(t128))))))),1Uto32(32to1(Or32(1Uto32(32to1(And32(1Uto32(t131),1Uto32(t132)))),1Uto32(32to1(And32(1Uto32(t130),1Uto32(t128))))))))))),0x4:I8))) PUT(1344) = 32to8(Or32(And32(0x10:I32,8Uto32(GET:I8(1344))),And32(0xF:I32,Or32(Or32(1Uto32(32to1(Or32(1Uto32(t126),1Uto32(t127)))),Shl32(1Uto32(32to1(And32(1Ut\ o32(32to1(Not32(1Uto32(t126)))),1Uto32(t132)))),0x1:I8)),Or32(Shl32(1Uto32(32to1(And32(1Uto32(32to1(Not32(1Uto32(t126)))),1Uto32(32to1(And32(1Uto32(32to1(Or32(1Uto32(32to1(O\ r32(1Uto32(32to1(And32(1Uto32(t130),1Uto32(t128)))),1Uto32(32to1(And32(1Uto32(t130),1Uto32(t129))))))),1Uto32(32to1(And32(1Uto32(t130),1Uto32(t127))))))),1Uto32(32to1(And32(\ 1Uto32(32to1(Not32(1Uto32(t132)))),1Uto32(32to1(Not32(1Uto32(t126))))))))))))),0x2:I8),Shl32(1Uto32(32to1(And32(1Uto32(32to1(Not32(1Uto32(t126)))),1Uto32(32to1(And32(1Uto32(\ 32to1(Or32(1Uto32(32to1(Or32(1Uto32(32to1(And32(1Uto32(t131),1Uto32(t128)))),1Uto32(32to1(And32(1Uto32(t131),1Uto32(t129))))))),1Uto32(32to1(And32(1Uto32(t131),1Uto32(t127))\ ))))),1Uto32(32to1(And32(1Uto32(32to1(Not32(1Uto32(t132)))),1Uto32(32to1(Not32(1Uto32(t126))))))))))))),0x3:I8)))))) PUT(784) = t123 PUT(1296) = 0x4157DB0:I64 The basic block again seems to have about 30 instructions, with 6 of the instructions having the above expansion for generate_store_FPRF(). This with the added dres->hint = Dis_HintVerbose on each of these instructions. The generate_store_FPRF() stores a condition code, which for this application is not used so I commented out the body of the function to avoid calculating the code and storing it. Once it is removed, the workload runs normally. So, we either need to get the dres->hint to limit the BB more, perhaps end the BB once it sees an instruction with the hint or perhaps use a C-code handler in place of the generate_store_FPRF() function. Other thoughts?
(In reply to Carl Love from comment #1) > to the code for the xxperm instruction. I verified that the hint was being > The block has 1312 temporaries before instrumentation. With instrumentation > we have 4920 temporaries. > > So, even with the hint, there are too many instructions in the BB. Yes. A good solution would be to (drastically) reduce the length of this translation by building it around Iop_Perm8x8 instead. Have a look at math_PSHUFB_XMM in the amd64 front end for an example of how Iop_Perm8x8 is used 4 times to do what I think is the equivalent shuffle. xxperm has the added complexity that if an index is >= 16 then the value is taken instead from xT. From a quick scan of the sources I can't see whether, in this case result[i] = xT[i] or result[i] = xT[ xB[i] ] Assuming it's the first, one way to shorten that up is like this vec16s = [16, 16 .. 16] // 16 of these mask = cmpGeU16x8( xB, vec16s ) This gives you a mask which shows, for each lane whether the result will come from xT or from xA[ xB[i & 15] ]. Then you can do the permute operation in the style of math_PSHUFB_XMM, and at the end use the normal and-or-not masking using |mask| to copy the relevant bits in from xT instead.
(In reply to Carl Love from comment #2) > perhaps use > a C-code handler in place of the generate_store_FPRF() function. That's probably your best option. Alternatively (and as a general comment), generate_store_FPRF() and its sub functions (is_NaN, .., is_Denorm, create_FPCC, etc) use Ity_I1 to do boolean functions (and, or, not). I can see why you'd want to do that -- it's conceptually clean -- but it generates huge numbers of I1 <-> I32 conversions in the IR, of the form 1Uto32(32to1(And32(1Uto32(t130),1Uto32(t129)) Those are expensive in the back end, especially 1Uto32/1Uto64. It would be less verbose to re-do all of these functions to return I32s which are either 0 or 1. Then a lot of those conversions would disappear. One other comment .. I noticed in exponent_compare(), this: /* No support for 64-bit compares in 32-bit mode, need to do upper * and lower parts using 32-bit compare operators. */ There's no intrinsic reason why you can't support CmpEQ64 in 32-bit mode. The ppc back end already has routines to compute 64 bit values into a 32-bit register pair (iselInt64Expr) and you could then implement CmpEQ64/NE64 as the x86 (32-bit) backend does.
Created attachment 108050 [details] replace body of generate_store_FPRF with C helper function. Created C helper to replace the Iop code to generate the value of the floating point condition code FPCC and the C field. The workload now runs without running out of temporary space, even without the hints.
Created attachment 108068 [details] replace body of generate_store_FPRF with C helper function. Found a couple of bugs in the previous version of the patch.
Created attachment 108069 [details] Use vperm Iop code with minor modifications to implement the xxperm and xxpermr instructions The xxperm instruction does the same permute as the vperm instruction. The difference is the xxperm instruction works on the VSX register file and the vperm works on the VR register file. The vperm instruction has three input registers and the xxperm has two input registers and then uses the output register as an additional input register. Basically, you can use the vperm instruction to emulate the xxperm with some register renaming. The attached patch just takes the existing vperm Iop code and tweaks it to do the xxperm and xxpermr instructions. This only generates a few Iops rather than the massive number of Iops generated by the previous implementation.
Created attachment 108077 [details] Use vperm Iop code with minor modifications to implement the xxperm and xxpermr instructions Fixed a bug, a little patch cleanup.
The video user test case has additional issues with missing instruction support and the DSCR register not being supported. Work on this bug also included reworking the vpermr instruction implementation as it has the same potential issue as seen with the two user applications. I forgot about this bugzilla and opened a series of 5 bugzillas where each issue/patch has its own bugzilla. The above patches are obsolete. The new patches are in the following bugzillas. Bug 385182 - PPC64 is missing support for the DSCR Bug 385183 - PPC64 missing support for xscmpeqdp, xscmpgtdp, xscmpgedp, xsmincdp instructions Bug 385207 - PPC64, generate_store_FPRF() generates too many Iops Bug 385208 - xxperm instruction exhausts temporary memory Bug 385210 - vpermr instruction could exhaust temporary memory I will close this bugzilla after the patches are committed and each of the above bugzillas is closed.
This bugzilla became the parent for a series of bugzillas for each of the issues found. The list of child bugs is: Bug 385182 - PPC64 is missing support for the DSCR Bug 385183 - PPC64 missing support for xscmpeqdp, xscmpgtdp, xscmpgedp, xsmincdp instructions Bug 385207 - PPC64, generate_store_FPRF() generates too many Iops Bug 385208 - xxperm instruction exhausts temporary memory Bug 385210 - vpermr instruction could exhaust temporary memory Bug 385334 - PPC64, fix vpermr, xxperm, xxpermr mask value These child bugs have all been fixed, patches submitted and closed. Closing this bugzilla.