Hi guys,
I've noticed that emscripten does really weird stuff with constants and SIMD. Particularly, if you look at the code here: you'll notice that it's splatting a lane, and then replacing each lane with appropriate values. It's crazy.
The sample code is below. The full description with commentary is on stackoverflow.
// 16-bit multipliers, one per byte of the widened input (eight lanes per vector).
// Loaded into `multiplier` below and applied with _mm_mullo_epi16.
// NOTE(review): lane 5 is 160 while lane 1 is 150 — looks like a typo in the
// sample; confirm which value is intended.
alignas(64) const static uint16_t multiplierArray[8]= {77,150,29,1,77,160,29,1};
// Sample from the report. Walks the buffer in 64-byte groups (four 16-byte
// vectors), reduces each group to a single 16-byte vector and stores the result
// back into the same buffer. Presumably an RGBA -> luma (Y) conversion, judging
// by the name — TODO confirm. This abbreviated listing always returns 0.
extern "C"
int rgba2y(void* inputDataBuffer, ptrdiff_t length) {
typedef __u8x16 v8x16;
typedef __u16x8 v16x8;
v8x16* pInputPtr = (v8x16*) inputDataBuffer;
// End pointer; not used in the visible (abbreviated) part of the function.
v8x16* pInputPtrEnd = (v8x16*)((uint8_t*)inputDataBuffer + length);
// Output aliases the input: results are written in place.
v8x16* pOutputPtr = (v8x16*) inputDataBuffer;
// 0x80 in each of the four low bytes; the remaining bytes are zero.
__m128i rounder = _mm_cvtsi32_si128(0x80808080);
// NOTE(review): `zero` is read by `^=` before it has been given a value; the
// intent appears to be an all-zero vector — consider explicit initialization.
v8x16 zero;
zero ^= zero;
// Load the multiplier table as one 128-bit vector.
__m128i multiplier = *((__m128i*)multiplierArray);
// v16x8 multiplier = wasm_i64x2_splat(0x1001d0096004d);
unsigned i = 0;
// NOTE(review): `length` is a signed ptrdiff_t compared against an unsigned
// size_t expression, so a negative length converts to a large unsigned value —
// confirm callers never pass one.
// NOTE(review): i advances by 4 while the loads index i/4 .. i/4+3, so
// consecutive iterations read overlapping vectors — confirm whether
// i .. i+3 was intended.
for (; (i+4)*sizeof(__m128i)<= length; i+= 4) {
// Keep bytes 0..2 of every 4-byte group and replace the fourth byte with
// byte 0 of `rounder` (shuffle index 16), i.e. 0x80.
v8x16 iv0 = wasm_v8x16_shuffle(pInputPtr[i/4],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
v8x16 iv1 = wasm_v8x16_shuffle(pInputPtr[i/4+1],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
v8x16 iv2 = wasm_v8x16_shuffle(pInputPtr[i/4+2],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
v8x16 iv3 = wasm_v8x16_shuffle(pInputPtr[i/4+3],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
// rg ba rg ba rg ba rg ba rg ba rg ba rg ba
// Widen the low and high eight bytes to 16-bit lanes (interleave with zero),
// multiply each half by the table, then horizontally add adjacent lanes.
__m128i rg0 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv0, (__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv0,(__m128i)zero),(__m128i)multiplier));
__m128i rg1 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv1,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv1,(__m128i)zero),(__m128i)multiplier));
__m128i rg2 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv2,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv2,(__m128i)zero),(__m128i)multiplier));
__m128i rg3 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv3,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv3,(__m128i)zero),(__m128i)multiplier));
// rgba rgba rgba rgba rgba rgba rgba rgba
// Second horizontal add, then an unsigned shift right by 8 on each 16-bit lane.
__m128i rgba0 = wasm_u16x8_shr(_mm_hadd_epi16(rg0,rg1), 8);
__m128i rgba1 = wasm_u16x8_shr(_mm_hadd_epi16(rg2,rg3), 8);
// Narrow the two 16-bit vectors into one byte vector and store it in place.
pOutputPtr[i/4] = wasm_u8x16_narrow_i16x8(rgba0,rgba1);
}
// abbreviated...
return 0;
}
.section .text.rgba2y,"",@
.hidden rgba2y # -- Begin function rgba2y
.globl rgba2y
.type rgba2y,@function
rgba2y: # @rgba2y
.Lfunc_begin0:
.loc 2 56 0 # rgb2y-sample.cpp:56:0
.functype rgba2y (i32, i32) -> (i32)
.local i32, i32, v128, v128, v128, v128, v128, v128
# %bb.0: # %entry
#DEBUG_VALUE: rgba2y:length <- %4
#DEBUG_VALUE: rgba2y:pInputPtrEnd <- undef
#DEBUG_VALUE: rgba2y:i <- 0
#DEBUG_VALUE: rgba2y:inputDataBuffer <- %3
#DEBUG_VALUE: rgba2y:pInputPtr <- %3
#DEBUG_VALUE: rgba2y:pOutputPtr <- %3
#DEBUG_VALUE: rgba2y:rounder <- undef
#DEBUG_VALUE: rgba2y:zero <- undef
#DEBUG_VALUE: rgba2y:multiplier <- undef
block
.Ltmp0:
.loc 2 68 30 prologue_end # rgb2y-sample.cpp:68:30
local.get 1
i32.const 64
i32.lt_u
.Ltmp1:
.loc 2 68 2 is_stmt 0 # rgb2y-sample.cpp:68:2
br_if 0 # 0: down to label0
.Ltmp2:
# %bb.1:
.loc 2 0 2 # rgb2y-sample.cpp:0:2
i32.const 0
local.set 2
i32.const 4
local.set 3
.LBB0_2: # %for.body
# =>This Inner Loop Header: Depth=1
loop # label1:
.Ltmp3:
#DEBUG_VALUE: rgba2y:i <- %101
#DEBUG_VALUE: rgba0 <- undef
#DEBUG_VALUE: rgba1 <- undef
.loc 2 69 15 is_stmt 1 # rgb2y-sample.cpp:69:15
local.get 0
local.get 2
i32.const 2
i32.shl
i32.add
local.tee 2
local.get 2
v128.load 0
i32.const 0
i8x16.splat
local.tee 4
i32.const -128
i8x16.replace_lane 0
i32.const -128
i8x16.replace_lane 1
i32.const -128
i8x16.replace_lane 2
i32.const -128
i8x16.replace_lane 3
local.tee 5
v8x16.shuffle 0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp4:
.loc 2 74 48 # rgb2y-sample.cpp:74:48
local.tee 6
.Ltmp5:
#DEBUG_VALUE: iv0 <- undef
#DEBUG_VALUE: iv0 <- %153
local.get 4
v8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
i32.const 77
.loc 2 74 32 is_stmt 0 # rgb2y-sample.cpp:74:32
i16x8.splat
i32.const 150
i16x8.replace_lane 1
i32.const 29
i16x8.replace_lane 2
i32.const 1
i16x8.replace_lane 3
i32.const 160
i16x8.replace_lane 5
i32.const 29
i16x8.replace_lane 6
i32.const 1
i16x8.replace_lane 7
local.tee 7
i16x8.mul
.loc 2 74 133 # rgb2y-sample.cpp:74:133
local.tee 8
local.get 6
local.get 4
v8x16.shuffle 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
.loc 2 74 117 # rgb2y-sample.cpp:74:117
local.get 7
i16x8.mul
.loc 2 74 17 # rgb2y-sample.cpp:74:17
local.tee 6
v8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get 8
local.get 6
v8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
.Ltmp6:
.loc 2 0 17 # rgb2y-sample.cpp:0:17
local.tee 6
.Ltmp7:
#DEBUG_VALUE: rg0 <- undef
#DEBUG_VALUE: rg0 <- %153
.loc 2 70 15 is_stmt 1 # rgb2y-sample.cpp:70:15
local.get 2
i32.const 16
i32.add
v128.load 0
local.get 5
v8x16.shuffle 0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp8:
.loc 2 75 62 # rgb2y-sample.cpp:75:62
local.tee 8
.Ltmp9:
#DEBUG_VALUE: iv1 <- undef
#DEBUG_VALUE: iv1 <- %157
local.get 4
v8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
.loc 2 75 46 is_stmt 0 # rgb2y-sample.cpp:75:46
local.get 7
i16x8.mul
.loc 2 75 146 # rgb2y-sample.cpp:75:146
local.tee 9
local.get 8
local.get 4
v8x16.shuffle 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
.loc 2 75 130 # rgb2y-sample.cpp:75:130
local.get 7
i16x8.mul
.loc 2 75 31 # rgb2y-sample.cpp:75:31
local.tee 8
v8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get 9
local.get 8
v8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
.Ltmp10:
.loc 2 79 33 is_stmt 1 # rgb2y-sample.cpp:79:33
local.tee 8
.Ltmp11:
#DEBUG_VALUE: rg1 <- undef
#DEBUG_VALUE: rg1 <- %157
v8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get 6
local.get 8
v8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
i32.const 8
.loc 2 79 18 is_stmt 0 # rgb2y-sample.cpp:79:18
i16x8.shr_u
.loc 2 71 15 is_stmt 1 # rgb2y-sample.cpp:71:15
local.get 2
i32.const 32
i32.add
v128.load 0
local.get 5
v8x16.shuffle 0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp12:
.loc 2 76 62 # rgb2y-sample.cpp:76:62
local.tee 6
.Ltmp13:
#DEBUG_VALUE: iv2 <- undef
#DEBUG_VALUE: iv2 <- %153
local.get 4
v8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
.loc 2 76 46 is_stmt 0 # rgb2y-sample.cpp:76:46
local.get 7
i16x8.mul
.loc 2 76 146 # rgb2y-sample.cpp:76:146
local.tee 8
local.get 6
local.get 4
v8x16.shuffle 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
.loc 2 76 130 # rgb2y-sample.cpp:76:130
local.get 7
i16x8.mul
.loc 2 76 31 # rgb2y-sample.cpp:76:31
local.tee 6
v8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get 8
local.get 6
v8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
.Ltmp14:
.loc 2 0 31 # rgb2y-sample.cpp:0:31
local.tee 6
.Ltmp15:
#DEBUG_VALUE: rg2 <- undef
#DEBUG_VALUE: rg2 <- %153
.loc 2 72 15 is_stmt 1 # rgb2y-sample.cpp:72:15
local.get 2
i32.const 48
i32.add
v128.load 0
local.get 5
v8x16.shuffle 0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp16:
.loc 2 77 62 # rgb2y-sample.cpp:77:62
local.tee 5
.Ltmp17:
#DEBUG_VALUE: iv3 <- undef
#DEBUG_VALUE: iv3 <- %98
local.get 4
v8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
.loc 2 77 46 is_stmt 0 # rgb2y-sample.cpp:77:46
local.get 7
i16x8.mul
.loc 2 77 146 # rgb2y-sample.cpp:77:146
local.tee 8
local.get 5
local.get 4
v8x16.shuffle 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
.loc 2 77 130 # rgb2y-sample.cpp:77:130
local.get 7
i16x8.mul
.loc 2 77 31 # rgb2y-sample.cpp:77:31
local.tee 4
v8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get 8
local.get 4
v8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
.Ltmp18:
.loc 2 80 33 is_stmt 1 # rgb2y-sample.cpp:80:33
local.tee 4
.Ltmp19:
#DEBUG_VALUE: rg3 <- undef
#DEBUG_VALUE: rg3 <- %93
v8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get 6
local.get 4
v8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
i32.const 8
.loc 2 80 18 is_stmt 0 # rgb2y-sample.cpp:80:18
i16x8.shr_u
.loc 2 81 21 is_stmt 1 # rgb2y-sample.cpp:81:21
i8x16.narrow_i16x8_u
.loc 2 81 19 is_stmt 0 # rgb2y-sample.cpp:81:19
v128.store 0
.Ltmp20:
#DEBUG_VALUE: rgba2y:i <- %170
.loc 2 0 19 # rgb2y-sample.cpp:0:19
local.get 3
local.tee 3
local.set 2
.Ltmp21:
.loc 2 68 11 is_stmt 1 # rgb2y-sample.cpp:68:11
local.get 3
i32.const 4
i32.add
local.tee 3
i32.const 4
.loc 2 68 14 is_stmt 0 # rgb2y-sample.cpp:68:14
i32.shl
.loc 2 68 30 # rgb2y-sample.cpp:68:30
local.get 1
i32.le_u
.Ltmp22:
.loc 2 68 2 # rgb2y-sample.cpp:68:2
br_if 0 # 0: up to label1
.Ltmp23:
.LBB0_3: # %for.end
end_loop
end_block # label0:
i32.const 0
.Ltmp24:
.loc 2 84 2 is_stmt 1 # rgb2y-sample.cpp:84:2
# fallthrough-return
end_function
.Ltmp25:
.Lfunc_end0:
.size rgba2y, .Lfunc_end0-rgba2y
md5-0fc10999e8a5de4fc0bc444109b7b851
--- WebAssembly code ---
index: 2
kind: wasm function
compiler: TurboFan
Body (size = 1088 = 1086 + 2 padding)
Instructions (size = 1064)
0xa5976359180 0 55 push rbp
0xa5976359181 1 4889e5 REX.W movq rbp,rsp
0xa5976359184 4 6a0a push 0xa
0xa5976359186 6 56 push rsi
0xa5976359187 7 4883ec58 REX.W subq rsp,0x58
0xa597635918b b 488b5e17 REX.W movq rbx,[rsi+0x17]
0xa597635918f f 83fa40 cmpl rdx,0x40
0xa5976359192 12 0f8307000000 jnc 0xa597635919f <+0x1f>
0xa5976359198 18 33c9 xorl rcx,rcx
0xa597635919a 1a e990030000 jmp 0xa597635952f <+0x3af>
0xa597635919f 1f b94d000000 movl rcx,0x4d
0xa59763591a4 24 c5f96ec1 vmovd xmm0,rcx
0xa59763591a8 28 c5fb70c000 vpshuflw xmm0,xmm0,0x0
0xa59763591ad 2d c5f970c000 vpshufd xmm0,xmm0,0x0
0xa59763591b2 32 33c9 xorl rcx,rcx
0xa59763591b4 34 c5f96ec9 vmovd xmm1,rcx
0xa59763591b8 38 c4410057ff vxorps xmm15,xmm15,xmm15
0xa59763591bd 3d c4c27100cf vpshufb xmm1,xmm1,xmm15
0xa59763591c2 42 bf96000000 movl rdi,0x96
0xa59763591c7 47 c5f9c4c701 vpinsrw xmm0,xmm0,rdi,0x1
0xa59763591cc 4c bf80ffffff movl rdi,0xffffff80
0xa59763591d1 51 c5f928d1 vmovapd xmm2,xmm1
0xa59763591d5 55 c4e36920d700 vpinsrb xmm2,xmm2,dil,0x0
0xa59763591db 5b 41b81d000000 movl r8,0x1d
0xa59763591e1 61 c4c179c4c002 vpinsrw xmm0,xmm0,r8,0x2
0xa59763591e7 67 c4e36920d701 vpinsrb xmm2,xmm2,dil,0x1
0xa59763591ed 6d 41b901000000 movl r9,0x1
0xa59763591f3 73 c4c179c4c103 vpinsrw xmm0,xmm0,r9,0x3
0xa59763591f9 79 c4e36920d702 vpinsrb xmm2,xmm2,dil,0x2
0xa59763591ff 7f 41bba0000000 movl r11,0xa0
0xa5976359205 85 c4c179c4c305 vpinsrw xmm0,xmm0,r11,0x5
0xa597635920b 8b c4e36920d703 vpinsrb xmm2,xmm2,dil,0x3
0xa5976359211 91 c4c179c4c006 vpinsrw xmm0,xmm0,r8,0x6
0xa5976359217 97 c4c179c4c107 vpinsrw xmm0,xmm0,r9,0x7
0xa597635921d 9d 488bf9 REX.W movq rdi,rcx
0xa5976359220 a0 41b804000000 movl r8,0x4
0xa5976359226 a6 e90b000000 jmp 0xa5976359236 <+0xb6>
0xa597635922b ab 0f1f440000 nop
0xa5976359230 b0 498bf8 REX.W movq rdi,r8
0xa5976359233 b3 4d8bc1 REX.W movq r8,r9
0xa5976359236 b6 4c8b4e2f REX.W movq r9,[rsi+0x2f]
0xa597635923a ba 493b21 REX.W cmpq rsp,[r9]
0xa597635923d bd 0f86f4020000 jna 0xa5976359537 <+0x3b7>
0xa5976359243 c3 458d4804 leal r9,[r8+0x4]
0xa5976359247 c7 4d8bd9 REX.W movq r11,r9
0xa597635924a ca 41c1e304 shll r11, 4
0xa597635924e ce 8d3cb8 leal rdi,[rax+rdi*4]
0xa5976359251 d1 c5fa6f1c3b vmovdqu xmm3,[rbx+rdi*1]
0xa5976359256 d6 c5fa6f641f10 vmovdqu xmm4,[rdi+rbx*1+0x10]
0xa597635925c dc c5fa6f6c1f20 vmovdqu xmm5,[rdi+rbx*1+0x20]
0xa5976359262 e2 c5fa6f741f30 vmovdqu xmm6,[rdi+rbx*1+0x30]
0xa5976359268 e8 c57810fe vmovups xmm15,xmm6
0xa597635926c ec 49ba0001028004050680 REX.W movq r10,0x8006050480020100
0xa5976359276 f6 c441f96ec2 vmovq xmm8,r10
0xa597635927b fb 49ba08090a800c0d0e80 REX.W movq r10,0x800e0d0c800a0908
0xa5976359285 105 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa597635928b 10b c4420100f8 vpshufb xmm15,xmm15,xmm8
0xa5976359290 110 0f10fa movups xmm7,xmm2
0xa5976359293 113 49ba8080800080808000 REX.W movq r10,0x80808000808080
0xa597635929d 11d c441f96ec2 vmovq xmm8,r10
0xa59763592a2 122 4c8b15ecffffff REX.W movq r10,[rip+0xffffffec]
0xa59763592a9 129 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa59763592af 12f c4c24100f8 vpshufb xmm7,xmm7,xmm8
0xa59763592b4 134 c4c141ebff vpor xmm7,xmm7,xmm15
0xa59763592b9 139 c57810fd vmovups xmm15,xmm5
0xa59763592bd 13d 4c8b15aaffffff REX.W movq r10,[rip+0xffffffaa]
0xa59763592c4 144 c441f96ec2 vmovq xmm8,r10
0xa59763592c9 149 4c8b15adffffff REX.W movq r10,[rip+0xffffffad]
0xa59763592d0 150 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa59763592d6 156 c4420100f8 vpshufb xmm15,xmm15,xmm8
0xa59763592db 15b 0f10f2 movups xmm6,xmm2
0xa59763592de 15e 4c8b15b0ffffff REX.W movq r10,[rip+0xffffffb0]
0xa59763592e5 165 c441f96ec2 vmovq xmm8,r10
0xa59763592ea 16a 4c8b15a4ffffff REX.W movq r10,[rip+0xffffffa4]
0xa59763592f1 171 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa59763592f7 177 c4c24900f0 vpshufb xmm6,xmm6,xmm8
0xa59763592fc 17c c4c149ebf7 vpor xmm6,xmm6,xmm15
0xa5976359301 181 c57810fc vmovups xmm15,xmm4
0xa5976359305 185 4c8b1562ffffff REX.W movq r10,[rip+0xffffff62]
0xa597635930c 18c c441f96ec2 vmovq xmm8,r10
0xa5976359311 191 4c8b1565ffffff REX.W movq r10,[rip+0xffffff65]
0xa5976359318 198 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa597635931e 19e c4420100f8 vpshufb xmm15,xmm15,xmm8
0xa5976359323 1a3 0f10ea movups xmm5,xmm2
0xa5976359326 1a6 4c8b1568ffffff REX.W movq r10,[rip+0xffffff68]
0xa597635932d 1ad c441f96ec2 vmovq xmm8,r10
0xa5976359332 1b2 4c8b155cffffff REX.W movq r10,[rip+0xffffff5c]
0xa5976359339 1b9 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa597635933f 1bf c4c25100e8 vpshufb xmm5,xmm5,xmm8
0xa5976359344 1c4 c4c151ebef vpor xmm5,xmm5,xmm15
0xa5976359349 1c9 c57810fb vmovups xmm15,xmm3
0xa597635934d 1cd 4c8b151affffff REX.W movq r10,[rip+0xffffff1a]
0xa5976359354 1d4 c441f96ec2 vmovq xmm8,r10
0xa5976359359 1d9 4c8b151dffffff REX.W movq r10,[rip+0xffffff1d]
0xa5976359360 1e0 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa5976359366 1e6 c4420100f8 vpshufb xmm15,xmm15,xmm8
0xa597635936b 1eb 0f10e2 movups xmm4,xmm2
0xa597635936e 1ee 4c8b1520ffffff REX.W movq r10,[rip+0xffffff20]
0xa5976359375 1f5 c441f96ec2 vmovq xmm8,r10
0xa597635937a 1fa 4c8b1514ffffff REX.W movq r10,[rip+0xffffff14]
0xa5976359381 201 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa5976359387 207 c4c25900e0 vpshufb xmm4,xmm4,xmm8
0xa597635938c 20c c4c159ebe7 vpor xmm4,xmm4,xmm15
0xa5976359391 211 c5f928df vmovapd xmm3,xmm7
0xa5976359395 215 c5e168d9 vpunpckhbw xmm3,xmm3,xmm1
0xa5976359399 219 c5c160f9 vpunpcklbw xmm7,xmm7,xmm1
0xa597635939d 21d c57928c6 vmovapd xmm8,xmm6
0xa59763593a1 221 c53968c1 vpunpckhbw xmm8,xmm8,xmm1
0xa59763593a5 225 c5c960f1 vpunpcklbw xmm6,xmm6,xmm1
0xa59763593a9 229 c57928cd vmovapd xmm9,xmm5
0xa59763593ad 22d c53168c9 vpunpckhbw xmm9,xmm9,xmm1
0xa59763593b1 231 c5d160e9 vpunpcklbw xmm5,xmm5,xmm1
0xa59763593b5 235 c57928d4 vmovapd xmm10,xmm4
0xa59763593b9 239 c52968d1 vpunpckhbw xmm10,xmm10,xmm1
0xa59763593bd 23d c5d960e1 vpunpcklbw xmm4,xmm4,xmm1
0xa59763593c1 241 c5e1d5d8 vpmullw xmm3,xmm3,xmm0
0xa59763593c5 245 c5c1d5f8 vpmullw xmm7,xmm7,xmm0
0xa59763593c9 249 c539d5c0 vpmullw xmm8,xmm8,xmm0
0xa59763593cd 24d c5c9d5f0 vpmullw xmm6,xmm6,xmm0
0xa59763593d1 251 c531d5c8 vpmullw xmm9,xmm9,xmm0
0xa59763593d5 255 c5d1d5e8 vpmullw xmm5,xmm5,xmm0
0xa59763593d9 259 c529d5d0 vpmullw xmm10,xmm10,xmm0
0xa59763593dd 25d c5d9d5e0 vpmullw xmm4,xmm4,xmm0
0xa59763593e1 261 c57928df vmovapd xmm11,xmm7
0xa59763593e5 265 c44101efff vpxor xmm15,xmm15,xmm15
0xa59763593ea 26a c463010efb55 vpblendw xmm15,xmm15,xmm3,0x55
0xa59763593f0 270 c443210edfaa vpblendw xmm11,xmm11,xmm15,0xaa
0xa59763593f6 276 c442212bdf vpackusdw xmm11,xmm11,xmm15
0xa59763593fb 27b c57810fb vmovups xmm15,xmm3
0xa59763593ff 27f c4c10172d710 vpsrld xmm15,xmm15,16
0xa5976359405 285 c5c172d710 vpsrld xmm7,xmm7,16
0xa597635940a 28a c4c2412bff vpackusdw xmm7,xmm7,xmm15
0xa597635940f 28f c5f928de vmovapd xmm3,xmm6
0xa5976359413 293 c44101efff vpxor xmm15,xmm15,xmm15
0xa5976359418 298 c443010ef855 vpblendw xmm15,xmm15,xmm8,0x55
0xa597635941e 29e c4c3610edfaa vpblendw xmm3,xmm3,xmm15,0xaa
0xa5976359424 2a4 c4c2612bdf vpackusdw xmm3,xmm3,xmm15
0xa5976359429 2a9 c4417810f8 vmovups xmm15,xmm8
0xa597635942e 2ae c4c10172d710 vpsrld xmm15,xmm15,16
0xa5976359434 2b4 c5c972d610 vpsrld xmm6,xmm6,16
0xa5976359439 2b9 c4c2492bf7 vpackusdw xmm6,xmm6,xmm15
0xa597635943e 2be c57928c5 vmovapd xmm8,xmm5
0xa5976359442 2c2 c44101efff vpxor xmm15,xmm15,xmm15
0xa5976359447 2c7 c443010ef955 vpblendw xmm15,xmm15,xmm9,0x55
0xa597635944d 2cd c443390ec7aa vpblendw xmm8,xmm8,xmm15,0xaa
0xa5976359453 2d3 c442392bc7 vpackusdw xmm8,xmm8,xmm15
0xa5976359458 2d8 c4417810f9 vmovups xmm15,xmm9
0xa597635945d 2dd c4c10172d710 vpsrld xmm15,xmm15,16
0xa5976359463 2e3 c5d172d510 vpsrld xmm5,xmm5,16
0xa5976359468 2e8 c4c2512bef vpackusdw xmm5,xmm5,xmm15
0xa597635946d 2ed c57928cc vmovapd xmm9,xmm4
0xa5976359471 2f1 c44101efff vpxor xmm15,xmm15,xmm15
0xa5976359476 2f6 c443010efa55 vpblendw xmm15,xmm15,xmm10,0x55
0xa597635947c 2fc c443310ecfaa vpblendw xmm9,xmm9,xmm15,0xaa
0xa5976359482 302 c442312bcf vpackusdw xmm9,xmm9,xmm15
0xa5976359487 307 c4417810fa vmovups xmm15,xmm10
0xa597635948c 30c c4c10172d710 vpsrld xmm15,xmm15,16
0xa5976359492 312 c5d972d410 vpsrld xmm4,xmm4,16
0xa5976359497 317 c4c2592be7 vpackusdw xmm4,xmm4,xmm15
0xa597635949c 31c c4c141fdfb vpaddw xmm7,xmm7,xmm11
0xa59763594a1 321 c5c9fdf3 vpaddw xmm6,xmm6,xmm3
0xa59763594a5 325 c4c151fde8 vpaddw xmm5,xmm5,xmm8
0xa59763594aa 32a c4c159fde1 vpaddw xmm4,xmm4,xmm9
0xa59763594af 32f c5f928de vmovapd xmm3,xmm6
0xa59763594b3 333 c44101efff vpxor xmm15,xmm15,xmm15
0xa59763594b8 338 c463010eff55 vpblendw xmm15,xmm15,xmm7,0x55
0xa59763594be 33e c4c3610edfaa vpblendw xmm3,xmm3,xmm15,0xaa
0xa59763594c4 344 c4c2612bdf vpackusdw xmm3,xmm3,xmm15
0xa59763594c9 349 c57810ff vmovups xmm15,xmm7
0xa59763594cd 34d c4c10172d710 vpsrld xmm15,xmm15,16
0xa59763594d3 353 c5c972d610 vpsrld xmm6,xmm6,16
0xa59763594d8 358 c4c2492bf7 vpackusdw xmm6,xmm6,xmm15
0xa59763594dd 35d c5f928fc vmovapd xmm7,xmm4
0xa59763594e1 361 c44101efff vpxor xmm15,xmm15,xmm15
0xa59763594e6 366 c463010efd55 vpblendw xmm15,xmm15,xmm5,0x55
0xa59763594ec 36c c4c3410effaa vpblendw xmm7,xmm7,xmm15,0xaa
0xa59763594f2 372 c4c2412bff vpackusdw xmm7,xmm7,xmm15
0xa59763594f7 377 c57810fd vmovups xmm15,xmm5
0xa59763594fb 37b c4c10172d710 vpsrld xmm15,xmm15,16
0xa5976359501 381 c5d972d410 vpsrld xmm4,xmm4,16
0xa5976359506 386 c4c2592be7 vpackusdw xmm4,xmm4,xmm15
0xa597635950b 38b c5c9fdf3 vpaddw xmm6,xmm6,xmm3
0xa597635950f 38f c5d9fde7 vpaddw xmm4,xmm4,xmm7
0xa5976359513 393 c5c971d608 vpsrlw xmm6,xmm6,8
0xa5976359518 398 c5d971d408 vpsrlw xmm4,xmm4,8
0xa597635951d 39d c5d967e6 vpackuswb xmm4,xmm4,xmm6
0xa5976359521 3a1 c5fa7f243b vmovdqu [rbx+rdi*1],xmm4
0xa5976359526 3a6 443bda cmpl r11,rdx
0xa5976359529 3a9 0f8601fdffff jna 0xa5976359230 <+0xb0>
0xa597635952f 3af 488bc1 REX.W movq rax,rcx
0xa5976359532 3b2 488be5 REX.W movq rsp,rbp
0xa5976359535 3b5 5d pop rbp
0xa5976359536 3b6 c3 retl
0xa5976359537 3b7 488955e8 REX.W movq [rbp-0x18],rdx
0xa597635953b 3bb 48895de0 REX.W movq [rbp-0x20],rbx
0xa597635953f 3bf c5f81145d0 vmovups [rbp-0x30],xmm0
0xa5976359544 3c4 c5f8114dc0 vmovups [rbp-0x40],xmm1
0xa5976359549 3c9 c5f81155b0 vmovups [rbp-0x50],xmm2
0xa597635954e 3ce 488945a8 REX.W movq [rbp-0x58],rax
0xa5976359552 3d2 48897da0 REX.W movq [rbp-0x60],rdi
0xa5976359556 3d6 4c894598 REX.W movq [rbp-0x68],r8
0xa597635955a 3da e8615dffff call 0xa597634f2c0 ;; wasm stub: WasmStackGuard
0xa597635955f 3df 33c9 xorl rcx,rcx
0xa5976359561 3e1 488b55e8 REX.W movq rdx,[rbp-0x18]
0xa5976359565 3e5 488b5de0 REX.W movq rbx,[rbp-0x20]
0xa5976359569 3e9 c5f81045d0 vmovups xmm0,[rbp-0x30]
0xa597635956e 3ee c5f8104dc0 vmovups xmm1,[rbp-0x40]
0xa5976359573 3f3 c5f81055b0 vmovups xmm2,[rbp-0x50]
0xa5976359578 3f8 488b45a8 REX.W movq rax,[rbp-0x58]
0xa597635957c 3fc 488b7da0 REX.W movq rdi,[rbp-0x60]
0xa5976359580 400 4c8b4598 REX.W movq r8,[rbp-0x68]
0xa5976359584 404 488b75f0 REX.W movq rsi,[rbp-0x10]
0xa5976359588 408 e9b6fcffff jmp 0xa5976359243 <+0xc3>
0xa597635958d 40d e8fe5affff call 0xa597634f090 ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa5976359592 412 e8f95affff call 0xa597634f090 ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa5976359597 417 e8f45affff call 0xa597634f090 ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa597635959c 41c e8ef5affff call 0xa597634f090 ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa59763595a1 421 e8ea5affff call 0xa597634f090 ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa59763595a6 426 90 nop
0xa59763595a7 427 90 nop
Protected instructions:
pc offset land pad
3a1 40d
e2 412
dc 417
d6 41c
d1 421
Source positions:
pc offset position
d1 43
d6 239
dc 416
e2 545
3a1 722
3b7 29
40d 722
412 545
417 416
41c 239
421 43
Safepoints (size = 22)
0xa5a7635917fffffffff 000000000000000 (sp -> fp)
RelocInfo (size = 8)
0xa597635955b wasm stub call
0xa597635958e wasm stub call
0xa5976359593 wasm stub call
0xa5976359598 wasm stub call
0xa597635959d wasm stub call
0xa59763595a2 wasm stub call
--- End code ---
The reason we don't use v128.const for this is that v128.const was only recently implemented in V8. To avoid breaking origin trial users, we can't update LLVM to emit v128.const until the relevant V8 patches roll into Chrome stable. I'm keeping an eye on this dashboard to determine when will be a good time to make this change. If you're using a more recent build of Chrome or some other execution environment that does support v128.const, you can try compiling your project with the -munimplemented-simd128 flag, which will enable v128.const in LLVM (but might also introduce other changes that you don't want). Once v128.const is widely available, it will be better for LLVM to use v128.const than to load vectors from memory because that allows the engine to determine the best way to materialize vectors given the runtime platform.
It also might be worth considering porting performance-sensitive parts of your code to use the WebAssembly intrinsics header directly rather than relying on emulated SSE. That would remove a layer of impedance mismatch between your code and the underlying machine code.
Finally, if you notice suboptimal instruction selection anywhere, it would be helpful if you could file LLVM bugs (if it's on the code -> wasm side) or V8 bugs (if it's on the wasm -> native side) about the specific issues you see. That kind of feedback is extremely valuable to us.
Hey Thomas,
Thanks for the quick response.
That sounds like an ideal solution -- but you must have a solution that exists for handling constant string data without v128.const. Why not use that?
Also, I'm totally fine with using direct wasm intrinsics -- except there is no hadd in the WASM SIMD proposal (even though it's pretty well supported across architectures). Right now it's two shuffles and an add -- which is fine I think.
Also, it's very easy for me to control the alignment of the data. Is there a way for me to instruct wasm that the data is aligned?
Dan
That sounds like an ideal solution -- but you must have a solution that exists for handling constant string data without v128.const. Why not use that?
It certainly would have been more optimal to use loads in the interim, but I wrote the code to use the simplest possible fallback rather than the optimal fallback since I knew that v128.const would be available before SIMD was standardized. If I had known how long it would be unavailable, I might have implemented constant loads instead, but it's probably not worth fixing now for just the remaining time before v128.const hits Chrome stable.
Also, it's very easy for me to control the alignment of the data. Is there a way for me to instruct wasm that the data is aligned?
When you load or store from a v128_t*, LLVM will assume that the access is aligned unless you tell it otherwise with an attribute. wasm_v128_load and other intrinsics from wasm_simd128.h, on the other hand, do not assume aligned accesses. Right now the only way to control the alignment hint on the other load instructions is to call the underlying target-specific builtins directly, which I don't recommend because they're not meant to be a stable interface.
That sounds like an ideal solution -- but you must have a solution that exists for handling constant string data without v128.const. Why not use that?
It certainly would have been more optimal to use loads in the interim, but I wrote the code to use the simplest possible fallback rather than the optimal fallback since I knew that v128.const would be available before SIMD was standardized. If I had known how long it would be unavailable, I might have implemented constant loads instead, but it's probably not worth fixing now for just the remaining time before v128.const hits Chrome stable.
I'm not sure I agree. You're worried about Chrome. I'm worried about Node. Unless something I don't know about happens, it looks like there is a 0% chance that v128.const support will make it into the next LTS release of Node (14). That means it could be years before we get that functionality.
Hmm, that's unfortunate. I'm not sure it's too different from the situation with other staging features, though, where if it's not stable and shipped when the LTS is cut, it probably shouldn't be used with the LTS. When does the LTS get cut, though? If it's not too soon, it might not be too difficult for me to slip this change into an Emscripten release before then. If we did that, you would still be stuck on the last version of Emscripten before the change to use v128.const by default for the lifetime of that Node LTS, though. I'm not sure that's any better than just considering this LTS to not support SIMD.
I'm pretty sure Node 14 LTS comes out later this year. It supports all SIMD features except the const...
LTS date: 2020-10-27
Also, it's very easy for me to control the alignment of the data. Is there a way for me to instruct wasm that the data is aligned?
When you load or store from a v128_t*, LLVM will assume that the access is aligned unless you tell it otherwise with an attribute. wasm_v128_load and other intrinsics from wasm_simd128.h, on the other hand, do not assume aligned accesses. Right now the only way to control the alignment hint on the other load instructions is to call the underlying target-specific builtins directly, which I don't recommend because they're not meant to be a stable interface.
In this case, it appears that v8 in my version of NodeJS is treating everything as unaligned. :-(
In this case, it appears that v8 in my version of NodeJS is treating everything as unaligned. :-(
In WebAssembly, alignment hints are only hints, so engines cannot depend on them to be correct. One implementation strategy is to install trap handlers to catch and work around alignment faults, but V8 uses the other strategy, which is to ignore the alignment hints and assume everything is unaligned.
Just to summarize... I'm asking for this help because it looks like using WebAssembly SIMD with Node JS 14 could provide a more portable solution for the libraries I'm maintaining than a custom C++ library. Personally, I have no intended browser cases, however, hey, why should I cut support if the performance is equivalent?
@tlively I'm looking at the link you left on stackoverflow -- https://github.com/llvm/llvm-project/blob/c193a689b475f91e63adb25dc5855f7a7f068c9a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp#L1451-L1610
Just for grins -- which branch/tag/repo do I need to pull from to create the pull request to fix this?
Emscripten uses tip of tree LLVM, so you can just clone https://github.com/llvm/llvm-project/tree/master/llvm. Unfortunately LLVM doesn't accept GitHub PRs, but if you want to send me any patch you come up with, I can take care of getting it reviewed and landed :)
Yay... Okay... I guess I first need to figure out how to create a read only global. Then I need to figure out a way to build and test it. Do you have a pointer to a guide on getting set up for this with emscripten?
I don't think there's a guide for LLVM development with Emscripten, but thankfully it's not too complicated. If you install emscripten via emsdk, it will automatically set up Emscripten itself as well as LLVM and Binaryen. There will be a .emscripten config file in your emsdk directory that you can edit to point to your custom build of LLVM, and everything should just work normally at that point.
The emscripten developer's guide may be helpful: https://emscripten.org/docs/contributing/developers_guide.html
This is really hard without any background in the internals of llvm. Do you happen to know where I can look for how it creates string constants and puts them in rodata?
String constants are a little different because they start out as global variables in llvm IR, so they're placed into memory via the normal data symbol mechanisms. I believe what you want here is to call DAG.getTargetConstantPool() in lowerBUILD_VECTOR to create a constant pool from which the v128 constant will be loaded. Looking into this a little more, it looks like the WebAssembly backend doesn't use constant pools for anything yet, so lowering them correctly would probably be a fair amount of extra work. I'm not actually sure what all the issues to resolve would be :(
Backing up a bit, are you constrained to using Node LTS or could you use a non-LTS version once that includes the fully standardized SIMD implementation?
I'm pretty constrained on LTS.
However, there's an alternative way to implement this without creating a constant pool if I can create a smart splat. It's not as efficient as a constant load, but it's significantly better than the current version.
Is the maximum number of operands for this operation limited to 16?
The code already chooses what value to splat such that the number of replace_lanes is minimized. Do you have an idea for how to improve on that? Yes, vectors can have at most 16 lanes.
The code already chooses what value to splat such that the number of replace_lanes is minimized. Do you have an idea for how to improve on that? Yes, vectors can have at most 16 lanes.
Yes. Any constant of 128 bits can be represented by two 64-bit values. Thus 1 load and 1 insert in the worst case. In the event of the two 64-bit values matching, it can be performed with exactly 1 splat. Neither is as efficient as loading from a constant, but both are much better.
Oh right, that is a good idea! And it shouldn't require much interaction with LLVM APIs beyond what's already in that function.
Do we know the data type coming in from LLVM? Is it already lexed?
This solution should work for all integer types and any 2 doubles. It's debatable about whether or not it could work for floats -- because even if we pack 64 bits of float data into a double, it's not really a double at that point. We _could_ convert them to integers representative of IEEE754 floats and pack them as integers, but it's not quite the right way to do it.
Yes, we already know the number and type of the lanes and we can tell if a lane is constant. Any logic to combine replace_lanes should go here. I would ignore floating point types for now and only try combining lanes for integers.
@tlively I went a different way of implementation that I thought would be a bit more suitable -- code below, but don't understand how to handle this particular error.
else if (NumConstantLanes >= NumSplatLanes &&
Subtarget->hasUnimplementedSIMD128()) {
SmallVector<SDValue, 16> ConstLanes;
for (const SDValue &Lane : Op->op_values()) {
if (IsConstant(Lane)) {
ConstLanes.push_back(Lane);
} else if (LaneT.isFloatingPoint()) {
ConstLanes.push_back(DAG.getConstantFP(0, DL, LaneT));
} else {
ConstLanes.push_back(DAG.getConstant(0, DL, LaneT));
}
}
Result = DAG.getBuildVector(VecT, DL, ConstLanes);
IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
return IsConstant(Lane);
};
} else if (NumConstantLanes >= NumSplatLanes && (VecT == MVT::v16i8 || VecT == MVT::v8i16 || VecT == MVT::v4i32 || VecT == MVT::v2i64)) {
std::array<uint64_t, 2> valuearray({0, 0});
unsigned i = 0;
size_t byteStep = VecT.getScalarType().getStoreSize().getFixedSize();
for (const SDValue &Lane : Op->op_values()) {
if (IsConstant(Lane)) {
using llvm::support::endian::byte_swap;
using llvm::support::little;
uint8_t* pByteArray = reinterpret_cast<uint8_t*>(valuearray.data());
// endianness of the compiler matters here.
// little endian has least significant bit first which is advantageous for this.
// so a 16 bit integer will be front loaded in a uint64 from byte packing perspective.
ConstantSDNode *pConstantSDNode = cast<ConstantSDNode>(Lane.getNode());
uint64_t val = byte_swap(pConstantSDNode->getLimitedValue(), little);
uint8_t* pVal = reinterpret_cast<uint8_t*>(&val);
std::copy(pVal,pVal+byteStep,pByteArray+i*byteStep);
}
++i;
}
Result = DAG.getSplatBuildVector(MVT::v2i64, DL, DAG.getConstant(valuearray[0], DL, MVT::i64));
if (valuearray[0] != valuearray[1]) {
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i64, Result, DAG.getConstant(valuearray[1], DL, MVT::i64),
DAG.getConstant(1, DL, MVT::i32));
}
IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
return IsConstant(Lane);
};
}
Error Message:
fatal error: error in backend: Cannot select: t442: v16i8 = WebAssemblyISD::SHUFFLE t390, t503, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, ./simdpp/detail/insn/permute4.h:100:12 @[ ./simdpp/core/permute4.h:106:12 @[ rgb2y-sample.cpp:182:26 ] ]
t390: v16i8 = bitcast t389, ./simdpp/types/int32x4.h:161:48 @[ ./simdpp/core/permute4.h:105:22 @[ rgb2y-sample.cpp:182:26 ] ]
t389: v4i32 = add t387, t388, ./simdpp/detail/insn/i_add.h:99:12 @[ ./simdpp/detail/expr/i_add.h:28:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:181:12 ] ] ] ] ]
t387: v4i32 = add t383, t386, ./simdpp/detail/insn/i_add.h:99:12 @[ ./simdpp/detail/expr/i_add.h:28:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:180:12 ] ] ] ] ]
t383: v4i32 = add t323, t382, ./simdpp/detail/insn/i_add.h:99:12 @[ ./simdpp/detail/expr/i_add.h:28:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:179:12 ] ] ] ] ]
t323: v4i32 = mul nuw nsw t419, t420, ./simdpp/detail/insn/i_mul_lo.h:69:12 @[ ./simdpp/detail/expr/i_mul.h:29:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/generic.h:229:52 @[ rgb2y-sample.cpp:157:41 ] ] ] ] ]
t419: v4i32 = WebAssemblyISD::WIDEN_HIGH_U t428, rgb2y-sample.cpp:154:41
t428: v8i16 = WebAssemblyISD::WIDEN_HIGH_U t17, rgb2y-sample.cpp:120:68
t17: v16i8,ch = load<(load 16 from %ir.23, !tbaa !1934)> t0, t16, undef:i32, ./simdpp/detail/insn/load.h:34:9 @[ ./simdpp/detail/insn/load.h:182:5 @[ ./simdpp/detail/insn/load.h:191:9 @[ ./simdpp/detail/construct_eval.h:46:5 @[ ./simdpp/types/int8x16.h:150:9 @[ rgb2y-sample.cpp:38:31 ] ] ] ] ]
t16: i32 = add t15, t9, rgb2y-sample.cpp:38:67
t12: i32 = undef
t420: v4i32 = WebAssemblyISD::WIDEN_HIGH_U t430, rgb2y-sample.cpp:153:42
t430: v8i16 = WebAssemblyISD::WIDEN_HIGH_U t13, rgb2y-sample.cpp:119:69
t13: v16i8,ch = load<(load 16 from %ir.22, !tbaa !1934)> t0, t10, undef:i32, ./simdpp/detail/insn/load.h:34:9 @[ ./simdpp/detail/insn/load.h:182:5 @[ ./simdpp/detail/insn/load.h:191:9 @[ ./simdpp/detail/construct_eval.h:46:5 @[ ./simdpp/types/int8x16.h:150:9 @[ rgb2y-sample.cpp:37:32 ] ] ] ] ]
t10: i32 = add t7, t9, rgb2y-sample.cpp:37:68
t12: i32 = undef
t382: v4i32 = bitcast t484, ./simdpp/types/empty_expr.h:207:48 @[ ./simdpp/detail/eval_scalar.h:25:57 @[ ./simdpp/detail/expr/i_add.h:30:17 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:179:12 ] ] ] ] ] ]
t484: v16i8 = WebAssemblyISD::SHUFFLE t503, t380, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, Constant:i32<16>, Constant:i32<17>, Constant:i32<18>, Constant:i32<19>, Constant:i32<20>, Constant:i32<21>, Constant:i32<22>, Constant:i32<23>, Constant:i32<24>, Constant:i32<25>, Constant:i32<26>, Constant:i32<27>, ./simdpp/detail/insn/move_r.h:33:12 @[ ./simdpp/detail/insn/move_r.h:115:24 @[ ./simdpp/detail/insn/move_r.h:299:52 @[ ./simdpp/core/move_r.h:106:12 @[ rgb2y-sample.cpp:179:22 ] ] ] ]
t503: v2i64 = BUILD_VECTOR Constant:i64<0>, Constant:i64<0>, ./simdpp/detail/insn/move_r.h:33:12 @[ ./simdpp/detail/insn/move_r.h:115:24 @[ ./simdpp/detail/insn/move_r.h:299:52 @[ ./simdpp/core/move_r.h:106:12 @[ rgb2y-sample.cpp:58:20 ] ] ] ]
t502: i64 = Constant<0>
t502: i64 = Constant<0>
t380: v16i8 = bitcast t323, ./simdpp/types/int32x4.h:161:48 @[ ./simdpp/core/move_r.h:105:22 @[ rgb2y-sample.cpp:179:22 ] ]
t323: v4i32 = mul nuw nsw t419, t420, ./simdpp/detail/insn/i_mul_lo.h:69:12 @[ ./simdpp/detail/expr/i_mul.h:29:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/generic.h:229:52 @[ rgb2y-sample.cpp:157:41 ] ] ] ] ]
t419: v4i32 = WebAssemblyISD::WIDEN_HIGH_U t428, rgb2y-sample.cpp:154:41
t420: v4i32 = WebAssemblyISD::WIDEN_HIGH_U t430, rgb2y-sample.cpp:153:42
t37: i32 = Constant<12>
t38: i32 = Constant<13>
t39: i32 = Constant<14>
t40: i32 = Constant<15>
t41: i32 = Constant<16>
t42: i32 = Constant<17>
t43: i32 = Constant<18>
t44: i32 = Constant<19>
t45: i32 = Constant<20>
t46: i32 = Constant<21>
t47: i32 = Constant<22>
t48: i32 = Constant<23>
t49: i32 = Constant<24>
t50: i32 = Constant<25>
t51: i32 = Constant<26>
t52: i32 = Constant<27>
t386: v4i32 = bitcast t464, ./simdpp/types/empty_expr.h:207:48 @[ ./simdpp/detail/eval_scalar.h:25:57 @[ ./simdpp/detail/expr/i_add.h:30:17 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:180:12 ] ] ] ] ] ]
t464: v16i8 = WebAssemblyISD::SHUFFLE t503, t384, Constant:i32<8>, Constant:i32<9>, Constant:i32<10>, Constant:i32<11>, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, Constant:i32<16>, Constant:i32<17>, Constant:i32<18>, Constant:i32<19>, Constant:i32<20>, Constant:i32<21>, Constant:i32<22>, Constant:i32<23>, ./simdpp/detail/insn/move_r.h:33:12 @[ ./simdpp/detail/insn/move_r.h:115:24 @[ ./simdpp/detail/insn/move_r.h:299:52 @[ ./simdpp/core/move_r.h:106:12 @[ rgb2y-sample.cpp:180:22 ] ] ] ]
t503: v2i64 = BUILD_VECTOR Constant:i64<0>, Constant:i64<0>, ./simdpp/detail/insn/move_r.h:33:12 @[ ./simdpp/detail/insn/move_r.h:115:24 @[ ./simdpp/detail/insn/move_r.h:299:52 @[ ./simdpp/core/move_r.h:106:12 @[ rgb2y-sample.cpp:58:20 ] ] ] ]
t502: i64 = Constant<0>
t502: i64 = Constant<0>
t384: v16i8 = bitcast t383, ./simdpp/types/int32x4.h:161:48 @[ ./simdpp/core/move_r.h:105:22 @[ rgb2y-sample.cpp:180:22 ] ]
t383: v4i32 = add t323, t382, ./simdpp/detail/insn/i_add.h:99:12 @[ ./simdpp/detail/expr/i_add.h:28:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:179:12 ] ] ] ] ]
t323: v4i32 = mul nuw nsw t419, t420, ./simdpp/detail/insn/i_mul_lo.h:69:12 @[ ./simdpp/detail/expr/i_mul.h:29:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/generic.h:229:52 @[ rgb2y-sample.cpp:157:41 ] ] ] ] ]
t419: v4i32 = WebAssemblyISD::WIDEN_HIGH_U t428, rgb2y-sample.cpp:154:41
t420: v4i32 = WebAssemblyISD::WIDEN_HIGH_U t430, rgb2y-sample.cpp:153:42
t382: v4i32 = bitcast t484, ./simdpp/types/empty_expr.h:207:48 @[ ./simdpp/detail/eval_scalar.h:25:57 @[ ./simdpp/detail/expr/i_add.h:30:17 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:179:12 ] ] ] ] ] ]
t484: v16i8 = WebAssemblyISD::SHUFFLE t503, t380, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, Constant:i32<16>, Constant:i32<17>, Constant:i32<18>, Constant:i32<19>, Constant:i32<20>, Constant:i32<21>, Constant:i32<22>, Constant:i32<23>, Constant:i32<24>, Constant:i32<25>, Constant:i32<26>, Constant:i32<27>, ./simdpp/detail/insn/move_r.h:33:12 @[ ./simdpp/detail/insn/move_r.h:115:24 @[ ./simdpp/detail/insn/move_r.h:299:52 @[ ./simdpp/core/move_r.h:106:12 @[ rgb2y-sample.cpp:179:22 ] ] ] ]
t57: i32 = Constant<8>
t58: i32 = Constant<9>
t59: i32 = Constant<10>
t60: i32 = Constant<11>
t37: i32 = Constant<12>
t38: i32 = Constant<13>
t39: i32 = Constant<14>
t40: i32 = Constant<15>
t41: i32 = Constant<16>
t42: i32 = Constant<17>
t43: i32 = Constant<18>
t44: i32 = Constant<19>
t45: i32 = Constant<20>
t46: i32 = Constant<21>
t47: i32 = Constant<22>
t48: i32 = Constant<23>
t388: v4i32,ch = load<(dereferenceable load 16 from %ir.17)> t379, TargetFrameIndex:i32<4>, undef:i32, ./simdpp/types/int32x4.h:161:48 @[ ./simdpp/detail/eval_scalar.h:25:57 @[ ./simdpp/detail/expr/i_add.h:30:17 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:181:12 ] ] ] ] ] ]
t508: i32 = TargetFrameIndex<4>
t12: i32 = undef
t503: v2i64 = BUILD_VECTOR Constant:i64<0>, Constant:i64<0>, ./simdpp/detail/insn/move_r.h:33:12 @[ ./simdpp/detail/insn/move_r.h:115:24 @[ ./simdpp/detail/insn/move_r.h:299:52 @[ ./simdpp/core/move_r.h:106:12 @[ rgb2y-sample.cpp:58:20 ] ] ] ]
t502: i64 = Constant<0>
t502: i64 = Constant<0>
t37: i32 = Constant<12>
t38: i32 = Constant<13>
t39: i32 = Constant<14>
t40: i32 = Constant<15>
t37: i32 = Constant<12>
t38: i32 = Constant<13>
t39: i32 = Constant<14>
t40: i32 = Constant<15>
t37: i32 = Constant<12>
t38: i32 = Constant<13>
t39: i32 = Constant<14>
t40: i32 = Constant<15>
t37: i32 = Constant<12>
t38: i32 = Constant<13>
t39: i32 = Constant<14>
t40: i32 = Constant<15>
In function: computeSumMatrixForwardSimd2PassAll
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0. Program arguments: /home/dan/git/llvm-project/llvm/build/bin/clang++ -target wasm32-unknown-emscripten -D__EMSCRIPTEN_major__=2 -D__EMSCRIPTEN_minor__=0 -D__EMSCRIPTEN_tiny__=4 -D_LIBCPP_ABI_VERSION=2 -Dunix -D__unix -D__unix__ -Werror=implicit-function-declaration -Xclang -nostdsysteminc -D__SSE__=1 -D__SSE2__=1 -D__SSE3__=1 -D__SSSE3__=1 -D__SSE4_1__=1 -D__SSE4_2__=1 -D__AVX__=1 -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include/libcxx -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/lib/libcxxabi/include -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include/compat -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include/libc -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/lib/libc/musl/arch/emscripten -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/local/include -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include/SSE -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include/neon -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/lib/compiler-rt/include -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/lib/libunwind/include -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/cache/wasm/include -DEMSCRIPTEN -fignore-exceptions -g -O3 -msimd128 -I. -S rgb2y-sample.cpp -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include/SDL -c -o rgb2y-sample.o -mllvm -combiner-global-alias-analysis=false -mllvm -enable-emscripten-sjlj -mllvm -disable-lsr
1. <eof> parser at end of file
2. Code generation
3. Running pass 'Function Pass Manager' on module 'rgb2y-sample.cpp'.
4. Running pass 'WebAssembly Instruction Selection' on function '@computeSumMatrixForwardSimd2PassAll'
#0 0x0000556d8349b510 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1ea0510)
#1 0x0000556d83499284 llvm::sys::RunSignalHandlers() (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e9e284)
#2 0x0000556d83499501 llvm::sys::CleanupOnSignal(unsigned long) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e9e501)
#3 0x0000556d83404e83 llvm::CrashRecoveryContext::HandleExit(int) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e09e83)
#4 0x0000556d83491aab llvm::sys::Process::Exit(int) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e96aab)
#5 0x0000556d82175f01 LLVMErrorHandler(void*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0xb7af01)
#6 0x0000556d8340c1ac llvm::report_fatal_error(llvm::Twine const&, bool) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e111ac)
#7 0x0000556d8340c324 (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e11324)
#8 0x0000556d843232aa llvm::SelectionDAGISel::CannotYetSelect(llvm::SDNode*) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2d282aa)
#9 0x0000556d84324702 llvm::SelectionDAGISel::SelectCodeCommon(llvm::SDNode*, unsigned char const*, unsigned int) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2d29702)
#10 0x0000556d821cf10b (anonymous namespace)::WebAssemblyDAGToDAGISel::Select(llvm::SDNode*) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0xbd410b)
#11 0x0000556d843210bb llvm::SelectionDAGISel::DoInstructionSelection() (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2d260bb)
#12 0x0000556d84329f29 llvm::SelectionDAGISel::CodeGenAndEmitDAG() (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2d2ef29)
#13 0x0000556d8433004b llvm::SelectionDAGISel::SelectAllBasicBlocks(llvm::Function const&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2d3504b)
#14 0x0000556d8433195d llvm::SelectionDAGISel::runOnMachineFunction(llvm::MachineFunction&) (.part.0) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2d3695d)
#15 0x0000556d828f6e49 llvm::MachineFunctionPass::runOnFunction(llvm::Function&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x12fbe49)
#16 0x0000556d82db4d71 llvm::FPPassManager::runOnFunction(llvm::Function&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x17b9d71)
#17 0x0000556d82db5469 llvm::FPPassManager::runOnModule(llvm::Module&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x17ba469)
#18 0x0000556d82db3f0c llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x17b8f0c)
#19 0x0000556d8375e284 (anonymous namespace)::EmitAssemblyHelper::EmitAssembly(clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2163284)
#20 0x0000556d8375fc2d clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions const&, llvm::DataLayout const&, llvm::Module*, clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2164c2d)
#21 0x0000556d84454159 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2e59159)
#22 0x0000556d85323f89 clang::ParseAST(clang::Sema&, bool, bool) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x3d28f89)
#23 0x0000556d84452b88 clang::CodeGenAction::ExecuteAction() (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2e57b88)
#24 0x0000556d83d97599 clang::FrontendAction::Execute() (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x279c599)
#25 0x0000556d83d4dcc6 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2752cc6)
#26 0x0000556d83e6bb40 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2870b40)
#27 0x0000556d82176c2f cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0xb7bc2f)
#28 0x0000556d82173f98 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0xb78f98)
#29 0x0000556d83c0b639 void llvm::function_ref<void ()>::callback_fn<clang::driver::CC1Command::Execute(llvm::ArrayRef<llvm::Optional<llvm::StringRef> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*, bool*) const::'lambda'()>(long) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2610639)
#30 0x0000556d83404d4c llvm::CrashRecoveryContext::RunSafely(llvm::function_ref<void ()>) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e09d4c)
#31 0x0000556d83c0bf56 clang::driver::CC1Command::Execute(llvm::ArrayRef<llvm::Optional<llvm::StringRef> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*, bool*) const (.part.0) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2610f56)
#32 0x0000556d83be2a9c clang::driver::Compilation::ExecuteCommand(clang::driver::Command const&, clang::driver::Command const*&) const (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x25e7a9c)
#33 0x0000556d83be33d6 clang::driver::Compilation::ExecuteJobs(clang::driver::JobList const&, llvm::SmallVectorImpl<std::pair<int, clang::driver::Command const*> >&) const (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x25e83d6)
#34 0x0000556d83becb49 clang::driver::Driver::ExecuteCompilation(clang::driver::Compilation&, llvm::SmallVectorImpl<std::pair<int, clang::driver::Command const*> >&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x25f1b49)
#35 0x0000556d820fb0e0 main (/home/dan/git/llvm-project/llvm/build/bin/clang+++0xb000e0)
#36 0x00007fc609ec90b3 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x270b3)
#37 0x0000556d82173aee _start (/home/dan/git/llvm-project/llvm/build/bin/clang+++0xb78aee)
clang-12: error: clang frontend command failed with exit code 70 (use -v to see invocation)
clang version 12.0.0 ([email protected]:llvm/llvm-project d6ac649ccda289ecc2d2c0cb51892d57e8ec328c)
Target: wasm32-unknown-emscripten
Thread model: posix
InstalledDir: /home/dan/git/llvm-project/llvm/build/bin
clang-12: note: diagnostic msg:
********************
Got it. Needed a bitcast to the original vector type before returning. Will send a patch once I'm done testing it.
Patch is below. It's not a substitute for rodata, but it's a big improvement.
From 6d7f93a00a83636c54145c40c674b544d508f815 Mon Sep 17 00:00:00 2001
From: Dan Weber <[email protected]>
Date: Tue, 29 Sep 2020 17:13:02 +0000
Subject: [PATCH] fix(constants generation): Use integers to load complex
contants without replace_lane
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 29 +++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 425f8b86c..6dec5e456 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -31,6 +31,7 @@
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
@@ -1579,6 +1580,34 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
return IsConstant(Lane);
};
+ } else if (NumConstantLanes >= NumSplatLanes && (VecT == MVT::v16i8 || VecT == MVT::v8i16 || VecT == MVT::v4i32 || VecT == MVT::v2i64)) {
+ std::array<uint64_t, 2> valuearray({0, 0});
+ unsigned i = 0;
+ size_t byteStep = VecT.getScalarType().getStoreSize().getFixedSize();
+ for (const SDValue &Lane : Op->op_values()) {
+ if (IsConstant(Lane)) {
+ using llvm::support::endian::byte_swap;
+ using llvm::support::little;
+ uint8_t* pByteArray = reinterpret_cast<uint8_t*>(valuearray.data());
+ // endianness of the compiler matters here.
+ // little endian has least significant bit first which is advantageous for this.
+ // so a 16 bit integer will be front loaded in a uint64 from byte packing perspective.
+ ConstantSDNode *pConstantSDNode = cast<ConstantSDNode>(Lane.getNode());
+ uint64_t val = byte_swap(pConstantSDNode->getLimitedValue(), little);
+ uint8_t* pVal = reinterpret_cast<uint8_t*>(&val);
+ std::copy(pVal,pVal+byteStep,pByteArray+i*byteStep);
+ }
+ ++i;
+ }
+ Result = DAG.getSplatBuildVector(MVT::v2i64, DL, DAG.getConstant(valuearray[0], DL, MVT::i64));
+ if (valuearray[0] != valuearray[1]) {
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i64, Result, DAG.getConstant(valuearray[1], DL, MVT::i64),
+ DAG.getConstant(1, DL, MVT::i32));
+ }
+ Result = DAG.getBitcast(VecT, Result);
+ IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
+ return IsConstant(Lane);
+ };
}
if (!Result) {
// Use a splat, but possibly a load_splat
--
2.25.
Awesome, this is a nice simple approach. I will update the code to conform to LLVM style, update the corresponding tests, and get this merged. Thanks for working on this!
Patch up for review here: https://reviews.llvm.org/D88591. While writing tests, I nerd-sniped myself into improving it to only emit a follow-up i64x2.replace_lane after the splat if necessary.
What is a case where CombinedSufficient applies?
For example this test:
; Returns a constant vector in which only lanes 0 and 3 hold defined values
; (1 and 2); lanes 1 and 2 are undef.
define <4 x i32> @emulated_const_combined_sufficient() {
ret <4 x i32> <i32 1, i32 undef, i32 undef, i32 2>
}
The first half will have the same value as {1, 0} and the second half will have the same value as {0, 2}, so neither half is sufficient to cover the other half naively. But if we take advantage of the fact that undef (or more generally, any lane that is unused in the initial constant) is allowed to take on any value, then we see that we can splat {1, 2} to cover both halves.
Okay... so from the user's perspective, those 2 middle lanes are undefined, but we've populated them with the other's value.
How does one do that from emscripten/em++?
These C functions both end up triggering that case, the first literally with undefs in the LLVM IR and the second because the middle lanes are not constant:
// Builds a vector where only lanes 0 and 3 are assigned (to 1 and 2).
v128_t undef_middle() {
v128_t v; // deliberately left uninitialized: lanes 1 and 2 are never written
v = wasm_i32x4_replace_lane(v, 0, 1);
v = wasm_i32x4_replace_lane(v, 3, 2);
return v;
}
// Builds a vector whose outer lanes are the constants 1 and 2 and whose
// middle lanes come from the runtime arguments x and y.
v128_t nonconst_middle(int x, int y) {
return wasm_i32x4_make(1, x, y, 2);
}
Wow. Fascinating. Kudos.
Do you want to adjust the v128.const code to have that behavior too? Now it's filling those spots with zeros.
How are you thinking of changing the v128.const behavior? I don't think it should matter what non-constant lanes are filled with when emitting a v128.const.
If you want it to match, shouldn't you fill those spots with undefined values? Is that acceptable per the proposal?
WebAssembly doesn't have a concept of undefined values, so LLVM has to choose some arbitrary value to fill in for undef when emitting WebAssembly. Usually 0 is a reasonable default, so that's what the v128.const code uses (and also what this new code uses). The only reason to use some other value is if doing so has additional benefits, such as reducing the number of instructions emitted like in the CombinedSufficient case in the new code.
Most helpful comment
Just to summarize... I'm asking for this help because it looks like using WebAssembly SIMD with Node JS 14 could provide a more portable solution for the libraries I'm maintaining than a custom C++ library. Personally, I have no intended browser cases, however, hey, why should I cut support if the performance is equivalent?