Emscripten: [SIMD] Emscripten uses replace_lane instead of loading constants from .rodata

Created on 15 Sep 2020 · 37 Comments · Source: emscripten-core/emscripten

Hi guys,

I've noticed that Emscripten does really weird stuff with constants and SIMD. In particular, if you look at the generated code below, you'll notice that it splats a lane and then replaces each lane with the appropriate values. It's crazy.

The sample code is below. The full description with commentary is on stackoverflow.

// Eight 16-bit weights, loaded as one 128-bit vector inside rgba2y and
// multiplied lane-wise against the zero-extended input bytes.
// NOTE(review): lane 5 is 160 while lane 1 is 150, and the commented-out
// splat constant in rgba2y (0x1001d0096004d) encodes 77,150,29,1 -- this
// looks like a typo; confirm the intended weight.
alignas(64) const static uint16_t multiplierArray[8]= {77,150,29,1,77,160,29,1};
// rgba2y: processes inputDataBuffer in place. Each loop iteration loads four
// 16-byte vectors, multiplies the widened bytes by multiplierArray, reduces
// them with horizontal adds, shifts right by 8, narrows to bytes and stores
// one 16-byte vector back into the same buffer.
// Presumably an RGBA -> luma (Y) conversion, judging by the name -- confirm.
//
//   inputDataBuffer  buffer that is both read and written.
//   length           compared against a byte count in the loop bound.
//   returns          0 in the visible code; the remainder handling is elided
//                    from this sample (see "abbreviated" below).
extern "C"
int rgba2y(void* inputDataBuffer, ptrdiff_t length) {
        // Local shorthand for the unsigned wasm vector types.
        typedef __u8x16 v8x16;
        typedef __u16x8 v16x8;
        v8x16* pInputPtr = (v8x16*) inputDataBuffer;
        // Not referenced in the visible part of the function.
        v8x16* pInputPtrEnd = (v8x16*)((uint8_t*)inputDataBuffer + length);
        // Output aliases the input buffer.
        v8x16* pOutputPtr = (v8x16*) inputDataBuffer;
        // Shuffle index 16 below selects byte 0 of this vector (0x80).
        __m128i rounder = _mm_cvtsi32_si128(0x80808080);
        // NOTE(review): `zero` is read before it is initialized; an explicit
        // zero vector would avoid relying on x ^ x of an indeterminate value.
        v8x16 zero;
        zero ^= zero;
        // Whole-vector load of the weight table.
        __m128i multiplier = *((__m128i*)multiplierArray);
//      v16x8 multiplier = wasm_i64x2_splat(0x1001d0096004d);
        unsigned i = 0;
        // NOTE(review): the bound treats i as a 16-byte-vector index and i
        // advances by 4, but the loads use i/4 .. i/4+3, so consecutive
        // iterations read overlapping vectors -- confirm intended indexing.
        // NOTE(review): the left side is unsigned and length is ptrdiff_t, so
        // a negative length would be compared as a large unsigned value.
        for (; (i+4)*sizeof(__m128i)<= length; i+= 4) {
                // Keep bytes 0-2 of every 4-byte group; replace byte 3 with
                // byte 0 of rounder.
                v8x16 iv0 = wasm_v8x16_shuffle(pInputPtr[i/4],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
                v8x16 iv1 = wasm_v8x16_shuffle(pInputPtr[i/4+1],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
                v8x16 iv2 = wasm_v8x16_shuffle(pInputPtr[i/4+2],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
                v8x16 iv3 = wasm_v8x16_shuffle(pInputPtr[i/4+3],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
                // rg ba rg ba rg ba rg ba rg ba rg ba rg ba
                // Widen low/high halves to 16 bits by interleaving with zero,
                // multiply by the weights, then add adjacent lane pairs.
                __m128i rg0 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv0, (__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv0,(__m128i)zero),(__m128i)multiplier));
                __m128i rg1 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv1,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv1,(__m128i)zero),(__m128i)multiplier));
                __m128i rg2 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv2,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv2,(__m128i)zero),(__m128i)multiplier));
                __m128i rg3 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv3,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv3,(__m128i)zero),(__m128i)multiplier));
                // rgba rgba rgba rgba rgba rgba rgba rgba
                // Second pairwise add, then logical shift right by 8 of each
                // 16-bit lane.
                __m128i rgba0 = wasm_u16x8_shr(_mm_hadd_epi16(rg0,rg1), 8);
                __m128i rgba1 = wasm_u16x8_shr(_mm_hadd_epi16(rg2,rg3), 8);
                // Narrow both 16-bit vectors into one byte vector and store it.
                pOutputPtr[i/4] = wasm_u8x16_narrow_i16x8(rgba0,rgba1);
        }
        // abbreviated...
        return 0;
}

 .section    .text.rgba2y,"",@
    .hidden rgba2y                          # -- Begin function rgba2y
    .globl  rgba2y
    .type   rgba2y,@function
rgba2y:                                 # @rgba2y
.Lfunc_begin0:
    .loc    2 56 0                          # rgb2y-sample.cpp:56:0
    .functype   rgba2y (i32, i32) -> (i32)
    .local      i32, i32, v128, v128, v128, v128, v128, v128
# %bb.0:                                # %entry
    #DEBUG_VALUE: rgba2y:length <- %4
    #DEBUG_VALUE: rgba2y:pInputPtrEnd <- undef
    #DEBUG_VALUE: rgba2y:i <- 0
    #DEBUG_VALUE: rgba2y:inputDataBuffer <- %3
    #DEBUG_VALUE: rgba2y:pInputPtr <- %3
    #DEBUG_VALUE: rgba2y:pOutputPtr <- %3
    #DEBUG_VALUE: rgba2y:rounder <- undef
    #DEBUG_VALUE: rgba2y:zero <- undef
    #DEBUG_VALUE: rgba2y:multiplier <- undef
    block
.Ltmp0:
    .loc    2 68 30 prologue_end            # rgb2y-sample.cpp:68:30
    local.get   1
    i32.const   64
    i32.lt_u
.Ltmp1:
    .loc    2 68 2 is_stmt 0                # rgb2y-sample.cpp:68:2
    br_if       0                               # 0: down to label0
.Ltmp2:
# %bb.1:
    .loc    2 0 2                           # rgb2y-sample.cpp:0:2
    i32.const   0
    local.set   2
    i32.const   4
    local.set   3
.LBB0_2:                                # %for.body
                                        # =>This Inner Loop Header: Depth=1
    loop                                        # label1:
.Ltmp3:
    #DEBUG_VALUE: rgba2y:i <- %101
    #DEBUG_VALUE: rgba0 <- undef
    #DEBUG_VALUE: rgba1 <- undef
    .loc    2 69 15 is_stmt 1               # rgb2y-sample.cpp:69:15
    local.get   0
    local.get   2
    i32.const   2
    i32.shl
    i32.add
    local.tee   2
    local.get   2
    v128.load   0
    i32.const   0
    i8x16.splat
    local.tee   4
    i32.const   -128
    i8x16.replace_lane  0
    i32.const   -128
    i8x16.replace_lane  1
    i32.const   -128
    i8x16.replace_lane  2
    i32.const   -128
    i8x16.replace_lane  3
    local.tee   5
    v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp4:
    .loc    2 74 48                         # rgb2y-sample.cpp:74:48
    local.tee   6
.Ltmp5:
    #DEBUG_VALUE: iv0 <- undef
    #DEBUG_VALUE: iv0 <- %153
    local.get   4
    v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
    i32.const   77
    .loc    2 74 32 is_stmt 0               # rgb2y-sample.cpp:74:32
    i16x8.splat
    i32.const   150
    i16x8.replace_lane  1
    i32.const   29
    i16x8.replace_lane  2
    i32.const   1
    i16x8.replace_lane  3
    i32.const   160
    i16x8.replace_lane  5
    i32.const   29
    i16x8.replace_lane  6
    i32.const   1
    i16x8.replace_lane  7
    local.tee   7
    i16x8.mul
    .loc    2 74 133                        # rgb2y-sample.cpp:74:133
    local.tee   8
    local.get   6
    local.get   4
    v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
    .loc    2 74 117                        # rgb2y-sample.cpp:74:117
    local.get   7
    i16x8.mul
    .loc    2 74 17                         # rgb2y-sample.cpp:74:17
    local.tee   6
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   8
    local.get   6
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
.Ltmp6:
    .loc    2 0 17                          # rgb2y-sample.cpp:0:17
    local.tee   6
.Ltmp7:
    #DEBUG_VALUE: rg0 <- undef
    #DEBUG_VALUE: rg0 <- %153
    .loc    2 70 15 is_stmt 1               # rgb2y-sample.cpp:70:15
    local.get   2
    i32.const   16
    i32.add
    v128.load   0
    local.get   5
    v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp8:
    .loc    2 75 62                         # rgb2y-sample.cpp:75:62
    local.tee   8
.Ltmp9:
    #DEBUG_VALUE: iv1 <- undef
    #DEBUG_VALUE: iv1 <- %157
    local.get   4
    v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
    .loc    2 75 46 is_stmt 0               # rgb2y-sample.cpp:75:46
    local.get   7
    i16x8.mul
    .loc    2 75 146                        # rgb2y-sample.cpp:75:146
    local.tee   9
    local.get   8
    local.get   4
    v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
    .loc    2 75 130                        # rgb2y-sample.cpp:75:130
    local.get   7
    i16x8.mul
    .loc    2 75 31                         # rgb2y-sample.cpp:75:31
    local.tee   8
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   9
    local.get   8
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
.Ltmp10:
    .loc    2 79 33 is_stmt 1               # rgb2y-sample.cpp:79:33
    local.tee   8
.Ltmp11:
    #DEBUG_VALUE: rg1 <- undef
    #DEBUG_VALUE: rg1 <- %157
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   6
    local.get   8
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
    i32.const   8
    .loc    2 79 18 is_stmt 0               # rgb2y-sample.cpp:79:18
    i16x8.shr_u
    .loc    2 71 15 is_stmt 1               # rgb2y-sample.cpp:71:15
    local.get   2
    i32.const   32
    i32.add
    v128.load   0
    local.get   5
    v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp12:
    .loc    2 76 62                         # rgb2y-sample.cpp:76:62
    local.tee   6
.Ltmp13:
    #DEBUG_VALUE: iv2 <- undef
    #DEBUG_VALUE: iv2 <- %153
    local.get   4
    v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
    .loc    2 76 46 is_stmt 0               # rgb2y-sample.cpp:76:46
    local.get   7
    i16x8.mul
    .loc    2 76 146                        # rgb2y-sample.cpp:76:146
    local.tee   8
    local.get   6
    local.get   4
    v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
    .loc    2 76 130                        # rgb2y-sample.cpp:76:130
    local.get   7
    i16x8.mul
    .loc    2 76 31                         # rgb2y-sample.cpp:76:31
    local.tee   6
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   8
    local.get   6
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
.Ltmp14:
    .loc    2 0 31                          # rgb2y-sample.cpp:0:31
    local.tee   6
.Ltmp15:
    #DEBUG_VALUE: rg2 <- undef
    #DEBUG_VALUE: rg2 <- %153
    .loc    2 72 15 is_stmt 1               # rgb2y-sample.cpp:72:15
    local.get   2
    i32.const   48
    i32.add
    v128.load   0
    local.get   5
    v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp16:
    .loc    2 77 62                         # rgb2y-sample.cpp:77:62
    local.tee   5
.Ltmp17:
    #DEBUG_VALUE: iv3 <- undef
    #DEBUG_VALUE: iv3 <- %98
    local.get   4
    v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
    .loc    2 77 46 is_stmt 0               # rgb2y-sample.cpp:77:46
    local.get   7
    i16x8.mul
    .loc    2 77 146                        # rgb2y-sample.cpp:77:146
    local.tee   8
    local.get   5
    local.get   4
    v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
    .loc    2 77 130                        # rgb2y-sample.cpp:77:130
    local.get   7
    i16x8.mul
    .loc    2 77 31                         # rgb2y-sample.cpp:77:31
    local.tee   4
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   8
    local.get   4
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
.Ltmp18:
    .loc    2 80 33 is_stmt 1               # rgb2y-sample.cpp:80:33
    local.tee   4
.Ltmp19:
    #DEBUG_VALUE: rg3 <- undef
    #DEBUG_VALUE: rg3 <- %93
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   6
    local.get   4
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
    i32.const   8
    .loc    2 80 18 is_stmt 0               # rgb2y-sample.cpp:80:18
    i16x8.shr_u
    .loc    2 81 21 is_stmt 1               # rgb2y-sample.cpp:81:21
    i8x16.narrow_i16x8_u
    .loc    2 81 19 is_stmt 0               # rgb2y-sample.cpp:81:19
    v128.store  0
.Ltmp20:
    #DEBUG_VALUE: rgba2y:i <- %170
    .loc    2 0 19                          # rgb2y-sample.cpp:0:19
    local.get   3
    local.tee   3
    local.set   2
.Ltmp21:
    .loc    2 68 11 is_stmt 1               # rgb2y-sample.cpp:68:11
    local.get   3
    i32.const   4
    i32.add
    local.tee   3
    i32.const   4
    .loc    2 68 14 is_stmt 0               # rgb2y-sample.cpp:68:14
    i32.shl
    .loc    2 68 30                         # rgb2y-sample.cpp:68:30
    local.get   1
    i32.le_u
.Ltmp22:
    .loc    2 68 2                          # rgb2y-sample.cpp:68:2
    br_if       0                               # 0: up to label1
.Ltmp23:
.LBB0_3:                                # %for.end
    end_loop
    end_block                               # label0:
    i32.const   0
.Ltmp24:
    .loc    2 84 2 is_stmt 1                # rgb2y-sample.cpp:84:2
                                        # fallthrough-return
    end_function
.Ltmp25:
.Lfunc_end0:
    .size   rgba2y, .Lfunc_end0-rgba2y



md5-0fc10999e8a5de4fc0bc444109b7b851



--- WebAssembly code ---
index: 2
kind: wasm function
compiler: TurboFan
Body (size = 1088 = 1086 + 2 padding)
Instructions (size = 1064)
0xa5976359180     0  55             push rbp
0xa5976359181     1  4889e5         REX.W movq rbp,rsp
0xa5976359184     4  6a0a           push 0xa
0xa5976359186     6  56             push rsi
0xa5976359187     7  4883ec58       REX.W subq rsp,0x58
0xa597635918b     b  488b5e17       REX.W movq rbx,[rsi+0x17]
0xa597635918f     f  83fa40         cmpl rdx,0x40
0xa5976359192    12  0f8307000000   jnc 0xa597635919f  <+0x1f>
0xa5976359198    18  33c9           xorl rcx,rcx
0xa597635919a    1a  e990030000     jmp 0xa597635952f  <+0x3af>
0xa597635919f    1f  b94d000000     movl rcx,0x4d
0xa59763591a4    24  c5f96ec1       vmovd xmm0,rcx
0xa59763591a8    28  c5fb70c000     vpshuflw xmm0,xmm0,0x0
0xa59763591ad    2d  c5f970c000     vpshufd xmm0,xmm0,0x0
0xa59763591b2    32  33c9           xorl rcx,rcx
0xa59763591b4    34  c5f96ec9       vmovd xmm1,rcx
0xa59763591b8    38  c4410057ff     vxorps xmm15,xmm15,xmm15
0xa59763591bd    3d  c4c27100cf     vpshufb xmm1,xmm1,xmm15
0xa59763591c2    42  bf96000000     movl rdi,0x96
0xa59763591c7    47  c5f9c4c701     vpinsrw xmm0,xmm0,rdi,0x1
0xa59763591cc    4c  bf80ffffff     movl rdi,0xffffff80
0xa59763591d1    51  c5f928d1       vmovapd xmm2,xmm1
0xa59763591d5    55  c4e36920d700   vpinsrb xmm2,xmm2,dil,0x0
0xa59763591db    5b  41b81d000000   movl r8,0x1d
0xa59763591e1    61  c4c179c4c002   vpinsrw xmm0,xmm0,r8,0x2
0xa59763591e7    67  c4e36920d701   vpinsrb xmm2,xmm2,dil,0x1
0xa59763591ed    6d  41b901000000   movl r9,0x1
0xa59763591f3    73  c4c179c4c103   vpinsrw xmm0,xmm0,r9,0x3
0xa59763591f9    79  c4e36920d702   vpinsrb xmm2,xmm2,dil,0x2
0xa59763591ff    7f  41bba0000000   movl r11,0xa0
0xa5976359205    85  c4c179c4c305   vpinsrw xmm0,xmm0,r11,0x5
0xa597635920b    8b  c4e36920d703   vpinsrb xmm2,xmm2,dil,0x3
0xa5976359211    91  c4c179c4c006   vpinsrw xmm0,xmm0,r8,0x6
0xa5976359217    97  c4c179c4c107   vpinsrw xmm0,xmm0,r9,0x7
0xa597635921d    9d  488bf9         REX.W movq rdi,rcx
0xa5976359220    a0  41b804000000   movl r8,0x4
0xa5976359226    a6  e90b000000     jmp 0xa5976359236  <+0xb6>
0xa597635922b    ab  0f1f440000     nop
0xa5976359230    b0  498bf8         REX.W movq rdi,r8
0xa5976359233    b3  4d8bc1         REX.W movq r8,r9
0xa5976359236    b6  4c8b4e2f       REX.W movq r9,[rsi+0x2f]
0xa597635923a    ba  493b21         REX.W cmpq rsp,[r9]
0xa597635923d    bd  0f86f4020000   jna 0xa5976359537  <+0x3b7>
0xa5976359243    c3  458d4804       leal r9,[r8+0x4]
0xa5976359247    c7  4d8bd9         REX.W movq r11,r9
0xa597635924a    ca  41c1e304       shll r11, 4
0xa597635924e    ce  8d3cb8         leal rdi,[rax+rdi*4]
0xa5976359251    d1  c5fa6f1c3b     vmovdqu xmm3,[rbx+rdi*1]
0xa5976359256    d6  c5fa6f641f10   vmovdqu xmm4,[rdi+rbx*1+0x10]
0xa597635925c    dc  c5fa6f6c1f20   vmovdqu xmm5,[rdi+rbx*1+0x20]
0xa5976359262    e2  c5fa6f741f30   vmovdqu xmm6,[rdi+rbx*1+0x30]
0xa5976359268    e8  c57810fe       vmovups xmm15,xmm6
0xa597635926c    ec  49ba0001028004050680 REX.W movq r10,0x8006050480020100
0xa5976359276    f6  c441f96ec2     vmovq xmm8,r10
0xa597635927b    fb  49ba08090a800c0d0e80 REX.W movq r10,0x800e0d0c800a0908
0xa5976359285   105  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa597635928b   10b  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa5976359290   110  0f10fa         movups xmm7,xmm2
0xa5976359293   113  49ba8080800080808000 REX.W movq r10,0x80808000808080
0xa597635929d   11d  c441f96ec2     vmovq xmm8,r10
0xa59763592a2   122  4c8b15ecffffff REX.W movq r10,[rip+0xffffffec]
0xa59763592a9   129  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa59763592af   12f  c4c24100f8     vpshufb xmm7,xmm7,xmm8
0xa59763592b4   134  c4c141ebff     vpor xmm7,xmm7,xmm15
0xa59763592b9   139  c57810fd       vmovups xmm15,xmm5
0xa59763592bd   13d  4c8b15aaffffff REX.W movq r10,[rip+0xffffffaa]
0xa59763592c4   144  c441f96ec2     vmovq xmm8,r10
0xa59763592c9   149  4c8b15adffffff REX.W movq r10,[rip+0xffffffad]
0xa59763592d0   150  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa59763592d6   156  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa59763592db   15b  0f10f2         movups xmm6,xmm2
0xa59763592de   15e  4c8b15b0ffffff REX.W movq r10,[rip+0xffffffb0]
0xa59763592e5   165  c441f96ec2     vmovq xmm8,r10
0xa59763592ea   16a  4c8b15a4ffffff REX.W movq r10,[rip+0xffffffa4]
0xa59763592f1   171  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa59763592f7   177  c4c24900f0     vpshufb xmm6,xmm6,xmm8
0xa59763592fc   17c  c4c149ebf7     vpor xmm6,xmm6,xmm15
0xa5976359301   181  c57810fc       vmovups xmm15,xmm4
0xa5976359305   185  4c8b1562ffffff REX.W movq r10,[rip+0xffffff62]
0xa597635930c   18c  c441f96ec2     vmovq xmm8,r10
0xa5976359311   191  4c8b1565ffffff REX.W movq r10,[rip+0xffffff65]
0xa5976359318   198  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa597635931e   19e  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa5976359323   1a3  0f10ea         movups xmm5,xmm2
0xa5976359326   1a6  4c8b1568ffffff REX.W movq r10,[rip+0xffffff68]
0xa597635932d   1ad  c441f96ec2     vmovq xmm8,r10
0xa5976359332   1b2  4c8b155cffffff REX.W movq r10,[rip+0xffffff5c]
0xa5976359339   1b9  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa597635933f   1bf  c4c25100e8     vpshufb xmm5,xmm5,xmm8
0xa5976359344   1c4  c4c151ebef     vpor xmm5,xmm5,xmm15
0xa5976359349   1c9  c57810fb       vmovups xmm15,xmm3
0xa597635934d   1cd  4c8b151affffff REX.W movq r10,[rip+0xffffff1a]
0xa5976359354   1d4  c441f96ec2     vmovq xmm8,r10
0xa5976359359   1d9  4c8b151dffffff REX.W movq r10,[rip+0xffffff1d]
0xa5976359360   1e0  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa5976359366   1e6  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa597635936b   1eb  0f10e2         movups xmm4,xmm2
0xa597635936e   1ee  4c8b1520ffffff REX.W movq r10,[rip+0xffffff20]
0xa5976359375   1f5  c441f96ec2     vmovq xmm8,r10
0xa597635937a   1fa  4c8b1514ffffff REX.W movq r10,[rip+0xffffff14]
0xa5976359381   201  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa5976359387   207  c4c25900e0     vpshufb xmm4,xmm4,xmm8
0xa597635938c   20c  c4c159ebe7     vpor xmm4,xmm4,xmm15
0xa5976359391   211  c5f928df       vmovapd xmm3,xmm7
0xa5976359395   215  c5e168d9       vpunpckhbw xmm3,xmm3,xmm1
0xa5976359399   219  c5c160f9       vpunpcklbw xmm7,xmm7,xmm1
0xa597635939d   21d  c57928c6       vmovapd xmm8,xmm6
0xa59763593a1   221  c53968c1       vpunpckhbw xmm8,xmm8,xmm1
0xa59763593a5   225  c5c960f1       vpunpcklbw xmm6,xmm6,xmm1
0xa59763593a9   229  c57928cd       vmovapd xmm9,xmm5
0xa59763593ad   22d  c53168c9       vpunpckhbw xmm9,xmm9,xmm1
0xa59763593b1   231  c5d160e9       vpunpcklbw xmm5,xmm5,xmm1
0xa59763593b5   235  c57928d4       vmovapd xmm10,xmm4
0xa59763593b9   239  c52968d1       vpunpckhbw xmm10,xmm10,xmm1
0xa59763593bd   23d  c5d960e1       vpunpcklbw xmm4,xmm4,xmm1
0xa59763593c1   241  c5e1d5d8       vpmullw xmm3,xmm3,xmm0
0xa59763593c5   245  c5c1d5f8       vpmullw xmm7,xmm7,xmm0
0xa59763593c9   249  c539d5c0       vpmullw xmm8,xmm8,xmm0
0xa59763593cd   24d  c5c9d5f0       vpmullw xmm6,xmm6,xmm0
0xa59763593d1   251  c531d5c8       vpmullw xmm9,xmm9,xmm0
0xa59763593d5   255  c5d1d5e8       vpmullw xmm5,xmm5,xmm0
0xa59763593d9   259  c529d5d0       vpmullw xmm10,xmm10,xmm0
0xa59763593dd   25d  c5d9d5e0       vpmullw xmm4,xmm4,xmm0
0xa59763593e1   261  c57928df       vmovapd xmm11,xmm7
0xa59763593e5   265  c44101efff     vpxor xmm15,xmm15,xmm15
0xa59763593ea   26a  c463010efb55   vpblendw xmm15,xmm15,xmm3,0x55
0xa59763593f0   270  c443210edfaa   vpblendw xmm11,xmm11,xmm15,0xaa
0xa59763593f6   276  c442212bdf     vpackusdw xmm11,xmm11,xmm15
0xa59763593fb   27b  c57810fb       vmovups xmm15,xmm3
0xa59763593ff   27f  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359405   285  c5c172d710     vpsrld xmm7,xmm7,16
0xa597635940a   28a  c4c2412bff     vpackusdw xmm7,xmm7,xmm15
0xa597635940f   28f  c5f928de       vmovapd xmm3,xmm6
0xa5976359413   293  c44101efff     vpxor xmm15,xmm15,xmm15
0xa5976359418   298  c443010ef855   vpblendw xmm15,xmm15,xmm8,0x55
0xa597635941e   29e  c4c3610edfaa   vpblendw xmm3,xmm3,xmm15,0xaa
0xa5976359424   2a4  c4c2612bdf     vpackusdw xmm3,xmm3,xmm15
0xa5976359429   2a9  c4417810f8     vmovups xmm15,xmm8
0xa597635942e   2ae  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359434   2b4  c5c972d610     vpsrld xmm6,xmm6,16
0xa5976359439   2b9  c4c2492bf7     vpackusdw xmm6,xmm6,xmm15
0xa597635943e   2be  c57928c5       vmovapd xmm8,xmm5
0xa5976359442   2c2  c44101efff     vpxor xmm15,xmm15,xmm15
0xa5976359447   2c7  c443010ef955   vpblendw xmm15,xmm15,xmm9,0x55
0xa597635944d   2cd  c443390ec7aa   vpblendw xmm8,xmm8,xmm15,0xaa
0xa5976359453   2d3  c442392bc7     vpackusdw xmm8,xmm8,xmm15
0xa5976359458   2d8  c4417810f9     vmovups xmm15,xmm9
0xa597635945d   2dd  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359463   2e3  c5d172d510     vpsrld xmm5,xmm5,16
0xa5976359468   2e8  c4c2512bef     vpackusdw xmm5,xmm5,xmm15
0xa597635946d   2ed  c57928cc       vmovapd xmm9,xmm4
0xa5976359471   2f1  c44101efff     vpxor xmm15,xmm15,xmm15
0xa5976359476   2f6  c443010efa55   vpblendw xmm15,xmm15,xmm10,0x55
0xa597635947c   2fc  c443310ecfaa   vpblendw xmm9,xmm9,xmm15,0xaa
0xa5976359482   302  c442312bcf     vpackusdw xmm9,xmm9,xmm15
0xa5976359487   307  c4417810fa     vmovups xmm15,xmm10
0xa597635948c   30c  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359492   312  c5d972d410     vpsrld xmm4,xmm4,16
0xa5976359497   317  c4c2592be7     vpackusdw xmm4,xmm4,xmm15
0xa597635949c   31c  c4c141fdfb     vpaddw xmm7,xmm7,xmm11
0xa59763594a1   321  c5c9fdf3       vpaddw xmm6,xmm6,xmm3
0xa59763594a5   325  c4c151fde8     vpaddw xmm5,xmm5,xmm8
0xa59763594aa   32a  c4c159fde1     vpaddw xmm4,xmm4,xmm9
0xa59763594af   32f  c5f928de       vmovapd xmm3,xmm6
0xa59763594b3   333  c44101efff     vpxor xmm15,xmm15,xmm15
0xa59763594b8   338  c463010eff55   vpblendw xmm15,xmm15,xmm7,0x55
0xa59763594be   33e  c4c3610edfaa   vpblendw xmm3,xmm3,xmm15,0xaa
0xa59763594c4   344  c4c2612bdf     vpackusdw xmm3,xmm3,xmm15
0xa59763594c9   349  c57810ff       vmovups xmm15,xmm7
0xa59763594cd   34d  c4c10172d710   vpsrld xmm15,xmm15,16
0xa59763594d3   353  c5c972d610     vpsrld xmm6,xmm6,16
0xa59763594d8   358  c4c2492bf7     vpackusdw xmm6,xmm6,xmm15
0xa59763594dd   35d  c5f928fc       vmovapd xmm7,xmm4
0xa59763594e1   361  c44101efff     vpxor xmm15,xmm15,xmm15
0xa59763594e6   366  c463010efd55   vpblendw xmm15,xmm15,xmm5,0x55
0xa59763594ec   36c  c4c3410effaa   vpblendw xmm7,xmm7,xmm15,0xaa
0xa59763594f2   372  c4c2412bff     vpackusdw xmm7,xmm7,xmm15
0xa59763594f7   377  c57810fd       vmovups xmm15,xmm5
0xa59763594fb   37b  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359501   381  c5d972d410     vpsrld xmm4,xmm4,16
0xa5976359506   386  c4c2592be7     vpackusdw xmm4,xmm4,xmm15
0xa597635950b   38b  c5c9fdf3       vpaddw xmm6,xmm6,xmm3
0xa597635950f   38f  c5d9fde7       vpaddw xmm4,xmm4,xmm7
0xa5976359513   393  c5c971d608     vpsrlw xmm6,xmm6,8
0xa5976359518   398  c5d971d408     vpsrlw xmm4,xmm4,8
0xa597635951d   39d  c5d967e6       vpackuswb xmm4,xmm4,xmm6
0xa5976359521   3a1  c5fa7f243b     vmovdqu [rbx+rdi*1],xmm4
0xa5976359526   3a6  443bda         cmpl r11,rdx
0xa5976359529   3a9  0f8601fdffff   jna 0xa5976359230  <+0xb0>
0xa597635952f   3af  488bc1         REX.W movq rax,rcx
0xa5976359532   3b2  488be5         REX.W movq rsp,rbp
0xa5976359535   3b5  5d             pop rbp
0xa5976359536   3b6  c3             retl
0xa5976359537   3b7  488955e8       REX.W movq [rbp-0x18],rdx
0xa597635953b   3bb  48895de0       REX.W movq [rbp-0x20],rbx
0xa597635953f   3bf  c5f81145d0     vmovups [rbp-0x30],xmm0
0xa5976359544   3c4  c5f8114dc0     vmovups [rbp-0x40],xmm1
0xa5976359549   3c9  c5f81155b0     vmovups [rbp-0x50],xmm2
0xa597635954e   3ce  488945a8       REX.W movq [rbp-0x58],rax
0xa5976359552   3d2  48897da0       REX.W movq [rbp-0x60],rdi
0xa5976359556   3d6  4c894598       REX.W movq [rbp-0x68],r8
0xa597635955a   3da  e8615dffff     call 0xa597634f2c0       ;; wasm stub: WasmStackGuard
0xa597635955f   3df  33c9           xorl rcx,rcx
0xa5976359561   3e1  488b55e8       REX.W movq rdx,[rbp-0x18]
0xa5976359565   3e5  488b5de0       REX.W movq rbx,[rbp-0x20]
0xa5976359569   3e9  c5f81045d0     vmovups xmm0,[rbp-0x30]
0xa597635956e   3ee  c5f8104dc0     vmovups xmm1,[rbp-0x40]
0xa5976359573   3f3  c5f81055b0     vmovups xmm2,[rbp-0x50]
0xa5976359578   3f8  488b45a8       REX.W movq rax,[rbp-0x58]
0xa597635957c   3fc  488b7da0       REX.W movq rdi,[rbp-0x60]
0xa5976359580   400  4c8b4598       REX.W movq r8,[rbp-0x68]
0xa5976359584   404  488b75f0       REX.W movq rsi,[rbp-0x10]
0xa5976359588   408  e9b6fcffff     jmp 0xa5976359243  <+0xc3>
0xa597635958d   40d  e8fe5affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa5976359592   412  e8f95affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa5976359597   417  e8f45affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa597635959c   41c  e8ef5affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa59763595a1   421  e8ea5affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa59763595a6   426  90             nop
0xa59763595a7   427  90             nop

Protected instructions:
 pc offset  land pad
       3a1       40d
        e2       412
        dc       417
        d6       41c
        d1       421

Source positions:
 pc offset  position
        d1        43
        d6       239
        dc       416
        e2       545
       3a1       722
       3b7        29
       40d       722
       412       545
       417       416
       41c       239
       421        43

Safepoints (size = 22)
0xa5a7635917fffffffff  000000000000000 (sp -> fp)

RelocInfo (size = 8)
0xa597635955b  wasm stub call
0xa597635958e  wasm stub call
0xa5976359593  wasm stub call
0xa5976359598  wasm stub call
0xa597635959d  wasm stub call
0xa59763595a2  wasm stub call

--- End code ---

Most helpful comment

Just to summarize... I'm asking for this help because it looks like using WebAssembly SIMD with Node JS 14 could provide a more portable solution for the libraries I'm maintaining than a custom C++ library. Personally, I have no intended browser cases, however, hey, why should I cut support if the performance is equivalent?

All 37 comments

The reason we don't use v128.const for this is that v128.const was only recently implemented in V8. To avoid breaking origin trial users, we can't update LLVM to emit v128.const until the relevant V8 patches roll into Chrome stable. I'm keeping an eye on this dashboard to determine when will be a good time to make this change. If you're using a more recent build of Chrome or some other execution environment that does support v128.const, you can try compiling your project with the -munimplemented-simd128 flag, which will enable v128.const in LLVM (but might also introduce other changes that you don't want). Once v128.const is widely available, it will be better for LLVM to use v128.const than to load vectors from memory because that allows the engine to determine the best way to materialize vectors given the runtime platform.

It also might be worth considering porting performance-sensitive parts of your code to use the WebAssembly intrinsics header directly rather than relying on emulated SSE. That would reduce a layer of impedance mismatch between your code and the underlying machine code.

Finally, if you notice suboptimal instruction selection anywhere, it would be helpful if you could file LLVM bugs (if it's on the code -> wasm side) or V8 bugs (if it's on the wasm -> native side) about the specific issues you see. That kind of feedback is extremely valuable to us.

Hey Thomas,

Thanks for the quick response.

That sounds like an ideal solution -- but you must have a solution that exists for handling constant string data without v128.const. Why not use that?

Also, I'm totally fine with using direct wasm intrinsics -- except there is no hadd in the WASM SIMD proposal (even though it's pretty well supported across architectures). Right now it's two shuffles and an add -- which is fine I think.

Also, it's very easy for me to control the alignment of the data. Is there a way for me to instruct wasm that the data is aligned?

Dan

> That sounds like an ideal solution -- but you must have a solution that exists for handling constant string data without v128.const. Why not use that?

It certainly would have been more optimal to use loads in the interim, but I wrote the code to use the simplest possible fallback rather than the optimal fallback since I knew that v128.const would be available before SIMD was standardized. If I had known how long it would be unavailable, I might have implemented constant loads instead, but it's probably not worth fixing now for just the remaining time before v128.const hits Chrome stable.

> Also, it's very easy for me to control the alignment of the data. Is there a way for me to instruct wasm that the data is aligned?

When you load or store from a v128_t*, LLVM will assume that the access is aligned unless you tell it otherwise with an attribute. wasm_v128_load and other intrinsics from wasm_simd128.h, on the other hand, do not assume aligned accesses. Right now the only way to control the alignment hint on the other load instructions is to call the underlying target-specific builtins directly, which I don't recommend because they're not meant to be a stable interface.

> That sounds like an ideal solution -- but you must have a solution that exists for handling constant string data without v128.const. Why not use that?

> It certainly would have been more optimal to use loads in the interim, but I wrote the code to use the simplest possible fallback rather than the optimal fallback since I knew that v128.const would be available before SIMD was standardized. If I had known how long it would be unavailable, I might have implemented constant loads instead, but it's probably not worth fixing now for just the remaining time before v128.const hits Chrome stable.

I'm not sure I agree. You're worried about Chrome. I'm worried about Node. Unless something I don't know about happens, it looks like there is a 0% chance that v128.const support will make it into the next LTS release of Node (14). That means it could be years before we get that functionality.

Hmm, that's unfortunate. I'm not sure it's too different from the situation with other staging features, though, where if it's not stable and shipped when the LTS is cut, it probably shouldn't be used with the LTS. When does the LTS get cut, though? If it's not too soon, it might not be too difficult for me to slip this change into an Emscripten release before then. If we did that, you would still be stuck on the last version of Emscripten before the change to use v128.const by default for the lifetime of that Node LTS, though. I'm not sure that's any better than just considering this LTS to not support SIMD.

I'm pretty sure Node 14 LTS comes out later this year. It supports all SIMD features except the const...

LTS date: 2020-10-27

> Also, it's very easy for me to control the alignment of the data. Is there a way for me to instruct wasm that the data is aligned?

> When you load or store from a v128_t*, LLVM will assume that the access is aligned unless you tell it otherwise with an attribute. wasm_v128_load and other intrinsics from wasm_simd128.h, on the other hand, do not assume aligned accesses. Right now the only way to control the alignment hint on the other load instructions is to call the underlying target-specific builtins directly, which I don't recommend because they're not meant to be a stable interface.

In this case, it appears that v8 in my version of NodeJS is treating everything as unaligned. :-(

> In this case, it appears that v8 in my version of NodeJS is treating everything as unaligned. :-(

In WebAssembly, alignment hints are only hints, so engines cannot depend on them to be correct. One implementation strategy is to install trap handlers to catch and work around alignment faults, but V8 uses the other strategy, which is to ignore the alignment hints and assume everything is unaligned.

Just to summarize... I'm asking for this help because it looks like using WebAssembly SIMD with Node JS 14 could provide a more portable solution for the libraries I'm maintaining than a custom C++ library. Personally, I have no intended browser cases, however, hey, why should I cut support if the performance is equivalent?

@tlively I'm looking at the link you left on stackoverflow -- https://github.com/llvm/llvm-project/blob/c193a689b475f91e63adb25dc5855f7a7f068c9a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp#L1451-L1610

Just for grins -- which branch/tag/repo do I need to pull from to create the pull request to fix this?

Emscripten uses tip of tree LLVM, so you can just clone https://github.com/llvm/llvm-project/tree/master/llvm. Unfortunately LLVM doesn't accept GitHub PRs, but if you want to send me any patch you come up with, I can take care of getting it reviewed and landed :)

Yay... Okay... I guess I first need to figure out how to create a read only global. Then I need to figure out a way to build and test it. Do you have a pointer to a guide on getting set up for this with emscripten?

I don't think there's a guide for LLVM development with Emscripten, but thankfully it's not too complicated. If you install emscripten via emsdk, it will automatically set up Emscripten itself as well as LLVM and Binaryen. There will be a .emscripten config file in your emsdk directory that you can edit to point to your custom build of LLVM, and everything should just work normally at that point.

The emscripten developer's guide may be helpful: https://emscripten.org/docs/contributing/developers_guide.html

This is really hard without any background in the internals of llvm. Do you happen to know where I can look for how it creates string constants and puts them in rodata?

String constants are a little different because they start out as global variables in llvm IR, so they're placed into memory via the normal data symbol mechanisms. I believe what you want here is to call DAG.getTargetConstantPool() in lowerBUILD_VECTOR to create a constant pool from which the v128 constant will be loaded. Looking into this a little more, it looks like the WebAssembly backend doesn't use constant pools for anything yet, so lowering them correctly would probably be a fair amount of extra work. I'm not actually sure what all the issues to resolve would be :(

Backing up a bit, are you constrained to using Node LTS or could you use a non-LTS version once that includes the fully standardized SIMD implementation?

I'm pretty constrained on LTS.

However, there's an alternative way to implement this without creating a constant pool if I can create a smart splat. It's not as efficient as a constant load, but it's significantly better than the current version.

Is the maximum number of operands for this operation limited to 16?

The code already chooses what value to splat such that the number of replace_lanes is minimized. Do you have an idea for how to improve on that? Yes, vectors can have at most 16 lanes.

> The code already chooses what value to splat such that the number of replace_lanes is minimized. Do you have an idea for how to improve on that? Yes, vectors can have at most 16 lanes.

Yes. Any constant of 128 bits can be represented by two 64 bit values. Thus 1 load and 1 insert in the worst case. In the event of the two 64 bit values matching, then it can be performed with exactly 1 splat. Neither is as efficient as loading from a constant but both are much better.

Oh right, that is a good idea! And it shouldn't require much interaction with LLVM APIs beyond what's already in that function.

Do we know the data type coming in from LLVM? Is it already lexed?--

This solution should work for all integer types and any 2 doubles. It's debatable about whether or not it could work for floats -- because even if we pack 64 bits of float data into a double, it's not really a double at that point. We _could_ convert them to integers representative of IEEE754 floats and pack them as integers, but it's not quite the right way to do it.

Yes, we already know the number and type of the lanes and we can tell if a lane is constant. Any logic to combine replace_lanes should go here. I would ignore floating point types for now and only try combining lanes for integers.

@tlively I went a different way of implementation that I thought would be a bit more suitable -- code below, but don't understand how to handle this particular error.

else if (NumConstantLanes >= NumSplatLanes &&
             Subtarget->hasUnimplementedSIMD128()) {
    SmallVector<SDValue, 16> ConstLanes;
    for (const SDValue &Lane : Op->op_values()) {
      if (IsConstant(Lane)) {
        ConstLanes.push_back(Lane);
      } else if (LaneT.isFloatingPoint()) {
        ConstLanes.push_back(DAG.getConstantFP(0, DL, LaneT));
      } else {
        ConstLanes.push_back(DAG.getConstant(0, DL, LaneT));
      }
    }
    Result = DAG.getBuildVector(VecT, DL, ConstLanes);
    IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
      return IsConstant(Lane);
    };
  } else if (NumConstantLanes >= NumSplatLanes && (VecT == MVT::v16i8  || VecT == MVT::v8i16 || VecT == MVT::v4i32 || VecT == MVT::v2i64)) {

      std::array<uint64_t, 2> valuearray({0, 0});
      unsigned i = 0;
      size_t byteStep = VecT.getScalarType().getStoreSize().getFixedSize();
      for (const SDValue &Lane : Op->op_values()) {

        if (IsConstant(Lane)) {
            using llvm::support::endian::byte_swap;
            using llvm::support::little;
            uint8_t* pByteArray = reinterpret_cast<uint8_t*>(valuearray.data());
            // endianness of the compiler matters here.
            // little endian has least significant bit first which is advantageous for this.
            // so a 16 bit integer will be front loaded in a uint64 from byte packing perspective.

            ConstantSDNode *pConstantSDNode = cast<ConstantSDNode>(Lane.getNode());
            uint64_t val =  byte_swap(pConstantSDNode->getLimitedValue(), little);
            uint8_t* pVal = reinterpret_cast<uint8_t*>(&val);
            std::copy(pVal,pVal+byteStep,pByteArray+i*byteStep);
        }
        ++i;
    }




    Result = DAG.getSplatBuildVector(MVT::v2i64, DL, DAG.getConstant(valuearray[0], DL, MVT::i64));
    if (valuearray[0] != valuearray[1]) {
        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i64, Result, DAG.getConstant(valuearray[1], DL, MVT::i64),
                           DAG.getConstant(1, DL, MVT::i32));
    }

    IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
      return IsConstant(Lane);
    };
  }

Error Message:

fatal error: error in backend: Cannot select: t442: v16i8 = WebAssemblyISD::SHUFFLE t390, t503, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, ./simdpp/detail/insn/permute4.h:100:12 @[ ./simdpp/core/permute4.h:106:12 @[ rgb2y-sample.cpp:182:26 ] ]
  t390: v16i8 = bitcast t389, ./simdpp/types/int32x4.h:161:48 @[ ./simdpp/core/permute4.h:105:22 @[ rgb2y-sample.cpp:182:26 ] ]
    t389: v4i32 = add t387, t388, ./simdpp/detail/insn/i_add.h:99:12 @[ ./simdpp/detail/expr/i_add.h:28:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:181:12 ] ] ] ] ]
      t387: v4i32 = add t383, t386, ./simdpp/detail/insn/i_add.h:99:12 @[ ./simdpp/detail/expr/i_add.h:28:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:180:12 ] ] ] ] ]
        t383: v4i32 = add t323, t382, ./simdpp/detail/insn/i_add.h:99:12 @[ ./simdpp/detail/expr/i_add.h:28:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:179:12 ] ] ] ] ]
          t323: v4i32 = mul nuw nsw t419, t420, ./simdpp/detail/insn/i_mul_lo.h:69:12 @[ ./simdpp/detail/expr/i_mul.h:29:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/generic.h:229:52 @[ rgb2y-sample.cpp:157:41 ] ] ] ] ]
            t419: v4i32 = WebAssemblyISD::WIDEN_HIGH_U t428, rgb2y-sample.cpp:154:41
              t428: v8i16 = WebAssemblyISD::WIDEN_HIGH_U t17, rgb2y-sample.cpp:120:68
                t17: v16i8,ch = load<(load 16 from %ir.23, !tbaa !1934)> t0, t16, undef:i32, ./simdpp/detail/insn/load.h:34:9 @[ ./simdpp/detail/insn/load.h:182:5 @[ ./simdpp/detail/insn/load.h:191:9 @[ ./simdpp/detail/construct_eval.h:46:5 @[ ./simdpp/types/int8x16.h:150:9 @[ rgb2y-sample.cpp:38:31 ] ] ] ] ]
                  t16: i32 = add t15, t9, rgb2y-sample.cpp:38:67


                  t12: i32 = undef
            t420: v4i32 = WebAssemblyISD::WIDEN_HIGH_U t430, rgb2y-sample.cpp:153:42
              t430: v8i16 = WebAssemblyISD::WIDEN_HIGH_U t13, rgb2y-sample.cpp:119:69
                t13: v16i8,ch = load<(load 16 from %ir.22, !tbaa !1934)> t0, t10, undef:i32, ./simdpp/detail/insn/load.h:34:9 @[ ./simdpp/detail/insn/load.h:182:5 @[ ./simdpp/detail/insn/load.h:191:9 @[ ./simdpp/detail/construct_eval.h:46:5 @[ ./simdpp/types/int8x16.h:150:9 @[ rgb2y-sample.cpp:37:32 ] ] ] ] ]
                  t10: i32 = add t7, t9, rgb2y-sample.cpp:37:68


                  t12: i32 = undef
          t382: v4i32 = bitcast t484, ./simdpp/types/empty_expr.h:207:48 @[ ./simdpp/detail/eval_scalar.h:25:57 @[ ./simdpp/detail/expr/i_add.h:30:17 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:179:12 ] ] ] ] ] ]
            t484: v16i8 = WebAssemblyISD::SHUFFLE t503, t380, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, Constant:i32<16>, Constant:i32<17>, Constant:i32<18>, Constant:i32<19>, Constant:i32<20>, Constant:i32<21>, Constant:i32<22>, Constant:i32<23>, Constant:i32<24>, Constant:i32<25>, Constant:i32<26>, Constant:i32<27>, ./simdpp/detail/insn/move_r.h:33:12 @[ ./simdpp/detail/insn/move_r.h:115:24 @[ ./simdpp/detail/insn/move_r.h:299:52 @[ ./simdpp/core/move_r.h:106:12 @[ rgb2y-sample.cpp:179:22 ] ] ] ]
              t503: v2i64 = BUILD_VECTOR Constant:i64<0>, Constant:i64<0>, ./simdpp/detail/insn/move_r.h:33:12 @[ ./simdpp/detail/insn/move_r.h:115:24 @[ ./simdpp/detail/insn/move_r.h:299:52 @[ ./simdpp/core/move_r.h:106:12 @[ rgb2y-sample.cpp:58:20 ] ] ] ]
                t502: i64 = Constant<0>
                t502: i64 = Constant<0>
              t380: v16i8 = bitcast t323, ./simdpp/types/int32x4.h:161:48 @[ ./simdpp/core/move_r.h:105:22 @[ rgb2y-sample.cpp:179:22 ] ]
                t323: v4i32 = mul nuw nsw t419, t420, ./simdpp/detail/insn/i_mul_lo.h:69:12 @[ ./simdpp/detail/expr/i_mul.h:29:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/generic.h:229:52 @[ rgb2y-sample.cpp:157:41 ] ] ] ] ]
                  t419: v4i32 = WebAssemblyISD::WIDEN_HIGH_U t428, rgb2y-sample.cpp:154:41

                  t420: v4i32 = WebAssemblyISD::WIDEN_HIGH_U t430, rgb2y-sample.cpp:153:42

              t37: i32 = Constant<12>
              t38: i32 = Constant<13>
              t39: i32 = Constant<14>
              t40: i32 = Constant<15>
              t41: i32 = Constant<16>
              t42: i32 = Constant<17>
              t43: i32 = Constant<18>
              t44: i32 = Constant<19>
              t45: i32 = Constant<20>
              t46: i32 = Constant<21>
              t47: i32 = Constant<22>
              t48: i32 = Constant<23>
              t49: i32 = Constant<24>
              t50: i32 = Constant<25>
              t51: i32 = Constant<26>
              t52: i32 = Constant<27>
        t386: v4i32 = bitcast t464, ./simdpp/types/empty_expr.h:207:48 @[ ./simdpp/detail/eval_scalar.h:25:57 @[ ./simdpp/detail/expr/i_add.h:30:17 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:180:12 ] ] ] ] ] ]
          t464: v16i8 = WebAssemblyISD::SHUFFLE t503, t384, Constant:i32<8>, Constant:i32<9>, Constant:i32<10>, Constant:i32<11>, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, Constant:i32<16>, Constant:i32<17>, Constant:i32<18>, Constant:i32<19>, Constant:i32<20>, Constant:i32<21>, Constant:i32<22>, Constant:i32<23>, ./simdpp/detail/insn/move_r.h:33:12 @[ ./simdpp/detail/insn/move_r.h:115:24 @[ ./simdpp/detail/insn/move_r.h:299:52 @[ ./simdpp/core/move_r.h:106:12 @[ rgb2y-sample.cpp:180:22 ] ] ] ]
            t503: v2i64 = BUILD_VECTOR Constant:i64<0>, Constant:i64<0>, ./simdpp/detail/insn/move_r.h:33:12 @[ ./simdpp/detail/insn/move_r.h:115:24 @[ ./simdpp/detail/insn/move_r.h:299:52 @[ ./simdpp/core/move_r.h:106:12 @[ rgb2y-sample.cpp:58:20 ] ] ] ]
              t502: i64 = Constant<0>
              t502: i64 = Constant<0>
            t384: v16i8 = bitcast t383, ./simdpp/types/int32x4.h:161:48 @[ ./simdpp/core/move_r.h:105:22 @[ rgb2y-sample.cpp:180:22 ] ]
              t383: v4i32 = add t323, t382, ./simdpp/detail/insn/i_add.h:99:12 @[ ./simdpp/detail/expr/i_add.h:28:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:179:12 ] ] ] ] ]
                t323: v4i32 = mul nuw nsw t419, t420, ./simdpp/detail/insn/i_mul_lo.h:69:12 @[ ./simdpp/detail/expr/i_mul.h:29:20 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/generic.h:229:52 @[ rgb2y-sample.cpp:157:41 ] ] ] ] ]
                  t419: v4i32 = WebAssemblyISD::WIDEN_HIGH_U t428, rgb2y-sample.cpp:154:41

                  t420: v4i32 = WebAssemblyISD::WIDEN_HIGH_U t430, rgb2y-sample.cpp:153:42

                t382: v4i32 = bitcast t484, ./simdpp/types/empty_expr.h:207:48 @[ ./simdpp/detail/eval_scalar.h:25:57 @[ ./simdpp/detail/expr/i_add.h:30:17 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:179:12 ] ] ] ] ] ]
                  t484: v16i8 = WebAssemblyISD::SHUFFLE t503, t380, Constant:i32<12>, Constant:i32<13>, Constant:i32<14>, Constant:i32<15>, Constant:i32<16>, Constant:i32<17>, Constant:i32<18>, Constant:i32<19>, Constant:i32<20>, Constant:i32<21>, Constant:i32<22>, Constant:i32<23>, Constant:i32<24>, Constant:i32<25>, Constant:i32<26>, Constant:i32<27>, ./simdpp/detail/insn/move_r.h:33:12 @[ ./simdpp/detail/insn/move_r.h:115:24 @[ ./simdpp/detail/insn/move_r.h:299:52 @[ ./simdpp/core/move_r.h:106:12 @[ rgb2y-sample.cpp:179:22 ] ] ] ]


















            t57: i32 = Constant<8>
            t58: i32 = Constant<9>
            t59: i32 = Constant<10>
            t60: i32 = Constant<11>
            t37: i32 = Constant<12>
            t38: i32 = Constant<13>
            t39: i32 = Constant<14>
            t40: i32 = Constant<15>
            t41: i32 = Constant<16>
            t42: i32 = Constant<17>
            t43: i32 = Constant<18>
            t44: i32 = Constant<19>
            t45: i32 = Constant<20>
            t46: i32 = Constant<21>
            t47: i32 = Constant<22>
            t48: i32 = Constant<23>
      t388: v4i32,ch = load<(dereferenceable load 16 from %ir.17)> t379, TargetFrameIndex:i32<4>, undef:i32, ./simdpp/types/int32x4.h:161:48 @[ ./simdpp/detail/eval_scalar.h:25:57 @[ ./simdpp/detail/expr/i_add.h:30:17 @[ ./simdpp/expr.inl:43:47 @[ ./simdpp/types/generic.h:231:14 @[ ./simdpp/types/int32x4.h:134:49 @[ rgb2y-sample.cpp:181:12 ] ] ] ] ] ]
        t508: i32 = TargetFrameIndex<4>
        t12: i32 = undef
  t503: v2i64 = BUILD_VECTOR Constant:i64<0>, Constant:i64<0>, ./simdpp/detail/insn/move_r.h:33:12 @[ ./simdpp/detail/insn/move_r.h:115:24 @[ ./simdpp/detail/insn/move_r.h:299:52 @[ ./simdpp/core/move_r.h:106:12 @[ rgb2y-sample.cpp:58:20 ] ] ] ]
    t502: i64 = Constant<0>
    t502: i64 = Constant<0>
  t37: i32 = Constant<12>
  t38: i32 = Constant<13>
  t39: i32 = Constant<14>
  t40: i32 = Constant<15>
  t37: i32 = Constant<12>
  t38: i32 = Constant<13>
  t39: i32 = Constant<14>
  t40: i32 = Constant<15>
  t37: i32 = Constant<12>
  t38: i32 = Constant<13>
  t39: i32 = Constant<14>
  t40: i32 = Constant<15>
  t37: i32 = Constant<12>
  t38: i32 = Constant<13>
  t39: i32 = Constant<14>
  t40: i32 = Constant<15>
In function: computeSumMatrixForwardSimd2PassAll
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0.      Program arguments: /home/dan/git/llvm-project/llvm/build/bin/clang++ -target wasm32-unknown-emscripten -D__EMSCRIPTEN_major__=2 -D__EMSCRIPTEN_minor__=0 -D__EMSCRIPTEN_tiny__=4 -D_LIBCPP_ABI_VERSION=2 -Dunix -D__unix -D__unix__ -Werror=implicit-function-declaration -Xclang -nostdsysteminc -D__SSE__=1 -D__SSE2__=1 -D__SSE3__=1 -D__SSSE3__=1 -D__SSE4_1__=1 -D__SSE4_2__=1 -D__AVX__=1 -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include/libcxx -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/lib/libcxxabi/include -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include/compat -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include/libc -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/lib/libc/musl/arch/emscripten -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/local/include -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include/SSE -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include/neon -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/lib/compiler-rt/include -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/lib/libunwind/include -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/cache/wasm/include -DEMSCRIPTEN -fignore-exceptions -g -O3 -msimd128 -I. -S rgb2y-sample.cpp -Xclang -isystem/home/dan/applications/emsdk/upstream/emscripten/system/include/SDL -c -o rgb2y-sample.o -mllvm -combiner-global-alias-analysis=false -mllvm -enable-emscripten-sjlj -mllvm -disable-lsr 
1.      <eof> parser at end of file
2.      Code generation
3.      Running pass 'Function Pass Manager' on module 'rgb2y-sample.cpp'.
4.      Running pass 'WebAssembly Instruction Selection' on function '@computeSumMatrixForwardSimd2PassAll'
 #0 0x0000556d8349b510 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1ea0510)
 #1 0x0000556d83499284 llvm::sys::RunSignalHandlers() (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e9e284)
 #2 0x0000556d83499501 llvm::sys::CleanupOnSignal(unsigned long) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e9e501)
 #3 0x0000556d83404e83 llvm::CrashRecoveryContext::HandleExit(int) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e09e83)
 #4 0x0000556d83491aab llvm::sys::Process::Exit(int) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e96aab)
 #5 0x0000556d82175f01 LLVMErrorHandler(void*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0xb7af01)
 #6 0x0000556d8340c1ac llvm::report_fatal_error(llvm::Twine const&, bool) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e111ac)
 #7 0x0000556d8340c324 (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e11324)
 #8 0x0000556d843232aa llvm::SelectionDAGISel::CannotYetSelect(llvm::SDNode*) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2d282aa)
 #9 0x0000556d84324702 llvm::SelectionDAGISel::SelectCodeCommon(llvm::SDNode*, unsigned char const*, unsigned int) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2d29702)
#10 0x0000556d821cf10b (anonymous namespace)::WebAssemblyDAGToDAGISel::Select(llvm::SDNode*) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0xbd410b)
#11 0x0000556d843210bb llvm::SelectionDAGISel::DoInstructionSelection() (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2d260bb)
#12 0x0000556d84329f29 llvm::SelectionDAGISel::CodeGenAndEmitDAG() (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2d2ef29)
#13 0x0000556d8433004b llvm::SelectionDAGISel::SelectAllBasicBlocks(llvm::Function const&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2d3504b)
#14 0x0000556d8433195d llvm::SelectionDAGISel::runOnMachineFunction(llvm::MachineFunction&) (.part.0) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2d3695d)
#15 0x0000556d828f6e49 llvm::MachineFunctionPass::runOnFunction(llvm::Function&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x12fbe49)
#16 0x0000556d82db4d71 llvm::FPPassManager::runOnFunction(llvm::Function&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x17b9d71)
#17 0x0000556d82db5469 llvm::FPPassManager::runOnModule(llvm::Module&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x17ba469)
#18 0x0000556d82db3f0c llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x17b8f0c)
#19 0x0000556d8375e284 (anonymous namespace)::EmitAssemblyHelper::EmitAssembly(clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2163284)
#20 0x0000556d8375fc2d clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions const&, llvm::DataLayout const&, llvm::Module*, clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2164c2d)
#21 0x0000556d84454159 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2e59159)
#22 0x0000556d85323f89 clang::ParseAST(clang::Sema&, bool, bool) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x3d28f89)
#23 0x0000556d84452b88 clang::CodeGenAction::ExecuteAction() (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2e57b88)
#24 0x0000556d83d97599 clang::FrontendAction::Execute() (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x279c599)
#25 0x0000556d83d4dcc6 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2752cc6)
#26 0x0000556d83e6bb40 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2870b40)
#27 0x0000556d82176c2f cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0xb7bc2f)
#28 0x0000556d82173f98 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0xb78f98)
#29 0x0000556d83c0b639 void llvm::function_ref<void ()>::callback_fn<clang::driver::CC1Command::Execute(llvm::ArrayRef<llvm::Optional<llvm::StringRef> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*, bool*) const::'lambda'()>(long) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2610639)
#30 0x0000556d83404d4c llvm::CrashRecoveryContext::RunSafely(llvm::function_ref<void ()>) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x1e09d4c)
#31 0x0000556d83c0bf56 clang::driver::CC1Command::Execute(llvm::ArrayRef<llvm::Optional<llvm::StringRef> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*, bool*) const (.part.0) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x2610f56)
#32 0x0000556d83be2a9c clang::driver::Compilation::ExecuteCommand(clang::driver::Command const&, clang::driver::Command const*&) const (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x25e7a9c)
#33 0x0000556d83be33d6 clang::driver::Compilation::ExecuteJobs(clang::driver::JobList const&, llvm::SmallVectorImpl<std::pair<int, clang::driver::Command const*> >&) const (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x25e83d6)
#34 0x0000556d83becb49 clang::driver::Driver::ExecuteCompilation(clang::driver::Compilation&, llvm::SmallVectorImpl<std::pair<int, clang::driver::Command const*> >&) (/home/dan/git/llvm-project/llvm/build/bin/clang+++0x25f1b49)
#35 0x0000556d820fb0e0 main (/home/dan/git/llvm-project/llvm/build/bin/clang+++0xb000e0)
#36 0x00007fc609ec90b3 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x270b3)
#37 0x0000556d82173aee _start (/home/dan/git/llvm-project/llvm/build/bin/clang+++0xb78aee)
clang-12: error: clang frontend command failed with exit code 70 (use -v to see invocation)
clang version 12.0.0 ([email protected]:llvm/llvm-project d6ac649ccda289ecc2d2c0cb51892d57e8ec328c)
Target: wasm32-unknown-emscripten
Thread model: posix
InstalledDir: /home/dan/git/llvm-project/llvm/build/bin
clang-12: note: diagnostic msg: 
********************

Got it. Needed a bitcast to the original vector type before returning. Will send a patch once I'm done testing it.

Patch is below. It's not a substitute for rodata, but it's a big improvement.

From 6d7f93a00a83636c54145c40c674b544d508f815 Mon Sep 17 00:00:00 2001
From: Dan Weber <[email protected]>
Date: Tue, 29 Sep 2020 17:13:02 +0000
Subject: [PATCH] fix(constants generation): Use integers to load complex
 contants without replace_lane

---
 .../WebAssembly/WebAssemblyISelLowering.cpp   | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 425f8b86c..6dec5e456 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -31,6 +31,7 @@
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
@@ -1579,6 +1580,34 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
     IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
       return IsConstant(Lane);
     };
+  } else if (NumConstantLanes >= NumSplatLanes && (VecT == MVT::v16i8  || VecT == MVT::v8i16 || VecT == MVT::v4i32 || VecT == MVT::v2i64)) {
+      std::array<uint64_t, 2> valuearray({0, 0});
+      unsigned i = 0;
+      size_t byteStep = VecT.getScalarType().getStoreSize().getFixedSize();
+      for (const SDValue &Lane : Op->op_values()) {
+        if (IsConstant(Lane)) {
+            using llvm::support::endian::byte_swap;
+            using llvm::support::little;
+            uint8_t* pByteArray = reinterpret_cast<uint8_t*>(valuearray.data());
+            // endianness of the compiler matters here.
+            // little endian has least significant bit first which is advantageous for this.
+            // so a 16 bit integer will be front loaded in a uint64 from byte packing perspective.
+            ConstantSDNode *pConstantSDNode = cast<ConstantSDNode>(Lane.getNode());
+            uint64_t val =  byte_swap(pConstantSDNode->getLimitedValue(), little);
+            uint8_t* pVal = reinterpret_cast<uint8_t*>(&val);
+            std::copy(pVal,pVal+byteStep,pByteArray+i*byteStep);
+        }
+        ++i;
+    }
+    Result = DAG.getSplatBuildVector(MVT::v2i64, DL, DAG.getConstant(valuearray[0], DL, MVT::i64));
+    if (valuearray[0] != valuearray[1]) {
+        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i64, Result, DAG.getConstant(valuearray[1], DL, MVT::i64),
+                           DAG.getConstant(1, DL, MVT::i32));
+    }
+    Result = DAG.getBitcast(VecT, Result);
+    IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
+      return IsConstant(Lane);
+    };
   }
   if (!Result) {
     // Use a splat, but possibly a load_splat
-- 
2.25.

Awesome, this is a nice simple approach. I will update the code to conform to LLVM style, update the corresponding tests, and get this merged. Thanks for working on this!

Patch up for review here: https://reviews.llvm.org/D88591. While writing tests, I nerd-sniped myself into improving it to only emit a follow-up i64x2.replace_lane after the splat if necessary.

What is a case where CombinedSufficient applies?

For example this test:

; Only lanes 0 and 3 are defined constants (1 and 2); lanes 1 and 2 are undef.
define <4 x i32> @emulated_const_combined_sufficient() {
  ret <4 x i32> <i32 1, i32 undef, i32 undef, i32 2>
}

The first half will have the same value as {1, 0} and the second half will have the same value as {0, 2}, so neither half is sufficient to cover the other half naively. But if we take advantage of the fact that undef (or more generally, any lane that is unused in the initial constant) is allowed to take on any value, then we see that we can splat {1, 2} to cover both halves.

Okay... so from the user's perspective, those 2 middle lanes are undefined, but we've populated them with the other's value.

How does one do that from emscripten/em++?

These C functions both end up triggering that case, the first literally with undefs in the LLVM IR and the second because the middle lanes are not constant:

// Returns a vector with lane 0 set to 1 and lane 3 set to 2; lanes 1 and 2
// are never written.
v128_t undef_middle() {
  v128_t v;  // left uninitialized, so the untouched middle lanes are undefined
  v = wasm_i32x4_replace_lane(v, 0, 1);  // lane 0 = 1
  v = wasm_i32x4_replace_lane(v, 3, 2);  // lane 3 = 2
  return v;
}

// Lanes 0 and 3 are the constants 1 and 2; lanes 1 and 2 come from the
// arguments x and y, so they are not compile-time constants.
v128_t nonconst_middle(int x, int y) {
  return wasm_i32x4_make(1, x, y, 2);
}

Wow. Fascinating. Kudos.

Do you want to adjust the v128.const code to have that behavior too? Now it's filling those spots with zeros.

How are you thinking of changing the v128.const behavior? I don't think it should matter what non-constant lanes are filled with when emitting a v128.const.

If you want it to match, shouldn't you fill those spots with undefined values? Is that acceptable per the proposal?

WebAssembly doesn't have a concept of undefined values, so LLVM has to choose some arbitrary value to fill in for undef when emitting WebAssembly. Usually 0 is a reasonable default, so that's what the v128.const code uses (and also what this new code uses). The only reason to use some other value is if doing so has additional benefits, such as reducing the number of instructions emitted like in the CombinedSufficient case in the new code.

Was this page helpful?
0 / 5 - 0 ratings

Related issues

yahsaves picture yahsaves  ·  4 Comments

napalm272 picture napalm272  ·  4 Comments

answer1103 picture answer1103  ·  4 Comments

HolgerStrauss picture HolgerStrauss  ·  4 Comments

rpellerin picture rpellerin  ·  3 Comments