```c#
class AdvSimd.Arm64
{
    /// <summary>
    /// A64: STP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(byte* address, Vector64<byte> value1, Vector64<byte> value2);
    /// <summary>
    /// A64: STP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(double* address, Vector64<double> value1, Vector64<double> value2);
    /// <summary>
    /// A64: STP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(short* address, Vector64<short> value1, Vector64<short> value2);
    /// <summary>
    /// A64: STP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(int* address, Vector64<int> value1, Vector64<int> value2);
    /// <summary>
    /// A64: STP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(long* address, Vector64<long> value1, Vector64<long> value2);
    /// <summary>
    /// A64: STP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(sbyte* address, Vector64<sbyte> value1, Vector64<sbyte> value2);
    /// <summary>
    /// A64: STP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(float* address, Vector64<float> value1, Vector64<float> value2);
    /// <summary>
    /// A64: STP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(ushort* address, Vector64<ushort> value1, Vector64<ushort> value2);
    /// <summary>
    /// A64: STP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(uint* address, Vector64<uint> value1, Vector64<uint> value2);
    /// <summary>
    /// A64: STP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(ulong* address, Vector64<ulong> value1, Vector64<ulong> value2);
    /// <summary>
    /// A64: STP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(byte* address, Vector128<byte> value1, Vector128<byte> value2);
    /// <summary>
    /// A64: STP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(double* address, Vector128<double> value1, Vector128<double> value2);
    /// <summary>
    /// A64: STP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(short* address, Vector128<short> value1, Vector128<short> value2);
    /// <summary>
    /// A64: STP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(int* address, Vector128<int> value1, Vector128<int> value2);
    /// <summary>
    /// A64: STP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(long* address, Vector128<long> value1, Vector128<long> value2);
    /// <summary>
    /// A64: STP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(sbyte* address, Vector128<sbyte> value1, Vector128<sbyte> value2);
    /// <summary>
    /// A64: STP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(float* address, Vector128<float> value1, Vector128<float> value2);
    /// <summary>
    /// A64: STP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(ushort* address, Vector128<ushort> value1, Vector128<ushort> value2);
    /// <summary>
    /// A64: STP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(uint* address, Vector128<uint> value1, Vector128<uint> value2);
    /// <summary>
    /// A64: STP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePair(ulong* address, Vector128<ulong> value1, Vector128<ulong> value2);
    /// <summary>
    /// A64: STP St1, St2, [Xn]
    /// </summary>
    public static unsafe void StorePairScalar(int* address, Vector64<int> value1, Vector64<int> value2);
    /// <summary>
    /// A64: STP St1, St2, [Xn]
    /// </summary>
    public static unsafe void StorePairScalar(float* address, Vector64<float> value1, Vector64<float> value2);
    /// <summary>
    /// A64: STP St1, St2, [Xn]
    /// </summary>
    public static unsafe void StorePairScalar(uint* address, Vector64<uint> value1, Vector64<uint> value2);
    /// <summary>
    /// A64: STNP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(byte* address, Vector64<byte> value1, Vector64<byte> value2);
    /// <summary>
    /// A64: STNP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(double* address, Vector64<double> value1, Vector64<double> value2);
    /// <summary>
    /// A64: STNP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(short* address, Vector64<short> value1, Vector64<short> value2);
    /// <summary>
    /// A64: STNP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(int* address, Vector64<int> value1, Vector64<int> value2);
    /// <summary>
    /// A64: STNP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(long* address, Vector64<long> value1, Vector64<long> value2);
    /// <summary>
    /// A64: STNP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(sbyte* address, Vector64<sbyte> value1, Vector64<sbyte> value2);
    /// <summary>
    /// A64: STNP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(float* address, Vector64<float> value1, Vector64<float> value2);
    /// <summary>
    /// A64: STNP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(ushort* address, Vector64<ushort> value1, Vector64<ushort> value2);
    /// <summary>
    /// A64: STNP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(uint* address, Vector64<uint> value1, Vector64<uint> value2);
    /// <summary>
    /// A64: STNP Dt1, Dt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(ulong* address, Vector64<ulong> value1, Vector64<ulong> value2);
    /// <summary>
    /// A64: STNP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(byte* address, Vector128<byte> value1, Vector128<byte> value2);
    /// <summary>
    /// A64: STNP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(double* address, Vector128<double> value1, Vector128<double> value2);
    /// <summary>
    /// A64: STNP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(short* address, Vector128<short> value1, Vector128<short> value2);
    /// <summary>
    /// A64: STNP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(int* address, Vector128<int> value1, Vector128<int> value2);
    /// <summary>
    /// A64: STNP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(long* address, Vector128<long> value1, Vector128<long> value2);
    /// <summary>
    /// A64: STNP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(sbyte* address, Vector128<sbyte> value1, Vector128<sbyte> value2);
    /// <summary>
    /// A64: STNP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(float* address, Vector128<float> value1, Vector128<float> value2);
    /// <summary>
    /// A64: STNP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(ushort* address, Vector128<ushort> value1, Vector128<ushort> value2);
    /// <summary>
    /// A64: STNP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(uint* address, Vector128<uint> value1, Vector128<uint> value2);
    /// <summary>
    /// A64: STNP Qt1, Qt2, [Xn]
    /// </summary>
    public static unsafe void StorePairNonTemporal(ulong* address, Vector128<ulong> value1, Vector128<ulong> value2);
    /// <summary>
    /// A64: STNP St1, St2, [Xn]
    /// </summary>
    public static unsafe void StorePairScalarNonTemporal(int* address, Vector64<int> value1, Vector64<int> value2);
    /// <summary>
    /// A64: STNP St1, St2, [Xn]
    /// </summary>
    public static unsafe void StorePairScalarNonTemporal(float* address, Vector64<float> value1, Vector64<float> value2);
    /// <summary>
    /// A64: STNP St1, St2, [Xn]
    /// </summary>
    public static unsafe void StorePairScalarNonTemporal(uint* address, Vector64<uint> value1, Vector64<uint> value2);
}
```
I couldn't add an area label to this Issue.
Checkout this page to find out which area owner to ping, or please add exactly one area label to help train me in the future.
@CarolEidt, @tannergooding , @TamarChristinaArm PTAL
@TamarChristinaArm Is it correct to say that functionality of StorePair on Arm32 can be achieved by using VSTM?
I think we could have better names than value1/value2 to help clarify the order they are stored in.
I wonder if StoreScalarPair is better than StorePairScalar
I think we could have better names than value1/value2 to help clarify the order they are stored in.
How about first/second ?
I wonder if StoreScalarPair is better than StorePairScalar
It is an option, but I can't tell which one has more emphasis on that we store a pair
I wonder if StoreScalarPair is better than StorePairScalar
It is an option, but I can't tell which one has more emphasis on that we store a pair
It seems to me that it's useful to have all of these start the same, i.e. StorePairXXX
@TamarChristinaArm Is it correct to say that functionality of StorePair on Arm32 can be achieved by using VSTM?
Yeah that's correct
Question about these store intrinsics in general, how are you guys planning on dealing with the different addressing modes? or are you only interested in a the register addressing modes?
@TamarChristinaArm
I had a discussion with @BruceForstall where we briefly discussed how we can benefit from post-index addressing modes if we had something like these:
```c#
// ST1 { Vt.16B }, [Xn], #16 — post-index addressing writes the updated address back to Xn
void Store(ref T* address, Vector128<T> value);
For example, if have a loop
```c#
Vector128<double> val ;
for (int i = 0; i < count; i++)
{
/*
compute new value of val
*/
Store(baseAddr + i * 16, val);
}
a user might want to do some sort of strength reduction manually and have this
```c#
Vector128<double> val;
T* ptr = baseAddr;
for (int i = 0; i < count; i++)
{
/*
compute new value of val
*/
Store(ptr, val);
ptr += 16;
}
and that as a result
```c#
Vector128<double> val ;
T* ptr = baseAddr;
for (int i = 0; i < count; i++)
{
/*
compute new value of val
*/
Store(ref ptr, val);
}
Why can't we just detect and emit the right encoding for baseAddr + i * 16 like we do on x86/x64?
For example, see https://sharplab.io/#v2:EYLgxg9gTgpgtADwGwBYA0AXEBDAzgWwB8ABABgAJiBGAOgCUBXAOwwEt8YaBJFqVp3KzC4A3AFgAUGUq1GLdpx4Y+AobhoANABxJxEycQDM5ZrmwAzGJQBM5AMLkA3pPKvKxgGowwGaFWtaADzmADYQ2BgAfOQAsgAUoeEYAFTk2AAm6VBo5PwYuUzpMAgAlE4ubpXEAOzkAMq4nAAy4elePn4BcRlZ5ADUBUUI5KkoJXqVAL6Sk0A=
We already support optimizing things like:
Sse.LoadVector128(addr + index * 4);
into:
vmovups xmm0, [r8+rax*4]
Why can't we just detect and emit the right encoding for
baseAddr + i * 16 like we do on x86/x64?
I did not know that we are doing this on x86/x64. Then, yes, we can.
I will open an issue to track this work
Why can't we just detect and emit the right encoding for
baseAddr + i * 16 like we do on x86/x64? I did not know that we are doing this on x86/x64. Then, yes, we can.
That would make it easier to do pre-index addressing mode too.
e.g. baseAddr + 16
For example, see https://sharplab.io/#v2:EYLgxg9gTgpgtADwGwBYA0AXEBDAzgWwB8ABABgAJiBGAOgCUBXAOwwEt8YaBJFqVp3KzC4A3AFgAUGUq1GLdpx4Y+AobhoANABxJxEycQDM5ZrmwAzGJQBM5AMLkA3pPKvKxgGowwGaFWtaADzmADYQ2BgAfOQAsgAUoeEYAFTk2AAm6VBo5PwYuUzpMAgAlE4ubpXEAOzkAMq4nAAy4elePn4BcRlZ5ADUBUUI5KkoJXqVAL6Sk0A=
We already support optimizing things like:
Sse.LoadVector128(addr + index * 4);
into:
vmovups xmm0, [r8+rax*4]
Actually, I stand corrected when I said we can do this (well, we can but it's not that easy) - the problem with detecting a post-indexing address mode is harder than what you described on x86/x64, since during writeback stage the instruction modifies the value of a base register.
I don't think we use post-indexing modes anywhere on arm64 other than in hand-written prolog/epilog or cpObj codegen.
It seems like we would want intrinsics to allow directly specifying pre-indexed or post-indexed addressing, due to writeback. I don't think the JIT will be able to optimize that in all cases.
I was wondering the APIs allow specifying a "memcpy" using LD1/ST1 with post-indexing, for example:
LD1 {V0.16B, V1.16B, V2.16B, V3.16B}, [X0], #64
ST1 {V0.16B, V1.16B, V2.16B, V3.16B}, [X1], #64
<loop back>
instead of:
LD1 {V0.16B, V1.16B, V2.16B, V3.16B}, [X0]
ADD X0, X0, #64
ST1 {V0.16B, V1.16B, V2.16B, V3.16B}, [X1]
ADD X1, X1, #64
<loop back>
In simple cases the JIT might be able to optimize this, but we shouldn't necessarily depend on that.
It looks like writeback in this context is the modification of (depending on the post-index overload) Xm (register used for offset), Xn (register used for address), or SP (stack pointer) and not writeback with regards to how the memory is being stored, is that correct?
That is, those inputs are effectively RMW?
It looks like writeback in this context is the modification of (depending on the post-index overload)
Xm (register used for offset), Xn (register used for address), or SP (stack pointer) and not writeback with regards to how the memory is being stored, is that correct?
It's modification of Xn|SP (register used for address) or a base register. Xm doesn't change
Xm doesn't change
Ah, yes, I see. The operation manual has the following and I misread the first if statement
if wback then
if m != 31 then
offs = X[m];
if n == 31 then
SP[] = address + offs;
else
X[n] = address + offs;
Given the vector instructions look to force wback = TRUE, would it make sense for them to simply be: byte* Store(byte* address, Vector128<byte> value)
Then the JIT can more easily determine whether the user intended for address to be mutated?
Oh, nevermind. It's only forced to true for the post index variant not the no offset variant
It seems like we would want intrinsics to allow directly specifying pre-indexed or post-indexed addressing, due to writeback. I don't think the JIT will be able to optimize that in all cases.
Hmmm, but the native intrinsics don't seem to have variants that take anything other than T*, do they just have better analysis for handling/optimizing this?
Given we won't be able to have overloads for things like array[index], is this not something we want to better handle in the JIT anyways?
Hmmm, but the native intrinsics don't seem to have variants that take anything other than T*, do they just have better analysis for handling/optimizing this?
I would suspect that's the case.
Given we won't be able to have overloads for things like array[index], is this not something we want to better handle in the JIT anyways
I don't know why we'd need to have specific overloads for array[index] for this to be useful. In any event the memory hw intrinsics require a pointer, and it would seem that supporting post-indexing of that pointer would be desirable.
I don't know why we'd need to have specific overloads for array[index] for this to be useful
I meant this more as a, post-indexing isn't a concept that is specific to HWIntrinsics, it is a more generally applicable scenario. Because of this, and because there are cases where we can't expose post-indexing specific overloads, would it make sense to just track this as a place we can/should invest further JIT improvements instead?
If we were to expose the post-index variant, do we have an idea of how that would look and how the JIT would need to handle it? I would guess the proposed void Store(ref T* address, Vector128<T> value); has its own disadvantages due to the T* being address taken (effectively a T**)
I think it remains to be seen whether it would be better to expose this directly or to rely on the JIT to optimize it. But I suspect that the difficulties of determining that address is not really address-taken in the presumably frequent case that it's a local var are less than the difficulties of optimizing this. In any event, it would be great to know the extent to which this would be useful.
I think it remains to be seen whether it would be better to expose this directly or to rely on the JIT to optimize it. But I suspect that the difficulties of determining that
address is not _really_ address-taken in the presumably frequent case that it's a local var are less than the difficulties of optimizing this. In any event, it would be great to know the extent to which this would be useful.
For one or two loads/stores I suspect it shouldn't matter all that much.. when you have a lot of them reading from the same sources or using the same offsets it becomes a bit more of an issue as if you pick the wrong addressing mode you end up with more instructions and higher register pressure. For instance if you fail to recognize that e.g. you can use register offset or an immediate offset you can up using lots of adds to generate the address to use a simpler addressing mode.
LDR for instance has a ton of different addressing modes including the ability to do extensions inside the addressing mode instead of as a different instruction.
That said, C compilers routinely don't use the most efficient addressing modes and it hasn't terribly hurt us yet at this point. On the grand scheme of things there are higher priority optimization tasks but recognizing the simple cases would be a good start I think.
```C#
namespace System.Runtime.Intrinsics.Arm
{
    partial class AdvSimd.Arm64
    {
        public static unsafe void StorePair(byte* address, Vector64<byte> value1, Vector64<byte> value2);
        public static unsafe void StorePair(double* address, Vector64<double> value1, Vector64<double> value2);
        public static unsafe void StorePair(short* address, Vector64<short> value1, Vector64<short> value2);
        public static unsafe void StorePair(int* address, Vector64<int> value1, Vector64<int> value2);
        public static unsafe void StorePair(long* address, Vector64<long> value1, Vector64<long> value2);
        public static unsafe void StorePair(sbyte* address, Vector64<sbyte> value1, Vector64<sbyte> value2);
        public static unsafe void StorePair(float* address, Vector64<float> value1, Vector64<float> value2);
        public static unsafe void StorePair(ushort* address, Vector64<ushort> value1, Vector64<ushort> value2);
        public static unsafe void StorePair(uint* address, Vector64<uint> value1, Vector64<uint> value2);
        public static unsafe void StorePair(ulong* address, Vector64<ulong> value1, Vector64<ulong> value2);
        public static unsafe void StorePair(byte* address, Vector128<byte> value1, Vector128<byte> value2);
        public static unsafe void StorePair(double* address, Vector128<double> value1, Vector128<double> value2);
        public static unsafe void StorePair(short* address, Vector128<short> value1, Vector128<short> value2);
        public static unsafe void StorePair(int* address, Vector128<int> value1, Vector128<int> value2);
        public static unsafe void StorePair(long* address, Vector128<long> value1, Vector128<long> value2);
        public static unsafe void StorePair(sbyte* address, Vector128<sbyte> value1, Vector128<sbyte> value2);
        public static unsafe void StorePair(float* address, Vector128<float> value1, Vector128<float> value2);
        public static unsafe void StorePair(ushort* address, Vector128<ushort> value1, Vector128<ushort> value2);
        public static unsafe void StorePair(uint* address, Vector128<uint> value1, Vector128<uint> value2);
        public static unsafe void StorePair(ulong* address, Vector128<ulong> value1, Vector128<ulong> value2);
        public static unsafe void StorePairScalar(int* address, Vector64<int> value1, Vector64<int> value2);
        public static unsafe void StorePairScalar(float* address, Vector64<float> value1, Vector64<float> value2);
        public static unsafe void StorePairScalar(uint* address, Vector64<uint> value1, Vector64<uint> value2);
        public static unsafe void StorePairNonTemporal(byte* address, Vector64<byte> value1, Vector64<byte> value2);
        public static unsafe void StorePairNonTemporal(double* address, Vector64<double> value1, Vector64<double> value2);
        public static unsafe void StorePairNonTemporal(short* address, Vector64<short> value1, Vector64<short> value2);
        public static unsafe void StorePairNonTemporal(int* address, Vector64<int> value1, Vector64<int> value2);
        public static unsafe void StorePairNonTemporal(long* address, Vector64<long> value1, Vector64<long> value2);
        public static unsafe void StorePairNonTemporal(sbyte* address, Vector64<sbyte> value1, Vector64<sbyte> value2);
        public static unsafe void StorePairNonTemporal(float* address, Vector64<float> value1, Vector64<float> value2);
        public static unsafe void StorePairNonTemporal(ushort* address, Vector64<ushort> value1, Vector64<ushort> value2);
        public static unsafe void StorePairNonTemporal(uint* address, Vector64<uint> value1, Vector64<uint> value2);
        public static unsafe void StorePairNonTemporal(ulong* address, Vector64<ulong> value1, Vector64<ulong> value2);
        public static unsafe void StorePairNonTemporal(byte* address, Vector128<byte> value1, Vector128<byte> value2);
        public static unsafe void StorePairNonTemporal(double* address, Vector128<double> value1, Vector128<double> value2);
        public static unsafe void StorePairNonTemporal(short* address, Vector128<short> value1, Vector128<short> value2);
        public static unsafe void StorePairNonTemporal(int* address, Vector128<int> value1, Vector128<int> value2);
        public static unsafe void StorePairNonTemporal(long* address, Vector128<long> value1, Vector128<long> value2);
        public static unsafe void StorePairNonTemporal(sbyte* address, Vector128<sbyte> value1, Vector128<sbyte> value2);
        public static unsafe void StorePairNonTemporal(float* address, Vector128<float> value1, Vector128<float> value2);
        public static unsafe void StorePairNonTemporal(ushort* address, Vector128<ushort> value1, Vector128<ushort> value2);
        public static unsafe void StorePairNonTemporal(uint* address, Vector128<uint> value1, Vector128<uint> value2);
        public static unsafe void StorePairNonTemporal(ulong* address, Vector128<ulong> value1, Vector128<ulong> value2);
        public static unsafe void StorePairScalarNonTemporal(int* address, Vector64<int> value1, Vector64<int> value2);
        public static unsafe void StorePairScalarNonTemporal(float* address, Vector64<float> value1, Vector64<float> value2);
        public static unsafe void StorePairScalarNonTemporal(uint* address, Vector64<uint> value1, Vector64<uint> value2);
    }
}
```
@TamarChristinaArm Is it correct to say that functionality of
StorePair on Arm32 can be achieved by using VSTM? Yeah that's correct
@TamarChristinaArm I started implementing StorePair and I realized that my original statement above was wrong since VSTM only allows to store from a list of consecutively numbered D-registers while STP can store an arbitrary pair of registers, so they are not equivalent. I believe the intrinsics in this PR should be Arm64 only. Do you agree? Out of curiosity - why there is no C++ intrinsics that store a pair of SIMD/FP registers?
@echesakovMSFT
@TamarChristinaArm I started implementing StorePair and I realized that my original statement above was wrong since VSTM only allows to store from a list of consecutively numbered D-registers while STP can store an arbitrary pair of registers, so they are not equivalent. I believe the intrinsics in this PR should be Arm64 only. Do you agree?
Well sort of, VSTM allows D and S registers. By the overlap of the register file they also allow Q registers. i.e. VSTM r0, {Q0, Q1} is just VSTM r0, {d0-d3} and GAS allows this as a programmer convenience instruction.
So the VSTM functionally allows you to do everything you can with an STP. Whether the registers are consecutive or not really is a RA issue...
But yes, in the context of not having convenience intrinsics in CoreCLR I agree that it needs to be different intrinsics. (In case you're wondering, in C we would have done this in AArch32 by putting the values in a struct in the definition of the intrinsics before expanding STM. This usually wouldn't produce any extra moves as the RA will arrange if possible the values immediately in the right registers and the struct is optimized away).
Out of curiosity - why there is no C++ intrinsics that store a pair of SIMD/FP registers?
The belief is that you don't need them and that the compiler should always be able to form pairs when it's possible. To do this both LLVM and GCC have special passes that aid in this. In GCC for instance we have a scheduler fusion pass that allows the instruction scheduler to move consecutive loads and stores next to each other if the pipeline description says it makes sense based on the data dependencies etc.
i.e. we won't move them if you can't form pairs so that you don't overload your pipelines with a long chain of load/stores. After this we peephole them. After this we have a late scheduling pass that is able to schedule the formed pairs better so that again you don't have a long chain of them in your pipeline.
Another way it deals with this is that we have modes that are larger than a machine int register size. e.g. TImode is 128 bits, OImode is 256 etc. The actual machine registers can be declared as a subregister of these larger modes in RTL(IR in LLVM), so that RA gives you consecutive registers and we can split the values later (into pairs if possible).
@TamarChristinaArm Thank you for your thorough reply!
@CarolEidt I am wondering whether the optimization Tamar describes is feasible in RyuJIT?
RyuJIT doesn't have a scheduling pass, nor do we have any peephole-like phases that, for example, use a sliding window of instructions to analyze for optimizations such as this. Not to mention that we only have very limited capability of doing dependence analysis to identify interfering memory operations. So the only near-term feasible optimization would be for immediately adjacent instructions.
Most helpful comment
RyuJIT doesn't have a scheduling pass, nor do we have any peephole-like phases that, for example, use a sliding window of instructions to analyze for optimizations such as this. Not to mention that we only have very limited capability of doing dependence analysis to identify interfering memory operations. So the only near-term feasible optimization would be for immediately adjacent instructions.