@eerhardt @CarolEidt @RussKeldorph
Target Framework netcoreapp2.1
```C#
namespace System.Runtime.Intrinsics.Arm.Arm64
{
/// <summary>
/// This class provides access to the Arm64 AdvSIMD intrinsics.
///
/// Arm64 CPUs indicate support for this feature by setting
/// ID_AA64PFR0_EL1.AdvSIMD == 0 or better.
/// </summary>
public static class Simd
{
/// <summary>
/// Gets a value indicating whether the Arm64 AdvSIMD intrinsics are supported
/// by the current hardware. Callers must guard all other members with this check.
/// </summary>
public static bool IsSupported { get { throw null; } }
/// <summary>
/// Vector abs
/// Corresponds to vector forms of ARM64 ABS &amp; FABS
/// Note: for signed integer element types the result element type is the
/// unsigned counterpart (e.g. sbyte in, byte out), since |x| never needs a sign bit.
/// </summary>
public static Vector64<byte> Abs(Vector64<sbyte> value) { throw null; }
public static Vector64<ushort> Abs(Vector64<short> value) { throw null; }
public static Vector64<uint> Abs(Vector64<int> value) { throw null; }
public static Vector64<float> Abs(Vector64<float> value) { throw null; }
public static Vector128<byte> Abs(Vector128<sbyte> value) { throw null; }
public static Vector128<ushort> Abs(Vector128<short> value) { throw null; }
public static Vector128<uint> Abs(Vector128<int> value) { throw null; }
public static Vector128<ulong> Abs(Vector128<long> value) { throw null; }
public static Vector128<float> Abs(Vector128<float> value) { throw null; }
public static Vector128<double> Abs(Vector128<double> value) { throw null; }
/// <summary>
/// Vector add
/// Corresponds to vector forms of ARM64 ADD &amp; FADD
/// </summary>
public static Vector64<T> Add<T>(Vector64<T> left, Vector64<T> right) where T : struct { throw null; }
public static Vector128<T> Add<T>(Vector128<T> left, Vector128<T> right) where T : struct { throw null; }
/// <summary>
/// Vector and
/// Corresponds to vector forms of ARM64 AND
/// </summary>
public static Vector64<T> And<T>(Vector64<T> left, Vector64<T> right) where T : struct { throw null; }
public static Vector128<T> And<T>(Vector128<T> left, Vector128<T> right) where T : struct { throw null; }
/// <summary>
/// Vector and not
/// Corresponds to vector forms of ARM64 BIC
/// Computes left &amp; ~right — note this operand order is the opposite of the
/// x86 AndNot intrinsic (ANDNPS), which computes ~left &amp; right.
/// </summary>
public static Vector64<T> AndNot<T>(Vector64<T> left, Vector64<T> right) where T : struct { throw null; }
public static Vector128<T> AndNot<T>(Vector128<T> left, Vector128<T> right) where T : struct { throw null; }
/// <summary>
/// Vector Divide
/// Corresponds to vector forms of ARM64 FDIV
/// Floating-point only: ARM64 has no vector integer divide instruction.
/// </summary>
public static Vector64<float> Divide(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> Divide(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> Divide(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector max
/// Corresponds to vector forms of ARM64 SMAX, UMAX &amp; FMAX
/// </summary>
public static Vector64<T> Max<T>(Vector64<T> left, Vector64<T> right) where T : struct { throw null; }
public static Vector128<T> Max<T>(Vector128<T> left, Vector128<T> right) where T : struct { throw null; }
/// <summary>
/// Vector min
/// Corresponds to vector forms of ARM64 SMIN, UMIN &amp; FMIN
/// </summary>
public static Vector64<T> Min<T>(Vector64<T> left, Vector64<T> right) where T : struct { throw null; }
public static Vector128<T> Min<T>(Vector128<T> left, Vector128<T> right) where T : struct { throw null; }
/// <summary>
/// Vector multiply
///
/// For each element result[elem] = left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 MUL &amp; FMUL
/// Note: no long/ulong overloads — MUL has no 64-bit integer element form.
/// </summary>
public static Vector64<byte> Multiply(Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> Multiply(Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> Multiply(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<short> Multiply(Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> Multiply(Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<int> Multiply(Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector64<float> Multiply(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<byte> Multiply(Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> Multiply(Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> Multiply(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<short> Multiply(Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> Multiply(Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<int> Multiply(Vector128<int> left, Vector128<int> right) { throw null; }
public static Vector128<float> Multiply(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> Multiply(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector negate
/// Corresponds to vector forms of ARM64 NEG &amp; FNEG
/// Signed and floating-point element types only; negation of unsigned
/// elements is not meaningful.
/// </summary>
public static Vector64<sbyte> Negate(Vector64<sbyte> value) { throw null; }
public static Vector64<short> Negate(Vector64<short> value) { throw null; }
public static Vector64<int> Negate(Vector64<int> value) { throw null; }
public static Vector64<float> Negate(Vector64<float> value) { throw null; }
public static Vector128<sbyte> Negate(Vector128<sbyte> value) { throw null; }
public static Vector128<short> Negate(Vector128<short> value) { throw null; }
public static Vector128<int> Negate(Vector128<int> value) { throw null; }
public static Vector128<long> Negate(Vector128<long> value) { throw null; }
public static Vector128<float> Negate(Vector128<float> value) { throw null; }
public static Vector128<double> Negate(Vector128<double> value) { throw null; }
/// <summary>
/// Vector not
/// Corresponds to vector forms of ARM64 NOT
/// </summary>
public static Vector64<T> Not<T>(Vector64<T> value) where T : struct { throw null; }
public static Vector128<T> Not<T>(Vector128<T> value) where T : struct { throw null; }
/// <summary>
/// Vector or
/// Corresponds to vector forms of ARM64 ORR
/// </summary>
public static Vector64<T> Or<T>(Vector64<T> left, Vector64<T> right) where T : struct { throw null; }
public static Vector128<T> Or<T>(Vector128<T> left, Vector128<T> right) where T : struct { throw null; }
/// <summary>
/// Vector or not
/// Corresponds to vector forms of ARM64 ORN
/// Computes left | ~right.
/// </summary>
public static Vector64<T> OrNot<T>(Vector64<T> left, Vector64<T> right) where T : struct { throw null; }
public static Vector128<T> OrNot<T>(Vector128<T> left, Vector128<T> right) where T : struct { throw null; }
/// <summary>
/// Vector square root
/// Corresponds to vector forms of ARM64 FSQRT
/// (FRSQRTE is the reciprocal square-root *estimate*; this intrinsic is the
/// exact square root.) Floating-point element types only.
/// </summary>
public static Vector64<float> Sqrt(Vector64<float> value) { throw null; }
public static Vector128<float> Sqrt(Vector128<float> value) { throw null; }
public static Vector128<double> Sqrt(Vector128<double> value) { throw null; }
/// <summary>
/// Vector subtract
/// Corresponds to vector forms of ARM64 SUB &amp; FSUB
/// </summary>
public static Vector64<T> Subtract<T>(Vector64<T> left, Vector64<T> right) where T : struct { throw null; }
public static Vector128<T> Subtract<T>(Vector128<T> left, Vector128<T> right) where T : struct { throw null; }
/// <summary>
/// Vector exclusive or
/// Corresponds to vector forms of ARM64 EOR
/// </summary>
public static Vector64<T> Xor<T>(Vector64<T> left, Vector64<T> right) where T : struct { throw null; }
public static Vector128<T> Xor<T>(Vector128<T> left, Vector128<T> right) where T : struct { throw null; }
}
}
```
The Intel intrinsics tend to use generics for similar methods.
The Intel intrinsics tend to use generics for similar methods.
That is not what I saw in https://github.com/dotnet/coreclr/blob/master/src/mscorlib/src/System/Runtime/Intrinsics/X86/Sse.cs#L18-L41
I should have said similar patterns - if there are overloads for all primitive types, the Intel intrinsics just use generic.
For the Vector64<> methods usually all primitive type except if the size is such that it would not be a vector. Vector64<double>, Vector64<ulong>, and Vector64<long> are essentially nonsensical. I think we should use generics even in this case.
Comments?
cc @4creators
I think we should use generics even in this case.
I agree.
OK, I have updated the proposal to use generics. Will do the same in the other proposals.
Abs conceptually doesn't make sense for unsigned types, and currently the return type does not match the argument type. Multiply doesn't support long/ulong vectors. Negate conceptually doesn't make sense for unsigned primitives (but they could be treated as signed). Divide and Sqrt only support floating-point types. Intel HW intrinsics use a pattern like the following in Sse2 (the missing float overload is implemented in Sse):
```C#
/// <summary>
/// __m128i _mm_and_si128 (__m128i a, __m128i b)
/// </summary>
public static Vector128<byte> And(Vector128<byte> left, Vector128<byte> right) => And(left, right);
/// <summary>
/// __m128i _mm_and_si128 (__m128i a, __m128i b)
/// </summary>
public static Vector128<sbyte> And(Vector128<sbyte> left, Vector128<sbyte> right) => And(left, right);
/// <summary>
/// __m128i _mm_and_si128 (__m128i a, __m128i b)
/// </summary>
public static Vector128<short> And(Vector128<short> left, Vector128<short> right) => And(left, right);
/// <summary>
/// __m128i _mm_and_si128 (__m128i a, __m128i b)
/// </summary>
public static Vector128<ushort> And(Vector128<ushort> left, Vector128<ushort> right) => And(left, right);
/// <summary>
/// __m128i _mm_and_si128 (__m128i a, __m128i b)
/// </summary>
public static Vector128<int> And(Vector128<int> left, Vector128<int> right) => And(left, right);
/// <summary>
/// __m128i _mm_and_si128 (__m128i a, __m128i b)
/// </summary>
public static Vector128<uint> And(Vector128<uint> left, Vector128<uint> right) => And(left, right);
/// <summary>
/// __m128i _mm_and_si128 (__m128i a, __m128i b)
/// </summary>
public static Vector128<long> And(Vector128<long> left, Vector128<long> right) => And(left, right);
/// <summary>
/// __m128i _mm_and_si128 (__m128i a, __m128i b)
/// </summary>
public static Vector128<ulong> And(Vector128<ulong> left, Vector128<ulong> right) => And(left, right);
/// <summary>
/// __m128d _mm_and_pd (__m128d a, __m128d b)
/// </summary>
public static Vector128<double> And(Vector128<double> left, Vector128<double> right) => And(left, right);
```
If we use pattern like:
```C#
public static Vector128<T> And<T>(Vector128<T> left, Vector128<T> right) where T : struct => And(left, right);
```
then we have to artificially create generic constraints due to C# syntax limitations, which will limit the range of supported generic type parameters to supported types that are a subset of the struct constraint. This is currently done for Intel intrinsics in roughly 10 - 20 remaining functions declared without specializations (after dotnet/corefx#15601 is finalized and merged) as follows:
```C#
/// <summary>
/// __m256d _mm256_castpd128_pd256 (__m128d a)
/// __m256 _mm256_castps128_ps256 (__m128 a)
/// __m256i _mm256_castsi128_si256 (__m128i a)
/// </summary>
public static Vector256<T> ExtendToVector256<T>(Vector128<T> value) where T : struct
{
    ThrowHelper.ThrowNotSupportedExceptionIfNonNumericType<T>();
    return ExtendToVector256<T>(value);
}
```
This leads to breaking the guarantee of compile-time type safety for type-safe languages (C#, VB, F#), which from my perspective is hardly acceptable and is a grave error. I would prefer to write a few more lines of code providing overloads for every acceptable generic type parameter (it is not that hard) to uphold the compile-time type safety guarantee expected from all type-safe languages: if a program compiles without type errors, it should never throw type-usage-related exceptions at runtime. Otherwise we will land in a JavaScript-like situation, at least for parts of the available libraries, and cannot say that .NET Core is type safe.
This case ideally should be guarded by improved generics syntax (what is difficult to achieve in reasonably short time or at all), less ideally but workable by analyzer and surely not by runtime exceptions.
@4creators,
it is not that hard
Certainly not. The original draft had it that way. @jkotas Asked me to change it to be consistent with Intel.
The decision whether HW Intrinsic should use generics is really outside the scope of this discussion. @CarolEidt plans to update the https://github.com/dotnet/designs/blob/master/accepted/platform-intrinsics.md to handle these types of questions.
I certainly enjoy the type safety and compile time errors given by public static Vector128<double> And(Vector128<double> left, Vector128<double> right).
However, I also don't want to overload the API surface/etc with something that can be trivially handled by an analyzer (which we already know is going to be recommended for other places where enforcing preconditions at compile time is hard or impossible).
That being said, whichever route we go, I would want to see consistent rules (and for those to exist for Arm, x86, and any future architectures or ISAs added).
A good rule might just be always use generics for these APIs. The ISAs (such as SSE) which only support a single type (and will not be changing in the future, because they are locked at this point), can be explicit (Vector128<float> Add), but otherwise (for multiple overloads) we should always have a single API which is generic (Vector128<T> Add()). At the very least, such as rule is clean, consistent, and there won't be any discussions or objections about: "well, the rule was 2, but I had three, and its only one more"
This leads to breaking the guarantee of compile type safety for type safe languages
This does not break type safety. Breaking type safety means allowing a type A to be incorrectly referenced as type B (or similar). Admittedly this is a less than ideal solution, but I gave up arguing for spec changes to provide the appropriate constraints. The explosion of API signatures is, IMO, a non-trivial consideration, and the consensus is to use generics "where it makes sense" (I'm paraphrasing the discussion here).
However, we still need to clarify what it means for it to make sense. In the case where a single type is supported by a given target ISA (e.g. the Vector128<float> Add case mentioned above), it makes sense to "explode" the base types. We probably can't formulate hard and fast rules, but some guidelines can probably be established.
Admittedly this is a less than ideal solution, but I gave up arguing for spec changes to provide the appropriate constraints.
@CarolEidt
Sorry for diverging a bit from the main thread subject, but I think we are fast approaching a situation where arguments in favor of improving the C# spec — and the generics implementation in particular — may outweigh the language team's reluctance to accept them. A second problem, which was really painful and resulted in bloated code, was writing reasonably abstracted tests for intrinsics. The lack of numeric generic constraints makes C# hard to work with even in the case of simple mathematics without any vectors.
Requests for improved generic constraints besides higher level generics are one of the most often proposed in csharplang repo. IMO it could be a good moment to place a formal language proposal and discuss it again, I can even write Roslyn prototype what in my experience should be relatively easy, to show the estimate of proposed implementation cost. If one of the goals of dotnet is to provide high performance high level language than having improved generics is a must.
It should be mentioned that the intention is that Arm64 AndNot implements left & ~right.
This may be inconsistent with X86 which in the SIMD implementation was choosing ~left & right.
If we choose to use the same name. They need to be consistent.
I personally prefer the proposed Arm64 order.
Similar issue probably exists with OrNot
If we choose to use the same name. They need to be consistent.
I am not sure this is the case. If we had a software fallback that wrapped both, then yes. However, these are hardware intrinsics, and represent specific hardware instructions.
On x86, the documentation for andnps specifies the algorithm is:
DEST[31:0] = (NOT(SRC1[31:0])) BITWISE AND SRC2[31:0]
DEST[63:32] = (NOT(SRC1[63:32])) BITWISE AND SRC2[63:32]
DEST[95:64] = (NOT(SRC1[95:64])) BITWISE AND SRC2[95:64]
DEST[127:96] = (NOT(SRC1[127:96])) BITWISE AND SRC2[127:96]
On ARM, it looks like you use BIC which the documentation specifies is:
bits(datasize) operand1 = X[n];
bits(datasize) operand2 = ShiftReg(m, shift_type, shift_amount);
operand2 = NOT(operand2);
result = operand1 AND operand2;
X[d] = result;
This is a fundamental difference in the architectures but I don't think it is something we should be modifying the intrinsics for.
I find it quite unfortunate that the Intel instruction is called andnps, i.e. "AndNot", but it's really "NotAnd". That said, I tend to agree with @tannergooding that we probably want to keep closer consistency with the target, however in this case I think it would be good to use a different name.
Should the x86 name be updated to NotAnd, to better match what it is doing?
CC. @fiigii.
Should the x86 name be updated to NotAnd, to better match what it is doing?
I would be in favor of that, assuming @fiigii is OK with it.
I think, either way (fix it or leave it), it will cause some confusion to someone.
People used to working with x86 intrinsics will look for AndNot and will generally understand its behavior.
People used to working on ARM may expect the behavior ARM has, so there will be some context switch no matter what (either looking for the new name or understanding the architecture differences).
I think, either way (fix it or leave it), it will cause some confusion to someone.
People used to working with x86 intrinsics will look for AndNot and will generally understand its behavior
I find it quite unfortunate that the Intel instruction is called andnps, i.e. "AndNot", but it's really "NotAnd".
Agree these two statements both...
AndNot is good for helping people migrate from x86 C++ experience to .NET Core.
NotAnd is good for implementing an algorithm for both x86 and ARM in .NET Core.
Both are important, so no preference...
IMO there is quite a bit x86 asmcode written both in assembly and with intrinsics and perhaps due to that reason it is better to keep names as they are.
If we clearly point differences between both in documentation and in particular in intellisense via xml docs this should not be a problem for coders. Besides anyone who worked with both arm and x86 intrinsics or assembly is aware of underlying implementation surprises.
@eerhardt perhaps this would be a good time for a corefx person to chime in with their API perspective?
AndNotPrefix would be an option for the X86 prefixed not case.
I don't think there is a 100% right answer here. But I'll give my opinion.
We will have to have some sort of mapping between the C# API and the underlying hardware instruction, right? The way I've been trying out these APIs is to look them up on https://software.intel.com/sites/landingpage/IntrinsicsGuide/, finding the API I'm interested in, and reading about it there.
Or do we imagine that users would be successful just using the C# API and our docs?
If the developer is going to be constantly mapping between the C# API and the underlying intrinsic, it might make sense to keep the C# API close to the underlying intrinsic.
A goal of this feature is to expose the underlying architecture instructions (both their positives and negatives) to C#, right? It's not really a goal to try to make the x86 intrinsics look/act/etc like the ARM intrinsics.
But a typical use case for these methods is something like the following:
C#
if (Sse.IsSupported)
{
Sse.AndNot(left, right);
}
else if (Simd.IsSupported)
{
// some comment about why Simd.AndNot switches the operands from above
Simd.AndNot(right, left);
}
else
{
// software falback
}
It would seem that developers would always need to write that comment about why Simd.AndNot's parameters are switched from Sse.AndNot. So I think giving these functions different names probably makes sense.
If the developer is going to be constantly mapping between the C# API and the underlying intrinsic
It would be really nice if this was not the case.
The only open issue here is distinguishing between the different meanings of AndNot and OrNot
@eerhardt 's Current recommendation is for AndNot / OrNot to have different names.
Proposals for renaming:
NotAnd or AndNotPre for ~A & B; AndNot or AndNotPost for A & ~B. Similar naming would be used for ~A | B / A | ~B, ~A ^ B / A ^ ~B, ...
I think the naming convention which AndNotPre would allow intellisense to help quickly find the intrinsic w/o making mapping complicated.
Let's bring the naming issue up in the review meeting.
@eerhardt Any reason why this is marked future? The CoreCLR work is complete and ready for 2.1.
Looks good as proposed. Feedback:
Simd should be named AdvSimd0 so it's clear what the name refers to (ARM seems to use bit patterns to indicate what is supported, so it's not a named spec).
@terrajobst I think we refined the class name to A64Simd0 in later discussion.
Do we recall why this was changed from AdvSimd0 to A64Simd0?
Ah, I found the related reasoning. https://github.com/dotnet/corefx/issues/26574#issuecomment-418582589
ARM should live in System.Runtime.Intrinsics.Arm, i.e we should merge the Arm namespace with Arm64. We should resolve type conflicts between 32 and 64 by prefixes.
This means we need to differentiate between ARM32 AdvSimd and ARM64 AdvSimd. This would be applicable even if we had them in separate namespaces (i.e. S.R.I.Arm64.AdvSimd0 and S.R.I.Arm32.AdvSimd0) as users will be likely to import both.
This would be partially resolved by the discussion that was had here: https://github.com/dotnet/coreclr/pull/23622#discussion_r270974636. This was trying to determine if we could/should do something similar to x86/x64 and have AdvSimd0 and AdvSimd0.Arm64 since the Arm64 instructions are basically an extension of the Arm32 ones (even if they are considered different architectures).
CC. @TamarChristinaArm, @CarolEidt
@TamarChristinaArm @tannergooding I will work on implementing intrinsics in this PR if no one currently does.
Cool, thanks. There's no conflict with me — I'm still finishing off my other list :)