@tannergooding: Updated to match https://github.com/dotnet/corefx/issues/26581#issuecomment-539217015. The previous version is available in the comment history.
```C#
namespace System.Runtime.Intrinsics.Arm
{
public static class AdvSimd
{
/// <summary>
/// Reference stub: the getter body only throws.
/// NOTE(review): presumably reports whether the AdvSimd APIs in this class can be used on the current hardware — confirm against the implementation.
/// </summary>
public static bool IsSupported { get { throw null; } }
/// <summary>
/// Vector absolute compare greater than or equal.
/// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
/// Corresponds to vector forms of ARM64 FACGE
/// </summary>
/// <param name="left">Vector whose element magnitudes form the left side of the comparison.</param>
/// <param name="right">Vector whose element magnitudes form the right side of the comparison.</param>
/// <returns>Per-element mask: all bits set where the comparison holds, zero otherwise.</returns>
public static Vector64<float> AbsoluteCompareGreaterThanOrEqual(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> AbsoluteCompareGreaterThanOrEqual(Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector absolute compare greater than.
///
/// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
///
/// Corresponds to vector forms of ARM64 FACGT
/// </summary>
/// <param name="left">Vector whose element magnitudes form the left side of the comparison.</param>
/// <param name="right">Vector whose element magnitudes form the right side of the comparison.</param>
/// <returns>Per-element mask: all bits set where the comparison holds, zero otherwise.</returns>
public static Vector64<float> AbsoluteCompareGreaterThan(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> AbsoluteCompareGreaterThan(Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector absolute difference.
/// Corresponds to vector forms of ARM64 SABD, UABD &amp; FABD
/// </summary>
/// <remarks>
/// The signed integer overloads return the unsigned element type of the same width
/// (for example <c>Vector64&lt;sbyte&gt;</c> inputs produce <c>Vector64&lt;byte&gt;</c>).
/// </remarks>
public static Vector64<byte> AbsoluteDifference(Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<byte> AbsoluteDifference(Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> AbsoluteDifference(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<ushort> AbsoluteDifference(Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> AbsoluteDifference(Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<uint> AbsoluteDifference(Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector64<float> AbsoluteDifference(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<byte> AbsoluteDifference(Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<byte> AbsoluteDifference(Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> AbsoluteDifference(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<ushort> AbsoluteDifference(Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> AbsoluteDifference(Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<uint> AbsoluteDifference(Vector128<int> left, Vector128<int> right) { throw null; }
public static Vector128<float> AbsoluteDifference(Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector absolute difference add.
///
/// For each element result[elem] = acc[elem] + | left[elem] - right[elem] |
///
/// Corresponds to vector forms of ARM64 SABA, UABA
/// </summary>
/// <param name="acc">Accumulator vector that the absolute differences are added to.</param>
/// <param name="left">First operand of the difference.</param>
/// <param name="right">Second operand of the difference.</param>
/// <remarks>
/// NOTE(review): the signed overloads take a signed <paramref name="acc"/> but return the
/// unsigned element type of the same width — confirm this is the intended shape.
/// </remarks>
public static Vector64<byte> AbsoluteDifferenceAdd(Vector64<byte> acc, Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<byte> AbsoluteDifferenceAdd(Vector64<sbyte> acc, Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> AbsoluteDifferenceAdd(Vector64<ushort> acc, Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<ushort> AbsoluteDifferenceAdd(Vector64<short> acc, Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> AbsoluteDifferenceAdd(Vector64<uint> acc, Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<uint> AbsoluteDifferenceAdd(Vector64<int> acc, Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector128<byte> AbsoluteDifferenceAdd(Vector128<byte> acc, Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<byte> AbsoluteDifferenceAdd(Vector128<sbyte> acc, Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<short> acc, Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> AbsoluteDifferenceAdd(Vector128<uint> acc, Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<uint> AbsoluteDifferenceAdd(Vector128<int> acc, Vector128<int> left, Vector128<int> right) { throw null; }
/// <summary>
/// Vector add pairwise.
/// For each element result[elem] = 2*elem &lt; result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
/// Corresponds to vector forms of ARM64 ADDP &amp; FADDP
/// </summary>
/// <param name="left">Vector whose adjacent element pairs produce the lower-indexed half of the result.</param>
/// <param name="right">Vector whose adjacent element pairs produce the upper-indexed half of the result.</param>
/// <returns>Vector holding the pairwise sums.</returns>
/// <remarks>
/// These are plain (non-generic) overloads: a type-parameter list such as <c>&lt;byte&gt;</c>
/// is not valid C# because a keyword cannot name a type parameter.
/// </remarks>
public static Vector64<byte> AddPairwise(Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> AddPairwise(Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> AddPairwise(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<short> AddPairwise(Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<int> AddPairwise(Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector64<uint> AddPairwise(Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<float> AddPairwise(Vector64<float> left, Vector64<float> right) { throw null; }
/// <summary>
/// Vector extract from pair of vectors.
/// For each byte result[byte] = byte + index &lt; result.Length ? left[byte + index] : right[byte + index - result.Length]
///
/// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
///
/// Corresponds to vector forms of ARM64 EXT
/// </summary>
/// <param name="left">Vector supplying the result positions for which <c>position + index</c> is still in range.</param>
/// <param name="right">Vector supplying the remaining result positions.</param>
/// <param name="index">Constant starting offset into the concatenation of <paramref name="left"/> and <paramref name="right"/>.</param>
/// <remarks>
/// These are plain (non-generic) overloads: a type-parameter list such as <c>&lt;byte&gt;</c>
/// is not valid C#.
/// NOTE(review): the formula above is written per byte; confirm whether <paramref name="index"/>
/// counts bytes or elements for the overloads with wider element types.
/// </remarks>
public static Vector64<byte> ExtractVector(Vector64<byte> left, Vector64<byte> right, byte index) { throw null; }
public static Vector64<sbyte> ExtractVector(Vector64<sbyte> left, Vector64<sbyte> right, byte index) { throw null; }
public static Vector64<short> ExtractVector(Vector64<short> left, Vector64<short> right, byte index) { throw null; }
public static Vector64<ushort> ExtractVector(Vector64<ushort> left, Vector64<ushort> right, byte index) { throw null; }
public static Vector64<int> ExtractVector(Vector64<int> left, Vector64<int> right, byte index) { throw null; }
public static Vector64<uint> ExtractVector(Vector64<uint> left, Vector64<uint> right, byte index) { throw null; }
public static Vector128<byte> ExtractVector(Vector128<byte> left, Vector128<byte> right, byte index) { throw null; }
public static Vector128<sbyte> ExtractVector(Vector128<sbyte> left, Vector128<sbyte> right, byte index) { throw null; }
public static Vector128<short> ExtractVector(Vector128<short> left, Vector128<short> right, byte index) { throw null; }
public static Vector128<ushort> ExtractVector(Vector128<ushort> left, Vector128<ushort> right, byte index) { throw null; }
public static Vector128<int> ExtractVector(Vector128<int> left, Vector128<int> right, byte index) { throw null; }
public static Vector128<uint> ExtractVector(Vector128<uint> left, Vector128<uint> right, byte index) { throw null; }
public static Vector128<long> ExtractVector(Vector128<long> left, Vector128<long> right, byte index) { throw null; }
public static Vector128<ulong> ExtractVector(Vector128<ulong> left, Vector128<ulong> right, byte index) { throw null; }
public static Vector128<float> ExtractVector(Vector128<float> left, Vector128<float> right, byte index) { throw null; }
/// <summary>
/// Vector max numeric.
/// Corresponds to vector forms of ARM64 FMAXNM
/// </summary>
/// <param name="left">First operand vector.</param>
/// <param name="right">Second operand vector.</param>
public static Vector64<float> MaxNumeric(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MaxNumeric(Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector max pairwise.
///
/// For each element result[elem] = 2*elem &lt; result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 SMAXP, UMAXP &amp; FMAXP
/// </summary>
/// <param name="left">Vector whose adjacent element pairs produce the lower-indexed half of the result.</param>
/// <param name="right">Vector whose adjacent element pairs produce the upper-indexed half of the result.</param>
public static Vector64<byte> MaxPairwise(Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> MaxPairwise(Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> MaxPairwise(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<short> MaxPairwise(Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> MaxPairwise(Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<int> MaxPairwise(Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector64<float> MaxPairwise(Vector64<float> left, Vector64<float> right) { throw null; }
/// <summary>
/// Vector min numeric.
/// Corresponds to vector forms of ARM64 FMINNM
/// </summary>
/// <param name="left">First operand vector.</param>
/// <param name="right">Second operand vector.</param>
public static Vector64<float> MinNumeric(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MinNumeric(Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector min pairwise.
///
/// For each element result[elem] = 2*elem &lt; result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 SMINP, UMINP &amp; FMINP
/// </summary>
/// <param name="left">Vector whose adjacent element pairs produce the lower-indexed half of the result.</param>
/// <param name="right">Vector whose adjacent element pairs produce the upper-indexed half of the result.</param>
public static Vector64<byte> MinPairwise(Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> MinPairwise(Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> MinPairwise(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<short> MinPairwise(Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> MinPairwise(Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<int> MinPairwise(Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector64<float> MinPairwise(Vector64<float> left, Vector64<float> right) { throw null; }
/// <summary>
/// Vector multiply add.
///
/// For each element result[elem] = acc[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 MLA
/// </summary>
/// <param name="acc">Accumulator vector the products are added to.</param>
/// <param name="left">First multiplicand vector.</param>
/// <param name="right">Second multiplicand vector.</param>
public static Vector64<byte> MultiplyAdd(Vector64<byte> acc, Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> MultiplyAdd(Vector64<sbyte> acc, Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> MultiplyAdd(Vector64<ushort> acc, Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<short> MultiplyAdd(Vector64<short> acc, Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> MultiplyAdd(Vector64<uint> acc, Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<int> MultiplyAdd(Vector64<int> acc, Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector128<byte> MultiplyAdd(Vector128<byte> acc, Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> MultiplyAdd(Vector128<sbyte> acc, Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> MultiplyAdd(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<short> MultiplyAdd(Vector128<short> acc, Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> MultiplyAdd(Vector128<uint> acc, Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<int> MultiplyAdd(Vector128<int> acc, Vector128<int> left, Vector128<int> right) { throw null; }
/// <summary>
/// Vector multiply add by element.
///
/// For each element result[elem] = acc[elem] + left[elem] * right
///
/// Corresponds to vector forms of ARM64 MLA
/// </summary>
/// <param name="acc">Accumulator vector the products are added to.</param>
/// <param name="left">Vector whose elements are each multiplied by <paramref name="right"/>.</param>
/// <param name="right">Scalar multiplier applied to every element of <paramref name="left"/>.</param>
public static Vector64<byte> MultiplyAdd(Vector64<byte> acc, Vector64<byte> left, byte right) { throw null; }
public static Vector64<sbyte> MultiplyAdd(Vector64<sbyte> acc, Vector64<sbyte> left, sbyte right) { throw null; }
public static Vector64<ushort> MultiplyAdd(Vector64<ushort> acc, Vector64<ushort> left, ushort right) { throw null; }
public static Vector64<short> MultiplyAdd(Vector64<short> acc, Vector64<short> left, short right) { throw null; }
public static Vector64<uint> MultiplyAdd(Vector64<uint> acc, Vector64<uint> left, uint right) { throw null; }
public static Vector64<int> MultiplyAdd(Vector64<int> acc, Vector64<int> left, int right) { throw null; }
public static Vector128<byte> MultiplyAdd(Vector128<byte> acc, Vector128<byte> left, byte right) { throw null; }
public static Vector128<sbyte> MultiplyAdd(Vector128<sbyte> acc, Vector128<sbyte> left, sbyte right) { throw null; }
public static Vector128<ushort> MultiplyAdd(Vector128<ushort> acc, Vector128<ushort> left, ushort right) { throw null; }
public static Vector128<short> MultiplyAdd(Vector128<short> acc, Vector128<short> left, short right) { throw null; }
public static Vector128<uint> MultiplyAdd(Vector128<uint> acc, Vector128<uint> left, uint right) { throw null; }
public static Vector128<int> MultiplyAdd(Vector128<int> acc, Vector128<int> left, int right) { throw null; }
/// <summary>
/// Vector multiply subtract.
///
/// For each element result[elem] = acc[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 MLS
/// </summary>
/// <param name="acc">Accumulator vector the products are subtracted from.</param>
/// <param name="left">First multiplicand vector.</param>
/// <param name="right">Second multiplicand vector.</param>
public static Vector64<byte> MultiplySubtract(Vector64<byte> acc, Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> MultiplySubtract(Vector64<sbyte> acc, Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> MultiplySubtract(Vector64<ushort> acc, Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<short> MultiplySubtract(Vector64<short> acc, Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> MultiplySubtract(Vector64<uint> acc, Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<int> MultiplySubtract(Vector64<int> acc, Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector128<byte> MultiplySubtract(Vector128<byte> acc, Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> MultiplySubtract(Vector128<sbyte> acc, Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> MultiplySubtract(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<short> MultiplySubtract(Vector128<short> acc, Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> MultiplySubtract(Vector128<uint> acc, Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<int> MultiplySubtract(Vector128<int> acc, Vector128<int> left, Vector128<int> right) { throw null; }
/// <summary>
/// Vector multiply subtract by element.
///
/// For each element result[elem] = acc[elem] - left[elem] * right
///
/// Corresponds to vector forms of ARM64 MLS
/// </summary>
/// <param name="acc">Accumulator vector the products are subtracted from.</param>
/// <param name="left">Vector whose elements are each multiplied by <paramref name="right"/>.</param>
/// <param name="right">Scalar multiplier applied to every element of <paramref name="left"/>.</param>
public static Vector64<byte> MultiplySubtract(Vector64<byte> acc, Vector64<byte> left, byte right) { throw null; }
public static Vector64<sbyte> MultiplySubtract(Vector64<sbyte> acc, Vector64<sbyte> left, sbyte right) { throw null; }
public static Vector64<ushort> MultiplySubtract(Vector64<ushort> acc, Vector64<ushort> left, ushort right) { throw null; }
public static Vector64<short> MultiplySubtract(Vector64<short> acc, Vector64<short> left, short right) { throw null; }
public static Vector64<uint> MultiplySubtract(Vector64<uint> acc, Vector64<uint> left, uint right) { throw null; }
public static Vector64<int> MultiplySubtract(Vector64<int> acc, Vector64<int> left, int right) { throw null; }
public static Vector128<byte> MultiplySubtract(Vector128<byte> acc, Vector128<byte> left, byte right) { throw null; }
public static Vector128<sbyte> MultiplySubtract(Vector128<sbyte> acc, Vector128<sbyte> left, sbyte right) { throw null; }
public static Vector128<ushort> MultiplySubtract(Vector128<ushort> acc, Vector128<ushort> left, ushort right) { throw null; }
public static Vector128<short> MultiplySubtract(Vector128<short> acc, Vector128<short> left, short right) { throw null; }
public static Vector128<uint> MultiplySubtract(Vector128<uint> acc, Vector128<uint> left, uint right) { throw null; }
public static Vector128<int> MultiplySubtract(Vector128<int> acc, Vector128<int> left, int right) { throw null; }
/// <summary>
/// Vector fused multiply add.
///
/// For each element result[elem] = acc[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLA
/// </summary>
/// <param name="acc">Accumulator vector the products are added to.</param>
/// <param name="left">First multiplicand vector.</param>
/// <param name="right">Second multiplicand vector.</param>
public static Vector64<float> FusedMultiplyAdd(Vector64<float> acc, Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> FusedMultiplyAdd(Vector128<float> acc, Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector fused multiply subtract.
///
/// For each element result[elem] = acc[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLS
/// </summary>
/// <param name="acc">Accumulator vector the products are subtracted from.</param>
/// <param name="left">First multiplicand vector.</param>
/// <param name="right">Second multiplicand vector.</param>
public static Vector64<float> FusedMultiplySubtract(Vector64<float> acc, Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> FusedMultiplySubtract(Vector128<float> acc, Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector polynomial multiply.
/// Corresponds to vector forms of ARM64 PMUL
/// </summary>
/// <param name="left">First operand vector.</param>
/// <param name="right">Second operand vector.</param>
public static Vector64<byte> PolynomialMultiply(Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> PolynomialMultiply(Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector128<byte> PolynomialMultiply(Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> PolynomialMultiply(Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
/// <summary>
/// Vector reciprocal estimate.
///
/// See FRECPE docs
///
/// Corresponds to vector forms of ARM64 FRECPE
/// </summary>
/// <param name="value">Input vector.</param>
public static Vector64<float> ReciprocalEstimate(Vector64<float> value) { throw null; }
public static Vector128<float> ReciprocalEstimate(Vector128<float> value) { throw null; }
/// <summary>
/// Vector reciprocal step.
///
/// See FRECPS docs
///
/// Corresponds to vector forms of ARM64 FRECPS
/// </summary>
/// <param name="left">First operand vector.</param>
/// <param name="right">Second operand vector.</param>
/// <param name="index">
/// NOTE(review): not described by the summary; looks like it may be a leftover from the
/// ExtractVector signature — confirm whether FRECPS needs an immediate at all.
/// </param>
public static Vector64<float> ReciprocalStep(Vector64<float> left, Vector64<float> right, byte index) { throw null; }
public static Vector128<float> ReciprocalStep(Vector128<float> left, Vector128<float> right, byte index) { throw null; }
/// <summary>
/// Vector reciprocal square root estimate.
///
/// See FRSQRTE docs
///
/// Corresponds to vector forms of ARM64 FRSQRTE
/// </summary>
/// <param name="value">Input vector.</param>
public static Vector64<float> ReciprocalSquareRootEstimate(Vector64<float> value) { throw null; }
public static Vector128<float> ReciprocalSquareRootEstimate(Vector128<float> value) { throw null; }
/// <summary>
/// Vector reciprocal square root step.
///
/// See FRSQRTS docs
///
/// Corresponds to vector forms of ARM64 FRSQRTS
/// </summary>
/// <param name="left">First operand vector.</param>
/// <param name="right">Second operand vector.</param>
/// <param name="index">
/// NOTE(review): not described by the summary; looks like it may be a leftover from the
/// ExtractVector signature — confirm whether FRSQRTS needs an immediate at all.
/// </param>
/// <remarks>
/// Named <c>ReciprocalSquareRootStep</c> (parallel to <c>ReciprocalStep</c>) so the FRSQRTS
/// step operation is not exposed as an overload of the FRSQRTE estimate method.
/// </remarks>
public static Vector64<float> ReciprocalSquareRootStep(Vector64<float> left, Vector64<float> right, byte index) { throw null; }
public static Vector128<float> ReciprocalSquareRootStep(Vector128<float> left, Vector128<float> right, byte index) { throw null; }
/// <summary>
/// Vector reverse element bytes.
/// Corresponds to vector forms of ARM64 REV16, REV32, REV64
/// </summary>
/// <param name="value">Input vector.</param>
public static Vector64<ushort> ReverseElementBytes(Vector64<ushort> value) { throw null; }
public static Vector64<short> ReverseElementBytes(Vector64<short> value) { throw null; }
public static Vector64<uint> ReverseElementBytes(Vector64<uint> value) { throw null; }
public static Vector64<int> ReverseElementBytes(Vector64<int> value) { throw null; }
public static Vector64<float> ReverseElementBytes(Vector64<float> value) { throw null; }
public static Vector128<ushort> ReverseElementBytes(Vector128<ushort> value) { throw null; }
public static Vector128<short> ReverseElementBytes(Vector128<short> value) { throw null; }
public static Vector128<uint> ReverseElementBytes(Vector128<uint> value) { throw null; }
public static Vector128<int> ReverseElementBytes(Vector128<int> value) { throw null; }
public static Vector128<ulong> ReverseElementBytes(Vector128<ulong> value) { throw null; }
public static Vector128<long> ReverseElementBytes(Vector128<long> value) { throw null; }
public static Vector128<float> ReverseElementBytes(Vector128<float> value) { throw null; }
/// <summary>
/// Nested container for the float multiply-accumulate stubs documented as mapping to the
/// ARM32 VMLA / VMLS instructions.
/// </summary>
public static class Arm32
{
/// <summary>
/// Reference stub: the getter body only throws.
/// NOTE(review): presumably reports whether the Arm32-specific APIs can be used — confirm against the implementation.
/// </summary>
public static bool IsSupported { get { throw null; } }
/// <summary>
/// Vector multiply add.
///
/// For each element result[elem] = acc[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM32 VMLA
/// </summary>
public static Vector64<float> MultiplyAdd(Vector64<float> acc, Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MultiplyAdd(Vector128<float> acc, Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector multiply add by element.
///
/// For each element result[elem] = acc[elem] + left[elem] * right
///
/// Corresponds to vector forms of ARM32 VMLA
/// </summary>
public static Vector64<float> MultiplyAdd(Vector64<float> acc, Vector64<float> left, float right) { throw null; }
public static Vector128<float> MultiplyAdd(Vector128<float> acc, Vector128<float> left, float right) { throw null; }
/// <summary>
/// Vector multiply subtract.
///
/// For each element result[elem] = acc[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM32 VMLS
/// </summary>
public static Vector64<float> MultiplySubtract(Vector64<float> acc, Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MultiplySubtract(Vector128<float> acc, Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector multiply subtract by element.
///
/// For each element result[elem] = acc[elem] - left[elem] * right
///
/// Corresponds to vector forms of ARM32 VMLS
/// </summary>
public static Vector64<float> MultiplySubtract(Vector64<float> acc, Vector64<float> left, float right) { throw null; }
public static Vector128<float> MultiplySubtract(Vector128<float> acc, Vector128<float> left, float right) { throw null; }
}
public static class Arm64
{
/// <summary>
/// Reference stub: the getter body only throws.
/// NOTE(review): presumably reports whether the Arm64-specific APIs can be used — confirm against the implementation.
/// </summary>
public static bool IsSupported { get { throw null; } }
/// <summary>
/// Vector absolute compare greater than or equal (double-precision overload).
/// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
/// Corresponds to vector forms of ARM64 FACGE
/// </summary>
/// <param name="left">Vector whose element magnitudes form the left side of the comparison.</param>
/// <param name="right">Vector whose element magnitudes form the right side of the comparison.</param>
/// <returns>Per-element mask: all bits set where the comparison holds, zero otherwise.</returns>
public static Vector128<double> AbsoluteCompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector absolute compare greater than (double-precision overload).
///
/// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
///
/// Corresponds to vector forms of ARM64 FACGT
/// </summary>
/// <param name="left">Vector whose element magnitudes form the left side of the comparison.</param>
/// <param name="right">Vector whose element magnitudes form the right side of the comparison.</param>
/// <returns>Per-element mask: all bits set where the comparison holds, zero otherwise.</returns>
public static Vector128<double> AbsoluteCompareGreaterThan(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector absolute difference (only the double-precision overload is declared here).
/// Corresponds to vector forms of ARM64 SABD, UABD &amp; FABD
/// </summary>
/// <param name="left">First operand vector.</param>
/// <param name="right">Second operand vector.</param>
public static Vector128<double> AbsoluteDifference(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector add pairwise.
/// For each element result[elem] = 2*elem &lt; result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
/// Corresponds to vector forms of ARM64 ADDP &amp; FADDP
/// </summary>
/// <param name="left">Vector whose adjacent element pairs produce the lower-indexed half of the result.</param>
/// <param name="right">Vector whose adjacent element pairs produce the upper-indexed half of the result.</param>
/// <returns>Vector holding the pairwise sums, with the same element type as the inputs.</returns>
/// <remarks>
/// These are plain (non-generic) overloads. Each returns the concrete vector type of its
/// operands; <c>Vector128&lt;T&gt;</c> cannot be used because no type parameter <c>T</c> is in scope.
/// </remarks>
public static Vector128<byte> AddPairwise(Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> AddPairwise(Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> AddPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<short> AddPairwise(Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<long> AddPairwise(Vector128<long> left, Vector128<long> right) { throw null; }
public static Vector128<ulong> AddPairwise(Vector128<ulong> left, Vector128<ulong> right) { throw null; }
public static Vector128<float> AddPairwise(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> AddPairwise(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector extract from pair of vectors (double-precision overload).
/// For each byte result[byte] = byte + index &lt; result.Length ? left[byte + index] : right[byte + index - result.Length]
///
/// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
///
/// Corresponds to vector forms of ARM64 EXT
/// </summary>
/// <param name="left">Vector supplying the result positions for which <c>position + index</c> is still in range.</param>
/// <param name="right">Vector supplying the remaining result positions.</param>
/// <param name="index">Constant starting offset into the concatenation of <paramref name="left"/> and <paramref name="right"/>.</param>
/// <remarks>
/// Declared as a plain (non-generic) overload: a type-parameter list such as
/// <c>&lt;double&gt;</c> is not valid C#.
/// NOTE(review): the formula above is written per byte; confirm whether <paramref name="index"/>
/// counts bytes or elements for this element type.
/// </remarks>
public static Vector128<double> ExtractVector(Vector128<double> left, Vector128<double> right, byte index) { throw null; }
/// <summary>
/// Vector add across vector elements.
/// Corresponds to vector forms of ARM64 ADDV
/// </summary>
/// <param name="value">Input vector.</param>
/// <returns>A scalar of the vector's element type.</returns>
public static byte AddAcross(Vector64<byte> value) { throw null; }
public static sbyte AddAcross(Vector64<sbyte> value) { throw null; }
public static ushort AddAcross(Vector64<ushort> value) { throw null; }
public static short AddAcross(Vector64<short> value) { throw null; }
public static uint AddAcross(Vector64<uint> value) { throw null; }
public static int AddAcross(Vector64<int> value) { throw null; }
public static byte AddAcross(Vector128<byte> value) { throw null; }
public static sbyte AddAcross(Vector128<sbyte> value) { throw null; }
public static ushort AddAcross(Vector128<ushort> value) { throw null; }
public static short AddAcross(Vector128<short> value) { throw null; }
public static uint AddAcross(Vector128<uint> value) { throw null; }
public static int AddAcross(Vector128<int> value) { throw null; }
/// <summary>
/// Vector max numeric (double-precision overload).
/// Corresponds to vector forms of ARM64 FMAXNM
/// </summary>
/// <param name="left">First operand vector.</param>
/// <param name="right">Second operand vector.</param>
public static Vector128<double> MaxNumeric(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector max numeric pairwise.
///
/// For each element result[elem] = 2*elem &lt; result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 FMAXNMP
/// </summary>
/// <param name="left">Vector whose adjacent element pairs produce the lower-indexed half of the result.</param>
/// <param name="right">Vector whose adjacent element pairs produce the upper-indexed half of the result.</param>
public static Vector64<float> MaxNumericPairwise(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MaxNumericPairwise(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> MaxNumericPairwise(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector max numeric across.
///
/// result = max(value[0], ... , value[length - 1])
///
/// Corresponds to vector forms of ARM64 FMAXNMV
/// </summary>
/// <param name="value">Input vector.</param>
/// <returns>A scalar of the vector's element type.</returns>
public static float MaxNumericAcross(Vector128<float> value) { throw null; }
/// <summary>
/// Vector max pairwise.
///
/// For each element result[elem] = 2*elem &lt; result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 SMAXP, UMAXP &amp; FMAXP
/// </summary>
/// <param name="left">Vector whose adjacent element pairs produce the lower-indexed half of the result.</param>
/// <param name="right">Vector whose adjacent element pairs produce the upper-indexed half of the result.</param>
public static Vector128<byte> MaxPairwise(Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> MaxPairwise(Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> MaxPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<short> MaxPairwise(Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> MaxPairwise(Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<int> MaxPairwise(Vector128<int> left, Vector128<int> right) { throw null; }
public static Vector128<float> MaxPairwise(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> MaxPairwise(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector max across.
///
/// result = max(value[0], ... , value[length - 1])
///
/// Corresponds to vector forms of ARM64 SMAXV, UMAXV &amp; FMAXV
/// </summary>
/// <param name="value">Input vector.</param>
/// <returns>A scalar of the vector's element type.</returns>
public static byte MaxAcross(Vector64<byte> value) { throw null; }
public static sbyte MaxAcross(Vector64<sbyte> value) { throw null; }
public static ushort MaxAcross(Vector64<ushort> value) { throw null; }
public static short MaxAcross(Vector64<short> value) { throw null; }
public static uint MaxAcross(Vector64<uint> value) { throw null; }
public static int MaxAcross(Vector64<int> value) { throw null; }
public static float MaxAcross(Vector64<float> value) { throw null; }
public static byte MaxAcross(Vector128<byte> value) { throw null; }
public static sbyte MaxAcross(Vector128<sbyte> value) { throw null; }
public static ushort MaxAcross(Vector128<ushort> value) { throw null; }
public static short MaxAcross(Vector128<short> value) { throw null; }
public static uint MaxAcross(Vector128<uint> value) { throw null; }
public static int MaxAcross(Vector128<int> value) { throw null; }
public static ulong MaxAcross(Vector128<ulong> value) { throw null; }
public static long MaxAcross(Vector128<long> value) { throw null; }
public static float MaxAcross(Vector128<float> value) { throw null; }
public static double MaxAcross(Vector128<double> value) { throw null; }
/// <summary>
/// Vector min numeric (double-precision overload).
/// Corresponds to vector forms of ARM64 FMINNM
/// </summary>
/// <param name="left">First operand vector.</param>
/// <param name="right">Second operand vector.</param>
public static Vector128<double> MinNumeric(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector min numeric pairwise.
///
/// For each element result[elem] = 2*elem &lt; result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 FMINNMP
/// </summary>
/// <param name="left">Vector whose adjacent element pairs produce the lower-indexed half of the result.</param>
/// <param name="right">Vector whose adjacent element pairs produce the upper-indexed half of the result.</param>
/// <remarks>
/// Named <c>MinNumericPairwise</c> to match the FMINNMP mapping; declaring these as
/// <c>MaxNumericPairwise</c> would duplicate the max overloads' signatures exactly.
/// </remarks>
public static Vector64<float> MinNumericPairwise(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MinNumericPairwise(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> MinNumericPairwise(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector min numeric across.
///
/// result = min(value[0], ... , value[length - 1])
///
/// Corresponds to vector forms of ARM64 FMINNMV
/// </summary>
/// <param name="value">Input vector.</param>
/// <returns>A scalar of the vector's element type.</returns>
/// <remarks>
/// Named <c>MinNumericAcross</c> to match the FMINNMV mapping; declaring it as
/// <c>MaxNumericAcross</c> would duplicate the max overload's signature exactly.
/// </remarks>
public static float MinNumericAcross(Vector128<float> value) { throw null; }
/// <summary>
/// Vector min pairwise.
///
/// For each element result[elem] = 2*elem &lt; result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 SMINP, UMINP &amp; FMINP
/// </summary>
/// <param name="left">Vector whose adjacent element pairs produce the lower-indexed half of the result.</param>
/// <param name="right">Vector whose adjacent element pairs produce the upper-indexed half of the result.</param>
public static Vector128<byte> MinPairwise(Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> MinPairwise(Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> MinPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<short> MinPairwise(Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> MinPairwise(Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<int> MinPairwise(Vector128<int> left, Vector128<int> right) { throw null; }
public static Vector128<float> MinPairwise(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> MinPairwise(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector min across
///
/// Reduces all elements of value to a single scalar of the element type.
///
/// result = min(value[0], ... , value[length - 1])
///
/// Corresponds to vector forms of ARM64 SMINV, UMINV & FMINV
/// </summary>
public static byte MinAcross(Vector64<byte> value) { throw null; }
public static sbyte MinAcross(Vector64<sbyte> value) { throw null; }
public static ushort MinAcross(Vector64<ushort> value) { throw null; }
public static short MinAcross(Vector64<short> value) { throw null; }
public static uint MinAcross(Vector64<uint> value) { throw null; }
public static int MinAcross(Vector64<int> value) { throw null; }
public static float MinAcross(Vector64<float> value) { throw null; }
public static byte MinAcross(Vector128<byte> value) { throw null; }
public static sbyte MinAcross(Vector128<sbyte> value) { throw null; }
public static ushort MinAcross(Vector128<ushort> value) { throw null; }
public static short MinAcross(Vector128<short> value) { throw null; }
public static uint MinAcross(Vector128<uint> value) { throw null; }
public static int MinAcross(Vector128<int> value) { throw null; }
public static float MinAcross(Vector128<float> value) { throw null; }
public static double MinAcross(Vector128<double> value) { throw null; }
/// <summary>
/// Vector fused multiply add
///
/// For each element result[elem] = acc[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLA
/// </summary>
public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector fused multiply add by element
///
/// For each element result[elem] = acc[elem] + left[elem] * right
///
/// The scalar right operand has the same element type as the vectors:
/// float for the single-precision overloads, double for the
/// double-precision overload.
///
/// Corresponds to vector forms of ARM64 FMLA
/// </summary>
public static Vector64<float> FusedMultiplyAdd(Vector64<float> acc, Vector64<float> left, float right) { throw null; }
public static Vector128<float> FusedMultiplyAdd(Vector128<float> acc, Vector128<float> left, float right) { throw null; }
public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, double right) { throw null; }
/// <summary>
/// Vector fused multiply subtract
///
/// For each element result[elem] = acc[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLS
/// </summary>
public static Vector128<double> FusedMultiplySubtract(Vector128<double> acc, Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector fused multiply subtract by element
///
/// For each element result[elem] = acc[elem] - left[elem] * right
///
/// The scalar right operand has the same element type as the vectors:
/// float for the single-precision overloads, double for the
/// double-precision overload.
///
/// Corresponds to vector forms of ARM64 FMLS
/// </summary>
public static Vector64<float> FusedMultiplySubtract(Vector64<float> acc, Vector64<float> left, float right) { throw null; }
public static Vector128<float> FusedMultiplySubtract(Vector128<float> acc, Vector128<float> left, float right) { throw null; }
public static Vector128<double> FusedMultiplySubtract(Vector128<double> acc, Vector128<double> left, double right) { throw null; }
/// <summary>
/// Vector multiply extend
///
/// For each element result[elem] = left[elem] * right[elem]
/// Handle extend special cases zero and infinite. FMULX
///
/// Corresponds to vector forms of ARM64 FMULX
/// </summary>
public static Vector64<float> MultiplyExtend(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MultiplyExtend(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> MultiplyExtend(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector multiply extend by element
///
/// For each element result[elem] = left[elem] * right
/// Handle extend special cases zero and infinite. FMULX
///
/// Corresponds to vector forms of ARM64 FMULX
/// </summary>
public static Vector64<float> MultiplyExtend(Vector64<float> left, float right) { throw null; }
public static Vector128<float> MultiplyExtend(Vector128<float> left, float right) { throw null; }
public static Vector128<double> MultiplyExtend(Vector128<double> left, double right) { throw null; }
/// <summary>
/// Vector reciprocal estimate
///
/// See FRECPE docs
///
/// Corresponds to vector forms of ARM64 FRECPE
/// </summary>
public static Vector128<double> ReciprocalEstimate(Vector128<double> value) { throw null; }
/// <summary>
/// Vector reciprocal step
///
/// See FRECPS docs
///
/// Corresponds to vector forms of ARM64 FRECPS
/// </summary>
public static Vector128<double> ReciprocalStep(Vector128<double> left, Vector128<double> right, byte index) { throw null; }
/// <summary>
/// Vector reciprocal square root estimate
///
/// See FRSQRTE docs
///
/// Corresponds to vector forms of ARM64 FRSQRTE
/// </summary>
public static Vector128<double> ReciprocalSquareRootEstimate(Vector128<double> value) { throw null; }
/// <summary>
/// Vector reciprocal square root step
///
/// See FRSQRTS docs
///
/// NOTE(review): the index parameter has no obvious counterpart in the
/// two-operand vector form of FRSQRTS - confirm whether it is intended.
///
/// Corresponds to vector forms of ARM64 FRSQRTS
/// </summary>
public static Vector128<double> ReciprocalSquareRootStep(Vector128<double> left, Vector128<double> right, byte index) { throw null; }
/// <summary>
/// Vector reverse byte bits
/// Corresponds to vector forms of ARM64 RBIT
/// </summary>
public static Vector64<byte> ReverseElementBits(Vector64<byte> value) { throw null; }
public static Vector64<sbyte> ReverseElementBits(Vector64<sbyte> value) { throw null; }
public static Vector128<byte> ReverseElementBits(Vector128<byte> value) { throw null; }
public static Vector128<sbyte> ReverseElementBits(Vector128<sbyte> value) { throw null; }
}
}
}
```
This is the next wave of SIMD instructions which I plan to implement
@CarolEidt @RussKeldorph @eerhardt PTAL
@tannergooding @4creators @fiigii @dotnet/arm64-contrib @dotnet/jit-contrib
This introduces more complicated intrinsic overloads. For instance
Multiply(Vector64<float>, Vector64<float>)
Multiply(Vector64<float>, float)
This introduces some implementation complexity. lookupHWIntrinsic
must check Method arguments.
It may make the API less safe.
The second form represents multiply by vector element. It could be renamed to
MultiplyByElement(Vector64<float>, float)
There are other cases Extract
, MultiplyAdd
...
Opinions?
Looks like X86 intrinsics is using MultiplyScalar(Vector64<float>, float)
for Arm64's
MultiplyByElement(Vector64<float>, float)
. I'm OK with renaming.
Looks like X86 intrinsics is using MultiplyScalar(Vector64
, float) for Arm64's
MultiplyByElement(Vector64, float). I'm OK with renaming.
X86 does not have MultiplyScalar(Vector64<float>, float)
. In Intel HW intrinsics, Scalar
means operating over Vector128<T>
but only computing the first element.
Hi, I'm wondering what the status of this proposal is?
@TamarChristinaArm When I moved to Microsoft, I stopped championing this. In my opinion it was in good shape when I left it.
The proposal is probably a little out of date. When some of the other intrinsic API were approved, the namespace changed.
If someone was motivated to implement this and we had consensus, the next step would be to mark this as API ready for review and have an API design review.
In my opinion it was in good shape when I left it.
@sdmaclea I agree. I have some of this implemented as I was going off a different list, but it would be best just to get these approved.
The namespace change shouldn't affect this much so I think it's fine to review as is. The other approved APIs need slight changes due to the namespaces too but it's easier to have them all approved as is (as it's mostly about the intrinsics themselves no? the namespace they end up in is determined by the ISA).
If someone was motivated to implement this and we had consensus, the next step would be to mark this as API ready for review and have an API design review.
How should I go about this? Do I just add the label?
I doubt you can add the label. I'll add it. I just tried but GitHub seems to have issues at the moment.
The namespace change shouldn't affect this much so I think it's fine to review as is.
Part of this is trivial and just involves updating the class name and namespace name.
The more difficult part comes from pulling out what is shared vs what is ARM64 specific; which needs to be done anyways.
I'm fine with marking this ready-for-review, but I'd like to see us get the proposal updated before it is reviewed, if possible. It tends to make the entire process easier and is ultimately part of implementing it anyways.
@terrajobst Can we schedule this for API review? Can we include Arm on the call?
@tannergooding @TamarChristinaArm My head is not in this space at the moment. I am happy to play admin, but I can't drive this.
If you comment on what needs changed, I am happy to update.
Either way marking ready for review seems fine. It will take at least a week to schedule the review. We should clean up as quickly as practical.
I'll post the changes required to adhere to dotnet/corefx#37199 today. I've already started on it.
If you comment on what needs changed, I am happy to update.
I can modify the original comment on any of these and you shouldn't need to worry about it 😄
I can also handle ensuring this gets a slot on the API review schedule, etc.
Extract
in this proposal uses the same name for the intrinsics as in dotnet/runtime#24588, there's no clash because the overloads are different but they are completely different intrinsics. Should this one instead be something like ExtractVector
?
I think ExtractVector
makes sense, given my understanding of the API.
new list below
changes:
- `Multiply` (already implemented in dotnet/runtime#24588)
- `Extract` to `ExtractVector`
- `AddPairwise` away from generics
- `ExtractVector` away from generics
- `MaxNumericAcross`.
- `FRECPX` as that has no vector versions
- `A64` only intrinsics

I believe this is the full rewritten list. I've asked a question internally about `MLA` and will update this tomorrow if needed when I get an answer.
namespace System.Runtime.Intrinsics.Arm
{
public static class Simd
{
public static bool IsSupported { get { throw null; } }
/// <summary>
/// Vector CompareGreaterThanOrEqual
/// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
/// Corresponds to vector forms of ARM64 FACGE
/// </summary>
public static Vector64<float> AbsoluteCompareGreaterThanOrEqual(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> AbsoluteCompareGreaterThanOrEqual(Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector CompareGreaterThan
///
/// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
///
/// Corresponds to vector forms of ARM64 FACGT
/// </summary>
public static Vector64<float> AbsoluteCompareGreaterThan(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> AbsoluteCompareGreaterThan(Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector absolute difference
/// Corresponds to vector forms of ARM64 SABD, UABD & FABD
/// </summary>
public static Vector64<byte> AbsoluteDifference(Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<byte> AbsoluteDifference(Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> AbsoluteDifference(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<ushort> AbsoluteDifference(Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> AbsoluteDifference(Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<uint> AbsoluteDifference(Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector64<float> AbsoluteDifference(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<byte> AbsoluteDifference(Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<byte> AbsoluteDifference(Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> AbsoluteDifference(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<ushort> AbsoluteDifference(Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> AbsoluteDifference(Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<uint> AbsoluteDifference(Vector128<int> left, Vector128<int> right) { throw null; }
public static Vector128<float> AbsoluteDifference(Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector absolute difference add
///
/// For each element result[elem] = acc[elem] + | left[elem] - right[elem] |
///
/// Corresponds to vector forms of ARM64 SABA, UABA
/// </summary>
public static Vector64<byte> AbsoluteDifferenceAdd(Vector64<byte> acc, Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<byte> AbsoluteDifferenceAdd(Vector64<sbyte> acc, Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> AbsoluteDifferenceAdd(Vector64<ushort> acc, Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<ushort> AbsoluteDifferenceAdd(Vector64<short> acc, Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> AbsoluteDifferenceAdd(Vector64<uint> acc, Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<uint> AbsoluteDifferenceAdd(Vector64<int> acc, Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector128<byte> AbsoluteDifferenceAdd(Vector128<byte> acc, Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<byte> AbsoluteDifferenceAdd(Vector128<sbyte> acc, Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<short> acc, Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> AbsoluteDifferenceAdd(Vector128<uint> acc, Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<uint> AbsoluteDifferenceAdd(Vector128<int> acc, Vector128<int> left, Vector128<int> right) { throw null; }
/// <summary>
/// Vector add pairwise
///
/// Adjacent element pairs of left are summed into the lower half of the
/// result and adjacent element pairs of right into the upper half.
///
/// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 ADDP & FADDP
/// </summary>
public static Vector64<byte> AddPairwise(Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> AddPairwise(Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> AddPairwise(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<short> AddPairwise(Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<int> AddPairwise(Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector64<uint> AddPairwise(Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<float> AddPairwise(Vector64<float> left, Vector64<float> right) { throw null; }
/// <summary>
/// Vector extract from pair of vectors
///
/// The result is a window over the concatenation of left and right,
/// starting at position index within left.
///
/// For each element result[elem] = elem + index < result.Length ? left[elem + index] : right[elem + index - result.Length]
///
/// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
///
/// Corresponds to vector forms of ARM64 EXT
/// </summary>
public static Vector64<byte> ExtractVector(Vector64<byte> left, Vector64<byte> right, byte index) { throw null; }
public static Vector64<sbyte> ExtractVector(Vector64<sbyte> left, Vector64<sbyte> right, byte index) { throw null; }
public static Vector64<short> ExtractVector(Vector64<short> left, Vector64<short> right, byte index) { throw null; }
public static Vector64<ushort> ExtractVector(Vector64<ushort> left, Vector64<ushort> right, byte index) { throw null; }
public static Vector64<int> ExtractVector(Vector64<int> left, Vector64<int> right, byte index) { throw null; }
public static Vector64<uint> ExtractVector(Vector64<uint> left, Vector64<uint> right, byte index) { throw null; }
public static Vector128<byte> ExtractVector(Vector128<byte> left, Vector128<byte> right, byte index) { throw null; }
public static Vector128<sbyte> ExtractVector(Vector128<sbyte> left, Vector128<sbyte> right, byte index) { throw null; }
public static Vector128<short> ExtractVector(Vector128<short> left, Vector128<short> right, byte index) { throw null; }
public static Vector128<ushort> ExtractVector(Vector128<ushort> left, Vector128<ushort> right, byte index) { throw null; }
public static Vector128<int> ExtractVector(Vector128<int> left, Vector128<int> right, byte index) { throw null; }
public static Vector128<uint> ExtractVector(Vector128<uint> left, Vector128<uint> right, byte index) { throw null; }
public static Vector128<long> ExtractVector(Vector128<long> left, Vector128<long> right, byte index) { throw null; }
public static Vector128<ulong> ExtractVector(Vector128<ulong> left, Vector128<ulong> right, byte index) { throw null; }
public static Vector128<float> ExtractVector(Vector128<float> left, Vector128<float> right, byte index) { throw null; }
/// <summary>
/// Vector max numeric
/// Corresponds to vector forms of ARM64 FMAXNM
/// </summary>
public static Vector64<float> MaxNumeric(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MaxNumeric(Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector max pairwise
///
/// Adjacent element pairs of left fill the lower half of the result and
/// adjacent element pairs of right fill the upper half.
///
/// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 SMAXP, UMAXP & FMAXP
/// </summary>
public static Vector64<byte> MaxPairwise(Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> MaxPairwise(Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> MaxPairwise(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<short> MaxPairwise(Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> MaxPairwise(Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<int> MaxPairwise(Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector64<float> MaxPairwise(Vector64<float> left, Vector64<float> right) { throw null; }
/// <summary>
/// Vector min numeric
/// Corresponds to vector forms of ARM64 FMINNM
/// </summary>
public static Vector64<float> MinNumeric(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MinNumeric(Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector min pairwise
///
/// Adjacent element pairs of left fill the lower half of the result and
/// adjacent element pairs of right fill the upper half.
///
/// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 SMINP, UMINP & FMINP
/// </summary>
public static Vector64<byte> MinPairwise(Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> MinPairwise(Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> MinPairwise(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<short> MinPairwise(Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> MinPairwise(Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<int> MinPairwise(Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector64<float> MinPairwise(Vector64<float> left, Vector64<float> right) { throw null; }
/// <summary>
/// Vector multiply add
///
/// For each element result[elem] = acc[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 MLA
/// </summary>
public static Vector64<byte> MultiplyAdd(Vector64<byte> acc, Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> MultiplyAdd(Vector64<sbyte> acc, Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> MultiplyAdd(Vector64<ushort> acc, Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<short> MultiplyAdd(Vector64<short> acc, Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> MultiplyAdd(Vector64<uint> acc, Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<int> MultiplyAdd(Vector64<int> acc, Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector128<byte> MultiplyAdd(Vector128<byte> acc, Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> MultiplyAdd(Vector128<sbyte> acc, Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> MultiplyAdd(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<short> MultiplyAdd(Vector128<short> acc, Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> MultiplyAdd(Vector128<uint> acc, Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<int> MultiplyAdd(Vector128<int> acc, Vector128<int> left, Vector128<int> right) { throw null; }
/// <summary>
/// Vector multiply add by element
///
/// For each element result[elem] = acc[elem] + left[elem] * right
///
/// Corresponds to vector forms of ARM64 MLA
/// </summary>
public static Vector64<byte> MultiplyAdd(Vector64<byte> acc, Vector64<byte> left, byte right) { throw null; }
public static Vector64<sbyte> MultiplyAdd(Vector64<sbyte> acc, Vector64<sbyte> left, sbyte right) { throw null; }
public static Vector64<ushort> MultiplyAdd(Vector64<ushort> acc, Vector64<ushort> left, ushort right) { throw null; }
public static Vector64<short> MultiplyAdd(Vector64<short> acc, Vector64<short> left, short right) { throw null; }
public static Vector64<uint> MultiplyAdd(Vector64<uint> acc, Vector64<uint> left, uint right) { throw null; }
public static Vector64<int> MultiplyAdd(Vector64<int> acc, Vector64<int> left, int right) { throw null; }
public static Vector128<byte> MultiplyAdd(Vector128<byte> acc, Vector128<byte> left, byte right) { throw null; }
public static Vector128<sbyte> MultiplyAdd(Vector128<sbyte> acc, Vector128<sbyte> left, sbyte right) { throw null; }
public static Vector128<ushort> MultiplyAdd(Vector128<ushort> acc, Vector128<ushort> left, ushort right) { throw null; }
public static Vector128<short> MultiplyAdd(Vector128<short> acc, Vector128<short> left, short right) { throw null; }
public static Vector128<uint> MultiplyAdd(Vector128<uint> acc, Vector128<uint> left, uint right) { throw null; }
public static Vector128<int> MultiplyAdd(Vector128<int> acc, Vector128<int> left, int right) { throw null; }
/// <summary>
/// Vector multiply subtract
///
/// For each element result[elem] = acc[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 MLS
/// </summary>
public static Vector64<byte> MultiplySubtract(Vector64<byte> acc, Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> MultiplySubtract(Vector64<sbyte> acc, Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector64<ushort> MultiplySubtract(Vector64<ushort> acc, Vector64<ushort> left, Vector64<ushort> right) { throw null; }
public static Vector64<short> MultiplySubtract(Vector64<short> acc, Vector64<short> left, Vector64<short> right) { throw null; }
public static Vector64<uint> MultiplySubtract(Vector64<uint> acc, Vector64<uint> left, Vector64<uint> right) { throw null; }
public static Vector64<int> MultiplySubtract(Vector64<int> acc, Vector64<int> left, Vector64<int> right) { throw null; }
public static Vector128<byte> MultiplySubtract(Vector128<byte> acc, Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> MultiplySubtract(Vector128<sbyte> acc, Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> MultiplySubtract(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<short> MultiplySubtract(Vector128<short> acc, Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> MultiplySubtract(Vector128<uint> acc, Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<int> MultiplySubtract(Vector128<int> acc, Vector128<int> left, Vector128<int> right) { throw null; }
/// <summary>
/// Vector multiply subtract by element
///
/// For each element result[elem] = acc[elem] - left[elem] * right
///
/// Corresponds to vector forms of ARM64 MLS
/// </summary>
public static Vector64<byte> MultiplySubtract(Vector64<byte> acc, Vector64<byte> left, byte right) { throw null; }
public static Vector64<sbyte> MultiplySubtract(Vector64<sbyte> acc, Vector64<sbyte> left, sbyte right) { throw null; }
public static Vector64<ushort> MultiplySubtract(Vector64<ushort> acc, Vector64<ushort> left, ushort right) { throw null; }
public static Vector64<short> MultiplySubtract(Vector64<short> acc, Vector64<short> left, short right) { throw null; }
public static Vector64<uint> MultiplySubtract(Vector64<uint> acc, Vector64<uint> left, uint right) { throw null; }
public static Vector64<int> MultiplySubtract(Vector64<int> acc, Vector64<int> left, int right) { throw null; }
public static Vector128<byte> MultiplySubtract(Vector128<byte> acc, Vector128<byte> left, byte right) { throw null; }
public static Vector128<sbyte> MultiplySubtract(Vector128<sbyte> acc, Vector128<sbyte> left, sbyte right) { throw null; }
public static Vector128<ushort> MultiplySubtract(Vector128<ushort> acc, Vector128<ushort> left, ushort right) { throw null; }
public static Vector128<short> MultiplySubtract(Vector128<short> acc, Vector128<short> left, short right) { throw null; }
public static Vector128<uint> MultiplySubtract(Vector128<uint> acc, Vector128<uint> left, uint right) { throw null; }
public static Vector128<int> MultiplySubtract(Vector128<int> acc, Vector128<int> left, int right) { throw null; }
/// <summary>
/// Vector fused multiply add
///
/// For each element result[elem] = acc[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLA
/// </summary>
public static Vector64<float> FusedMultiplyAdd(Vector64<float> acc, Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> FusedMultiplyAdd(Vector128<float> acc, Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector fused multiply subtract
///
/// For each element result[elem] = acc[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLS
/// </summary>
public static Vector64<float> FusedMultiplySubtract(Vector64<float> acc, Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> FusedMultiplySubtract(Vector128<float> acc, Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector polynomial multiply
/// Corresponds to vector forms of ARM64 PMUL
/// </summary>
public static Vector64<byte> PolynomialMultiply(Vector64<byte> left, Vector64<byte> right) { throw null; }
public static Vector64<sbyte> PolynomialMultiply(Vector64<sbyte> left, Vector64<sbyte> right) { throw null; }
public static Vector128<byte> PolynomialMultiply(Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> PolynomialMultiply(Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
/// <summary>
/// Vector reciprocal estimate
///
/// See FRECPE docs
///
/// Corresponds to vector forms of ARM64 FRECPE
/// </summary>
public static Vector64<float> ReciprocalEstimate(Vector64<float> value) { throw null; }
public static Vector128<float> ReciprocalEstimate(Vector128<float> value) { throw null; }
/// <summary>
/// Vector reciprocal step
///
/// See FRECPS docs
///
/// Corresponds to vector forms of ARM64 FRECPS
/// </summary>
public static Vector64<float> ReciprocalStep(Vector64<float> left, Vector64<float> right, byte index) { throw null; }
public static Vector128<float> ReciprocalStep(Vector128<float> left, Vector128<float> right, byte index) { throw null; }
/// <summary>
/// Vector reciprocal square root estimate
///
/// See FRSQRTE docs
///
/// Corresponds to vector forms of ARM64 FRSQRTE
/// </summary>
public static Vector64<float> ReciprocalSquareRootEstimate(Vector64<float> value) { throw null; }
public static Vector128<float> ReciprocalSquareRootEstimate(Vector128<float> value) { throw null; }
/// <summary>
/// Vector reciprocal square root step
///
/// See FRSQRTS docs
///
/// NOTE(review): the index parameter has no obvious counterpart in the
/// two-operand vector form of FRSQRTS - confirm whether it is intended.
///
/// Corresponds to vector forms of ARM64 FRSQRTS
/// </summary>
public static Vector64<float> ReciprocalSquareRootStep(Vector64<float> left, Vector64<float> right, byte index) { throw null; }
public static Vector128<float> ReciprocalSquareRootStep(Vector128<float> left, Vector128<float> right, byte index) { throw null; }
/// <summary>
/// Vector reverse element bytes
/// Corresponds to vector forms of ARM64 REV16, REV32, REV64
/// </summary>
public static Vector64<ushort> ReverseElementBytes(Vector64<ushort> value) { throw null; }
public static Vector64<short> ReverseElementBytes(Vector64<short> value) { throw null; }
public static Vector64<uint> ReverseElementBytes(Vector64<uint> value) { throw null; }
public static Vector64<int> ReverseElementBytes(Vector64<int> value) { throw null; }
public static Vector64<float> ReverseElementBytes(Vector64<float> value) { throw null; }
public static Vector128<ushort> ReverseElementBytes(Vector128<ushort> value) { throw null; }
public static Vector128<short> ReverseElementBytes(Vector128<short> value) { throw null; }
public static Vector128<uint> ReverseElementBytes(Vector128<uint> value) { throw null; }
public static Vector128<int> ReverseElementBytes(Vector128<int> value) { throw null; }
public static Vector128<ulong> ReverseElementBytes(Vector128<ulong> value) { throw null; }
public static Vector128<long> ReverseElementBytes(Vector128<long> value) { throw null; }
public static Vector128<float> ReverseElementBytes(Vector128<float> value) { throw null; }
/// <summary>
/// Floating-point multiply-accumulate intrinsics documented as mapping to the
/// ARM32 VMLA and VMLS instructions.
/// </summary>
public static class Arm32
{
/// <summary>
/// Reports whether the intrinsics in this class can be used.
/// </summary>
public static bool IsSupported { get { throw null; } }
/// <summary>
/// Vector multiply add
///
/// For each element result[elem] = acc[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM32 VMLA
/// </summary>
public static Vector64<float> MultiplyAdd(Vector64<float> acc, Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MultiplyAdd(Vector128<float> acc, Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector multiply add by element
///
/// For each element result[elem] = acc[elem] + left[elem] * right
///
/// Corresponds to vector forms of ARM32 VMLA
/// </summary>
public static Vector64<float> MultiplyAdd(Vector64<float> acc, Vector64<float> left, float right) { throw null; }
public static Vector128<float> MultiplyAdd(Vector128<float> acc, Vector128<float> left, float right) { throw null; }
/// <summary>
/// Vector multiply subtract
///
/// For each element result[elem] = acc[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM32 VMLS
/// </summary>
public static Vector64<float> MultiplySubtract(Vector64<float> acc, Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MultiplySubtract(Vector128<float> acc, Vector128<float> left, Vector128<float> right) { throw null; }
/// <summary>
/// Vector multiply subtract by element
///
/// For each element result[elem] = acc[elem] - left[elem] * right
///
/// Corresponds to vector forms of ARM32 VMLS
/// </summary>
public static Vector64<float> MultiplySubtract(Vector64<float> acc, Vector64<float> left, float right) { throw null; }
public static Vector128<float> MultiplySubtract(Vector128<float> acc, Vector128<float> left, float right) { throw null; }
}
/// <summary>
/// Proposed API surface whose documented instruction mappings are ARM64 forms
/// (for example FACGE, ADDP, ADDV, FMAXNMP, FMLA, FMULX, FRECPE and RBIT).
/// Every member is a reference stub whose body is <c>throw null</c>.
/// </summary>
public static class Arm64
{
/// <summary>
/// Support query for this group of intrinsics.
/// NOTE(review): presumably reports whether the ARM64-only instructions are available - the stub body does not show this, confirm.
/// </summary>
public static bool IsSupported { get { throw null; } }
/// <summary>
/// Vector CompareGreaterThanOrEqual
/// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
/// Corresponds to vector forms of ARM64 FACGE
/// </summary>
public static Vector128<double> AbsoluteCompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector CompareGreaterThan
///
/// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
///
/// Corresponds to vector forms of ARM64 FACGT
/// </summary>
public static Vector128<double> AbsoluteCompareGreaterThan(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector absolute difference
/// Corresponds to vector forms of ARM64 SABD, UABD & FABD
/// </summary>
public static Vector128<double> AbsoluteDifference(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector add pairwise
/// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
/// Corresponds to vector forms of ARM64 ADDP & FADDP
/// </summary>
public static Vector128<byte> AddPairwise(Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> AddPairwise(Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> AddPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<short> AddPairwise(Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<long> AddPairwise(Vector128<long> left, Vector128<long> right) { throw null; }
public static Vector128<ulong> AddPairwise(Vector128<ulong> left, Vector128<ulong> right) { throw null; }
public static Vector128<float> AddPairwise(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> AddPairwise(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector extract from pair of vectors
/// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
///
/// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
///
/// Corresponds to vector forms of ARM64 EXT
/// </summary>
public static Vector128<double> ExtractVector(Vector128<double> left, Vector128<double> right, byte index) { throw null; }
/// <summary>
/// Vector add across vector elements
/// Corresponds to vector forms of ARM64 ADDV
/// </summary>
public static byte AddAcross(Vector64<byte> value) { throw null; }
public static sbyte AddAcross(Vector64<sbyte> value) { throw null; }
public static ushort AddAcross(Vector64<ushort> value) { throw null; }
public static short AddAcross(Vector64<short> value) { throw null; }
public static uint AddAcross(Vector64<uint> value) { throw null; }
public static int AddAcross(Vector64<int> value) { throw null; }
public static byte AddAcross(Vector128<byte> value) { throw null; }
public static sbyte AddAcross(Vector128<sbyte> value) { throw null; }
public static ushort AddAcross(Vector128<ushort> value) { throw null; }
public static short AddAcross(Vector128<short> value) { throw null; }
public static uint AddAcross(Vector128<uint> value) { throw null; }
public static int AddAcross(Vector128<int> value) { throw null; }
/// <summary>
/// Vector max numeric
/// Corresponds to vector forms of ARM64 FMAXNM
/// </summary>
public static Vector128<double> MaxNumeric(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector max numeric pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 FMAXNMP
/// </summary>
public static Vector64<float> MaxNumericPairwise(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MaxNumericPairwise(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> MaxNumericPairwise(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector max numeric across
///
/// result = max(value[0], ... , value[length -1])
///
/// Corresponds to vector forms of ARM64 FMAXNMV
/// </summary>
public static float MaxNumericAcross(Vector128<float> value) { throw null; }
/// <summary>
/// Vector max pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 SMAXP, UMAXP & FMAXP
/// </summary>
public static Vector128<byte> MaxPairwise(Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> MaxPairwise(Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> MaxPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<short> MaxPairwise(Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> MaxPairwise(Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<int> MaxPairwise(Vector128<int> left, Vector128<int> right) { throw null; }
public static Vector128<float> MaxPairwise(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> MaxPairwise(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector max across
///
/// result = max(value[0], ... , value[length -1])
///
/// Corresponds to vector forms of ARM64 SMAXV, UMAXV & FMAXV
/// </summary>
public static byte MaxAcross(Vector64<byte> value) { throw null; }
public static sbyte MaxAcross(Vector64<sbyte> value) { throw null; }
public static ushort MaxAcross(Vector64<ushort> value) { throw null; }
public static short MaxAcross(Vector64<short> value) { throw null; }
public static uint MaxAcross(Vector64<uint> value) { throw null; }
public static int MaxAcross(Vector64<int> value) { throw null; }
public static float MaxAcross(Vector64<float> value) { throw null; }
public static byte MaxAcross(Vector128<byte> value) { throw null; }
public static sbyte MaxAcross(Vector128<sbyte> value) { throw null; }
public static ushort MaxAcross(Vector128<ushort> value) { throw null; }
public static short MaxAcross(Vector128<short> value) { throw null; }
public static uint MaxAcross(Vector128<uint> value) { throw null; }
public static int MaxAcross(Vector128<int> value) { throw null; }
public static ulong MaxAcross(Vector128<ulong> value) { throw null; }
public static long MaxAcross(Vector128<long> value) { throw null; }
public static float MaxAcross(Vector128<float> value) { throw null; }
public static double MaxAcross(Vector128<double> value) { throw null; }
/// <summary>
/// Vector min numeric
/// Corresponds to vector forms of ARM64 FMINNM
/// </summary>
public static Vector128<double> MinNumeric(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector min numeric pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 FMINNMP
/// </summary>
public static Vector64<float> MinNumericPairwise(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MinNumericPairwise(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> MinNumericPairwise(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector min numeric across
///
/// result = min(value[0], ... , value[length -1])
///
/// Corresponds to vector forms of ARM64 FMINNMV
/// </summary>
public static float MinNumericAcross(Vector128<float> value) { throw null; }
/// <summary>
/// Vector min pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 SMINP, UMINP & FMINP
/// </summary>
public static Vector128<byte> MinPairwise(Vector128<byte> left, Vector128<byte> right) { throw null; }
public static Vector128<sbyte> MinPairwise(Vector128<sbyte> left, Vector128<sbyte> right) { throw null; }
public static Vector128<ushort> MinPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
public static Vector128<short> MinPairwise(Vector128<short> left, Vector128<short> right) { throw null; }
public static Vector128<uint> MinPairwise(Vector128<uint> left, Vector128<uint> right) { throw null; }
public static Vector128<int> MinPairwise(Vector128<int> left, Vector128<int> right) { throw null; }
public static Vector128<float> MinPairwise(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> MinPairwise(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector min across
///
/// result = min(value[0], ... , value[length -1])
///
/// Corresponds to vector forms of ARM64 SMINV, UMINV & FMINV
/// </summary>
public static byte MinAcross(Vector64<byte> value) { throw null; }
public static sbyte MinAcross(Vector64<sbyte> value) { throw null; }
public static ushort MinAcross(Vector64<ushort> value) { throw null; }
public static short MinAcross(Vector64<short> value) { throw null; }
public static uint MinAcross(Vector64<uint> value) { throw null; }
public static int MinAcross(Vector64<int> value) { throw null; }
public static float MinAcross(Vector64<float> value) { throw null; }
public static byte MinAcross(Vector128<byte> value) { throw null; }
public static sbyte MinAcross(Vector128<sbyte> value) { throw null; }
public static ushort MinAcross(Vector128<ushort> value) { throw null; }
public static short MinAcross(Vector128<short> value) { throw null; }
public static uint MinAcross(Vector128<uint> value) { throw null; }
public static int MinAcross(Vector128<int> value) { throw null; }
public static float MinAcross(Vector128<float> value) { throw null; }
public static double MinAcross(Vector128<double> value) { throw null; }
/// <summary>
/// Vector fused multiply add
///
/// For each element result[elem] = acc[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLA
/// </summary>
public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector fused multiply add by element
///
/// For each element result[elem] = acc[elem] + left[elem] * right
///
/// Corresponds to vector forms of ARM64 FMLA
/// </summary>
public static Vector64<float> FusedMultiplyAdd(Vector64<float> acc, Vector64<float> left, float right) { throw null; }
public static Vector128<float> FusedMultiplyAdd(Vector128<float> acc, Vector128<float> left, float right) { throw null; }
public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, double right) { throw null; }
/// <summary>
/// Vector fused multiply subtract
///
/// For each element result[elem] = acc[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLS
/// </summary>
public static Vector128<double> FusedMultiplySubtract(Vector128<double> acc, Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector fused multiply subtract by element
///
/// For each element result[elem] = acc[elem] - left[elem] * right
///
/// Corresponds to vector forms of ARM64 FMLS
/// </summary>
public static Vector64<float> FusedMultiplySubtract(Vector64<float> acc, Vector64<float> left, float right) { throw null; }
public static Vector128<float> FusedMultiplySubtract(Vector128<float> acc, Vector128<float> left, float right) { throw null; }
public static Vector128<double> FusedMultiplySubtract(Vector128<double> acc, Vector128<double> left, double right) { throw null; }
/// <summary>
/// Vector multiply extend
///
/// For each element result[elem] = left[elem] * right[elem]
/// Handle extend special cases zero and infinite. FMULX
///
/// Corresponds to vector forms of ARM64 FMULX
/// </summary>
public static Vector64<float> MultiplyExtend(Vector64<float> left, Vector64<float> right) { throw null; }
public static Vector128<float> MultiplyExtend(Vector128<float> left, Vector128<float> right) { throw null; }
public static Vector128<double> MultiplyExtend(Vector128<double> left, Vector128<double> right) { throw null; }
/// <summary>
/// Vector multiply extend by element
///
/// For each element result[elem] = left[elem] * right
/// Handle extend special cases zero and infinite. FMULX
///
/// Corresponds to vector forms of ARM64 FMULX
/// </summary>
public static Vector64<float> MultiplyExtend(Vector64<float> left, float right) { throw null; }
public static Vector128<float> MultiplyExtend(Vector128<float> left, float right) { throw null; }
public static Vector128<double> MultiplyExtend(Vector128<double> left, double right) { throw null; }
/// <summary>
/// Vector reciprocal estimate
///
/// See FRECPE docs
///
/// Corresponds to vector forms of ARM64 FRECPE
/// </summary>
public static Vector128<double> ReciprocalEstimate(Vector128<double> value) { throw null; }
/// <summary>
/// Vector reciprocal step
///
/// See FRECPS docs
///
/// Corresponds to vector forms of ARM64 FRECPS
/// </summary>
// NOTE(review): the 'index' parameter looks like a copy-paste leftover from ExtractVector; FRECPS appears to take only two vector operands - confirm before approving this signature.
public static Vector128<double> ReciprocalStep(Vector128<double> left, Vector128<double> right, byte index) { throw null; }
/// <summary>
/// Vector reciprocal square root estimate
///
/// See FRSQRTE docs
///
/// Corresponds to vector forms of ARM64 FRSQRTE
/// </summary>
public static Vector128<double> ReciprocalSquareRootEstimate(Vector128<double> value) { throw null; }
/// <summary>
/// Vector reciprocal square root step
///
/// See FRSQRTS docs
///
/// Corresponds to vector forms of ARM64 FRSQRTS
/// </summary>
// NOTE(review): the 'index' parameter looks like a copy-paste leftover from ExtractVector; FRSQRTS appears to take only two vector operands - confirm before approving this signature.
public static Vector128<double> ReciprocalSquareRootStep(Vector128<double> left, Vector128<double> right, byte index) { throw null; }
/// <summary>
/// Vector reverse byte bits
/// Corresponds to vector forms of ARM64 RBIT
/// </summary>
public static Vector64<byte> ReverseElementBits(Vector64<byte> value) { throw null; }
public static Vector64<sbyte> ReverseElementBits(Vector64<sbyte> value) { throw null; }
public static Vector128<byte> ReverseElementBits(Vector128<byte> value) { throw null; }
public static Vector128<sbyte> ReverseElementBits(Vector128<sbyte> value) { throw null; }
}
}
}
alright, that should be the final list.
Thanks @TamarChristinaArm.
I'll give this a look over either tonight or tomorrow and get the top post updated 😄
@tannergooding What is the next step here for getting this reviewed? Is it ready to go?
cc @TamarChristinaArm @echesakovMSFT @CarolEidt @sdmaclea
What is the next step here for getting this reviewed? Is it ready to go?
I've updated the original post with @TamarChristinaArm's updated surface.
The next step is just ensuring we get a dedicated review session with @terrajobst. I'll bring it up again today and see if we can drive down a date.
I've asked a question internally about MLA and will update this tomorrow if needed when I get an answer.
@TamarChristinaArm, was this determining if VMLA (floating-point)
for arm32 is a fused operation? I believe that ended up being my only question and if that dictated them needing to be separate.
If so, it doesn't look like A32 has its own "fused" operation and we should remove those APIs from the general list (in the proposed surface, they look duplicated in both AdvSimd
and AdvSimd.Arm64
).
I've asked a question internally about MLA and will update this tomorrow if needed when I get an answer.
@TamarChristinaArm, was this determining if
VMLA (floating-point)
for arm32 is a fused operation? I believe that ended up being my only question and if that dictated them needing to be separate. If so, it doesn't look like A32 has its own "fused" operation and we should remove those APIs from the general list (in the proposed surface, they look duplicated in both
AdvSimd
and AdvSimd.Arm64
).
That's where indeed things got a bit confusing, on A32 the fused version of the instructions are called VFMA
, but it doesn't have a fused by element version of it.
A64 however only has fused MLA, and so doesn't have the non-fused variant, but does have a by element version of the fused variant.
This is why the split in definition above. The MLA is still useful on A32 if you don't care about the rounding because it does have a by element version then.
I will take this issue if no one is working on it.
I'm not working on it, AddAcross
and ReverseElementBits
I already did since those intersected with my list but didn't work on the rest.
@TamarChristinaArm @tannergooding @CarolEidt
I have a question - for MaxNumericPairwise
and MinNumericPairwise
there is no overloads that operate on one vector (i.e. Vector64<float> MaxNumericPairwise(Vector64<float> value)
) even though there is a c++ intrinsic float32_t vpmaxnms_f32 (float32x2_t a)
that maps to FMAXNMP Sd,Vn.2S
instruction. Same for the Vector128<double> MaxNumericPairwise(Vector128<double> value)
.
Is it intentional?
It might look odd if we had MaxNumericPairwise
that has overloads with 1 and 2 operands. Should we instead add MaxNumericAcross(Vector64<float> value)
that maps to FMAXNMP Sd,Vn.2S
?
I think the same could be done for MaxAcross(Vector64<float> value)
and FMAXP Sd,Vn.2S
To me it seems like these should have the same name, and have overloads with one or two operands, since the fundamental operation is the same. It's a bit weird because the operation is always pairwise, but the number of operations & results isn't always consistent for the one operand and two operand case, if I read it correctly (i.e. the one-operand form always produces a single result, while the two-operand case always operates on each pair in the concatenated vector), but that's simply a characteristic of the architecture that we're exposing.
@echesakovMSFT Thanks for reminding me, I was waiting for the API review to ask, but yes, I personally think the pair
single register versions should instead be under the reduction intrinsics.
So instead of having a single register MaxNumericPairwise(Vector64<float> value)
it should be under MaxAcross
.
In C we defined them under vmaxv
(Which I didn't here waiting to solicit feedback) and we put them under a new made up intrinsics name since we couldn't overload it. To me it seems more natural to add these single register pairwise operations as reductions.
Like @CarolEidt mentioned the operations aren't exactly the same if we overload MaxNumericPairwise
, and also I think we'd be breaking the convention we've used until now for the operations working on the scalar part of the SIMD file. So shouldn't the single register version be MaxNumericPairwiseScalar
then?
Or do we want both like in C? overload the reductions and the pair instructions?
So shouldn't the single register version be
MaxNumericPairwiseScalar
then?
This is my understanding of the conventions we have followed thus far. The instruction is FMAXNMP (scalar)
and functionally it is a scalar (for which we have always used the Scalar
postfix in the name).
The confusion likely comes because, so far, scalar just means "lowest element" and so for something like Vector64<float> AddScalar(Vector64<float> lhs, Vector64<float> rhs)
, it adds Element 0 from lhs with Element 0 from rhs and returns it in Element 0 of the result.
In this case, the signature would be Vector64<float> MaxNumericPairwiseScalar(Vector64<float> value)
which would operate on the scalar pair (Element 0 and Element 1) in value and return it in Element 0 of the result.
So, it is still scalar, it is just that we are "pair-wise" rather than "element-wise"
Also, to this note:
Vector64<float> AddScalar(Vector64<float> lhs, Vector64<float> rhs)
I believe the proposed signatures for things like the following are incorrect:
public static byte MinAcross(Vector64<byte> value) { throw null; }
public static sbyte MinAcross(Vector64<sbyte> value) { throw null; }
The instructions return the result in a SIMD&FP register, not in a general purpose register. So the result should remain Vector64<T>
. The purpose of the scalar variants is to remove the need to continuously transition between "scalar" code and "vector" code. If it is an intrinsic that operates on or returns a SIMD register (even if it only treats that value as a scalar), it takes and returns a Vector*<T>
.
float
for simple ops (like addition
) but must use HWIntrinsics for other things (like reciprocal
) (and it maintains consistency with the x86 intrinsics from an API perspective)
I believe the proposed signatures for things like the following are incorrect:
Yes, the reductions need to be updated. They were based on my understanding at the time, when I thought you could assign types to multiple register classes.
That said I still think the scalar pairwise operations should be reductions. I think the fact that we exposed them as extra intrinsics as well in C was a mistake. But If we expose them as Scalar operations here I think they should also be an overload for the reductions.
I agree that functionally speaking it could be exposed as either Vector64<float> MaxNumericAcross(Vector64<float> value)
or as Vector64<float> MaxNumericPairwiseScalar(Vector64<float> value)
.
Which is ultimately chosen will probably come down to API review. I have a slight preference for the latter since the actual instruction is FMAXNMP (scalar)
, but the former may be easier for some to understand.
I imagine with the former, a misunderstanding might be whether the result is only in element 0 or if it is duplicated throughout (but the same applies for the other "across" methods as well).
In my opinion, something that operates on multiple elements within a vector should not have the Scalar
suffix. If that's the naming chosen, then I would reiterate my request for a clear explanation of what we mean by that suffix.
I agree with Carol - in my opinion, exposing FMAXNMP Sd, Vn.2S
as MaxNumericAcross(Vector64<float> value)
would introduce less confusion than going with MaxNumericPairwiseScalar
.
And as it was mentioned above we would need to rename all AddAcross
and other reduction functions, which wouldn't add value to understanding what these functions are doing - it's already clear from the name.
I think we should use Scalar
suffix only to distinguish overloads of an intrinsic that have same set of argument in its "Vector" version.
If that's the naming chosen, then I would reiterate my request for a clear explanation of what we mean by that suffix.
Scalar means it operates on a single "element", its what "element" is that can vary.
In the majority case an "element" is just float
or int
or double
For example: Vector128<float>
is a vector of float
scalars.
In the pairwise case, the "element" is pairs of values (e.g. float, float
). For example: Vector128<float>
when talking about Pairwise, is a vector of (float, float)
scalars.
It is still a scalar and the definition hasn't changed. It still only operates on one element (one pair).
It is how you have to infer this when referring to the actual ARM instructions (such as FMINP (scalar)
which is Floating-point Minimum of Pair of elements (scalar)
).
horizontal
(pairwise
) instructions that only take one input.
If we had the proper support for it, the technically correct signature would be: Vector64<float> MaxNumericPairwiseScalar(Vector64<(float, float)> value)
; but that is a lot of complexity that is likely not worthwhile.
It's also worth noting users familiar with ARM assembly would likely not look for it under Across
since there is no across instruction for it, even if they are functionally equivalent.
@tannergooding - I disagree. A pair is a vector not a scalar, albeit a smaller vector. I think the Scalar
suffix is confusing enough without having it refer to pairs as well. I really think that we need some API design review of the naming conventions for this.
Scalar suffix could be reserved for cases which consume a vector, but produce a scalar return value. As opposed to a vector<scalar>
return value.
If we had the proper support for it, the technically correct signature would be: Vector64<float> MaxNumericPairwiseScalar(Vector64<(float, float)> value); but that is a lot of complexity that is likely not worthwhile.
I disagree, if we had proper support I would have expected float MaxNumericPairwise(Vector64<float>)
as its intention is to just add two "pairs" inside a single vector, or rather two adjacent entries. https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics?search=vpadds_f32
Broadly speaking the ISA splits instructions into three classes Vector
, Scalar
and By Element
.
So far we've adhered to adding the Scalar
suffix to instructions that the ISA quite literally placed in the Scalar
category, i.e. the ones where in the ArmARM it has (scalar)
after the instruction.
I think that's the clearest distinction. I was mistaken before when I suggested renaming the reductions (deleted the comment since I accidentally made it from the wrong account).
It's also worth noting users familiar with ARM assembly would likely not look for it under Across since there is no across instruction for it, even if they are functionally equivalent.
All of these names already deviate somewhat from the mnemonic though. I thought the explicit intention was to make more descriptive names for them. I would expect a user to search the docs for the mnemonic they want to find what to use, isn't that partially why we're adding them to the docs?
Scalar suffix could be reserved for cases which consume a vector, but produce a scalar return value. As opposed to a vector<scalar>
return value.
It is too late for that, x86 already has shipped and has things like Vector128<float> Sse.AddScalar(Vector128<float> left, Vector128<float> right)
On x86, this is important because x86 preserves the upper bits and returning or taking float
would mean the entire value isn't preserved.
On ARM, the upper bits are zeroed, but the API design decisions would then be inconsistent.
I really think that we need some API design review of the naming conventions for this.
I agree.
A pair is a vector not a scalar, albeit a smaller vector. I think the Scalar suffix is confusing enough without having it refer to pairs as well.
I think that depends on how you look at it. We may be able to treat it as "across" in this particular case (Vector64<float>
) but we wouldn't be able to treat it as across
for Vector64<half>
(the entire vector isn't consumed, just the lowest scalar pair)
I thought the explicit intention was to make more descriptive names for them. I would expect a user to search the docs for the mnemonic they want to find what to use, isn't that partially why we're adding them to the docs
Yes, but we have also not had to deviate from the descriptions/names so far and have fallen back to them in past API reviews when naming concerns came up.
I believe the case of the half-precision variant
for FMINNMP
will likely be a big deciding factor in the name. It can't be named Across
(or AcrossScalar
) and has the same issue (it operates on a scalar pair and returns a scalar value).
I think that depends on how you look at it. We may be able to treat it as "across" in this particular case (Vector64<float>
) but we wouldn't be able to treat it as across for Vector64<half> (the entire vector isn't consumed, just the lowest scalar pair)
true, FP16 will add a lot of confusion to this story. My preference is to have both. Is there any downside to that? Operationally and logically it's sound for float
and double
.
@TamarChristinaArm Just to confirm you mean both intrinsics
c#
Vector64<float> MaxNumericAcross(Vector64<float> value)
Vector64<float> MaxNumericPairwiseScalar(Vector64<float> value)
mapping to MAXNMP
?
@echesakovMSFT Yeah exactly. Though the first one should be MaxAcross
to match the rest. (actually wonder if we need the Numeric
in the pair operations but that's a different topic)
It can't be MaxAcross
, the floating-point versions have both Max*
and MaxNumeric*
and they are different.
The former propagates NaN and the latter does not (each is compliant with a different IEEE 754 operation).
@TamarChristinaArm I thought MaxNumericAcross
maps to FMAXNMV
for Vector128<float> value
? So it would be logical to name FMAXNMP
the same way?
@tannergooding
I believe the case of the half-precision variant for FMINNMP will likely be a big deciding factor in the name. It can't be named Across (or AcrossScalar) and has the same issue (it operates on a scalar pair and returns a scalar value).
I am not sure if this is true. I don't see fmaxnmp
that operates on float16x2_t
(is there even a type like this?) meaning it operates on full 64-bit vector register and returns scalar float16
.
If I were to define one it would be
c#
Vector64<float16> MaxNumericAcross(Vector64<float16> value)
@echesakovMSFT @tannergooding You're right, I forgot that MaxNumericAcross
maps to FMAXNMV
.
So you were correct with
Vector64<float> MaxNumericAcross(Vector64<float> value)
Vector64<float> MaxNumericPairwiseScalar(Vector64<float> value)
MaxAcross
should be FMAXV
.
I am not sure if this is true. I don't see fmaxnmp that operates on float16x2_t (is there even a type like this?) meaning it operates on full 64-bit vector register and returns scalar float16.
No we don't have a float16x2_t
but the size modifier on the instruction is .2H
so it only consumes the lower two elements.
@echesakovMSFT, I don't believe it is on the ARM Neon Intrinsics page (possibly because it is an ARMv8.2 instruction).
It is, however, detailed in the architecture manual (including confirming the operation and instruction) and is different from the non-scalar version which operates on the full vector.
@TamarChristinaArm @tannergooding I see it now in the ISA manual, thanks
@echesakovMSFT, I don't believe it is on the ARM Neon Intrinsics page (possibly because it is an ARMv8.2 instruction).
The neon pages go all the way to Armv8.6, I think it's just we never defined it for ACLE. not sure why not.
Another question. For multiply-add and fused multiply-add there are by element and vector forms. For example,
c#
static Vector64<float> FusedMultiplyAdd(Vector64<float> acc, Vector64<float> left, Vector64<float> right);
static Vector64<float> FusedMultiplyAdd(Vector64<float> acc, Vector64<float> left, float right);
The last one is implemented with FMLA <Vd>.<T>, <Vn>.<T>, <Vm>.<Ts>[<index>]
where index is 0, right?
Would someone ever need the following one?
c#
static Vector64<float> FusedMultiplyAddByElement(Vector64<float> acc, Vector64<float> left, Vector64<float> right, byte index);
@echesakovMSFT, looks like it functionally does: op3[..] + op1[..] * op2[index]
(where ..
is from 0 to Count for the vector version and just for 0 for the scalar version).
acc + left * right
-- the encoded operands and the order listed in the above function differ.
I would say it is a reasonable thing to need and avoids the need to permute the value throughout a register.
It would be particularly useful when you have several constants you are using and you want to reduce the total number of registers being consumed.
The last one is implemented with FMLA <Vd>.<T>, <Vn>.<T>, <Vm>.<Ts>[<index>] where index is 0, right?
Yup that's correct @echesakovMSFT
The last one is implemented with FMLA <Vd>.<T>, <Vn>.<T>, <Vm>.<Ts>[<index>] where index is 0, right?
Why are we exposing one that hardcodes the index as zero, rather than exposing the full functionality of the instruction (I don't see an instruction that does so, just the one that takes an index)?
The last one is implemented with FMLA <Vd>.<T>, <Vn>.<T>, <Vm>.<Ts>[<index>] where index is 0, right?
Why are we exposing one that hardcodes the index as zero, rather than exposing the full functionality of the instruction (I don't see an instruction that does so, just the one that takes an index)?
It's a convenience function for an often used function. It doesn't replace the index one it's just in addition to. At the time I wrote these I noticed we didn't expose any of the index
variants so I didn't add them as I only updated the given list to fit the new scheme.
As for why it's useful: otherwise you'd have to create the vector type first, insert the element, and rely on the optimizers to optimize the vector creation away.
In C this would be the difference between
float32x4_t f(float32x4_t a, float32x4_t b, float32_t c)
{
return vfmaq_n_f32 (a, b, c);
}
float32x4_t g(float32x4_t a, float32x4_t b, float32_t c)
{
float32x4_t tmp;
tmp = vsetq_lane_f32 (c, tmp, 0);
return vfmaq_laneq_f32 (a, b, tmp, 0);
}
For the latter, we have Vector128.CreateScalarUnsafe
which is basically treated as a "nop" for types (like float
/double
) which are already in the correct register type.
Right, but that's somewhat harder to find isn't it? Ultimately I also think intrinsics should make things easy and exposing that overload or any of the _n_
ones in ACLE make them easier to use.
Right, but that's somewhat harder to find isn't it
I think it's just a question of getting familiar with the APIs. It's the only way to do such operations with the x86 APIs and is likely the pattern that will get the most scrutiny and optimizations (since it is more generally applicable and not specific to a particular intrinsic, instruction, or API).
I also think intrinsics should make things easy and exposing that overload or any of the _n_ ones in ACLE make them easier to use.
I don't disagree. There are likely several scenarios where helper methods for common patterns may be beneficial. However, we also have only exposed a very limited number of helper methods so far and it would need to be something we take through API review and consider separately from the mainline API (most of these helper methods are trivial for users to write in terms of the "core" API).
Fair enough.
I think it's just a question of getting familiar with the APIs. It's the only way to do such operations with the x86 APIs and is likely the pattern that will get the most scrutiny and optimizations (since it is more generally applicable and not specific to a particular intrinsic, instruction, or API).
That does somewhat concern me. They are platform intrinsics after all, so whether or not something is available on x86 shouldn't factor in.
so whether or not something is available on x86 shouldn't factor in
x86 likewise has "scalar" instructions and could have exposed APIs that directly took a float
; but it was determined to be better to have a pattern based approach around Vector128.CreateScalarUnsafe
and Vector128.ToScalar
which allows you to efficiently do this in a platform agnostic way. It significantly cut down on the number of overloads we needed to expose for these scalar APIs and still ensures efficient codegen.
There are a few platform agnostic helper methods located in the Vector64
/Vector128
/Vector256
classes (they exist separately from Vector64<T>
/Vector128<T>
/Vector256<T>
). They expose a number of APIs which allow you to interact with the types even if hardware acceleration isn't available (which can be useful for a software fallback and debugging purposes). The APIs exposed are:
- `As` and `As*` (e.g. `AsByte`, `AsDouble`, etc). These allow a reinterpret cast from a type `T` to a type `U` and are functionally a nop.
- `Create`. These allow constructing a vector of type `T`. There are both "broadcast" and per element initializers.
- `CreateScalar`. Allows constructing a vector of type `T` where the lowest element is set and the upper elements are `0`.
- `CreateScalarUnsafe`. Allows constructing a vector of type `T` where the lowest element is set and the upper elements are non-deterministic (this allows conversion from `float` to `Vector*<float>` at zero cost since they both live in the same register kind; for example).
- `GetElement` and `WithElement`. Allows getting/setting the given element of the vector (valid indices are from 0 to Count).
- `GetLower`/`GetUpper` and `WithLower`/`WithUpper`. Allows getting/setting the upper/lower `Vector` (`Vector128<T>` is "comprised" of two `Vector64<T>`, for example).
- `ToScalar`. Allows conversion from `Vector128<T>` to `T`. For types like `float`, this can be a nop.
- `ToVector256`. Allows upcasting from `Vector64<T>`/`Vector128<T>` to `Vector128<T>`/`Vector256<T>` (respectively) with explicit zeroing of the upper bits.
- `ToVector256Unsafe`. Allows upcasting from `Vector64<T>`/`Vector128<T>` to `Vector128<T>`/`Vector256<T>` (respectively) leaving the upper bits "non-deterministic" (allows a nop on some platforms).

These functions are common and necessary even for the software fallback case to be able to correctly interact with the types. They can also have varying implementations based on what ISAs are available (e.g. on x86, you may want to use `broadcast`, `permute`, or `shuffle` to create a vector with all elements set to a given value; depending on what hardware functionality is available).
As per https://github.com/dotnet/runtime/pull/31899#discussion_r376563233, we should discuss the ordering of parameters for FMA when those are reviewed.
The Min/MaxNumeric
functions should probably be Min/MaxNumber
to match the IEEE and instruction names (and the proposed names for the equivalent Math/MathF
functions).
I think a couple of the *Estimate
functions are meant to be *Step
based on the underlying instruction name?
- `MaxPairwise`/`MinPairwise` should also include forms for `Vector128`.
- [Should the parameter names of] `ReciprocalStep` and `ReciprocalSquareRootStep` be something more specific than `left` and `right`?
- `ReverseElementBytes` needs more work, as well as clarity on support.
- […] `Arm32` because it's not going to be implemented for .NET 5.
- `ExtractVector` shouldn't have overloads for `float` and `double`; it could end up silently modifying/normalizing/corrupting the floating point types.
- `MaxNumericPairwiseScalar` should be `MaxNumberPairwiseScalar`. Some folks raised concerns around `PairwiseScalar` being confusing, but it matches the ISA name and we can't think of a better name.

```C#
namespace System.Runtime.Intrinsics.Arm
{
public partial class AdvSimd
{
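/// <summary>
/// Vector CompareGreaterThanOrEqual
/// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
/// Corresponds to vector forms of ARM64 FACGE
/// </summary>
public static Vector64<float> AbsoluteCompareGreaterThanOrEqual(Vector64<float> left, Vector64<float> right);
public static Vector128<float> AbsoluteCompareGreaterThanOrEqual(Vector128<float> left, Vector128<float> right);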
/// <summary>
/// Vector CompareGreaterThan
///
/// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
///
/// Corresponds to vector forms of ARM64 FACGT
/// </summary>
public static Vector64<float> AbsoluteCompareGreaterThan(Vector64<float> left, Vector64<float> right);
public static Vector128<float> AbsoluteCompareGreaterThan(Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector absolute difference
/// Corresponds to vector forms of ARM64 SABD, UABD, and FABD
/// </summary>
public static Vector64<byte> AbsoluteDifference(Vector64<byte> left, Vector64<byte> right);
public static Vector64<byte> AbsoluteDifference(Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector64<ushort> AbsoluteDifference(Vector64<ushort> left, Vector64<ushort> right);
public static Vector64<ushort> AbsoluteDifference(Vector64<short> left, Vector64<short> right);
public static Vector64<uint> AbsoluteDifference(Vector64<uint> left, Vector64<uint> right);
public static Vector64<uint> AbsoluteDifference(Vector64<int> left, Vector64<int> right);
public static Vector64<float> AbsoluteDifference(Vector64<float> left, Vector64<float> right);
public static Vector128<byte> AbsoluteDifference(Vector128<byte> left, Vector128<byte> right);
public static Vector128<byte> AbsoluteDifference(Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> AbsoluteDifference(Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<ushort> AbsoluteDifference(Vector128<short> left, Vector128<short> right);
public static Vector128<uint> AbsoluteDifference(Vector128<uint> left, Vector128<uint> right);
public static Vector128<uint> AbsoluteDifference(Vector128<int> left, Vector128<int> right);
public static Vector128<float> AbsoluteDifference(Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector absolute difference add
///
/// For each element result[elem] = addend[elem] + | left[elem] - right[elem] |
///
/// Corresponds to vector forms of ARM64 SABA, UABA
/// </summary>
public static Vector64<byte> AbsoluteDifferenceAdd(Vector64<byte> addend, Vector64<byte> left, Vector64<byte> right);
public static Vector64<byte> AbsoluteDifferenceAdd(Vector64<sbyte> addend, Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector64<ushort> AbsoluteDifferenceAdd(Vector64<ushort> addend, Vector64<ushort> left, Vector64<ushort> right);
public static Vector64<ushort> AbsoluteDifferenceAdd(Vector64<short> addend, Vector64<short> left, Vector64<short> right);
public static Vector64<uint> AbsoluteDifferenceAdd(Vector64<uint> addend, Vector64<uint> left, Vector64<uint> right);
public static Vector64<uint> AbsoluteDifferenceAdd(Vector64<int> addend, Vector64<int> left, Vector64<int> right);
public static Vector128<byte> AbsoluteDifferenceAdd(Vector128<byte> addend, Vector128<byte> left, Vector128<byte> right);
public static Vector128<byte> AbsoluteDifferenceAdd(Vector128<sbyte> addend, Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<short> addend, Vector128<short> left, Vector128<short> right);
public static Vector128<uint> AbsoluteDifferenceAdd(Vector128<uint> addend, Vector128<uint> left, Vector128<uint> right);
public static Vector128<uint> AbsoluteDifferenceAdd(Vector128<int> addend, Vector128<int> left, Vector128<int> right);
/// <summary>
/// Vector add pairwise
/// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
/// Corresponds to vector forms of ARM64 ADDP and FADDP
/// </summary>
public static Vector64<byte> AddPairwise(Vector64<byte> left, Vector64<byte> right) ;
public static Vector64<sbyte> AddPairwise(Vector64<sbyte> left, Vector64<sbyte> right) ;
public static Vector64<ushort> AddPairwise(Vector64<ushort> left, Vector64<ushort> right) ;
public static Vector64<short> AddPairwise(Vector64<short> left, Vector64<short> right) ;
public static Vector64<int> AddPairwise(Vector64<int> left, Vector64<int> right) ;
public static Vector64<uint> AddPairwise(Vector64<uint> left, Vector64<uint> right) ;
public static Vector64<float> AddPairwise(Vector64<float> left, Vector64<float> right) ;
/// <summary>
/// Vector extract from pair of vectors
/// For each byte result[byte] = byte + byteIndex < result.Length ? upper[byte + byteIndex] : lower[byte + byteIndex - result.Length]
///
/// Note: byteIndex must be a JIT time const expression which can be used to populate the literal immediate field
///
/// Corresponds to vector forms of ARM64 EXT
/// </summary>
public static Vector64<byte> ExtractVector64(Vector64<byte> upper, Vector64<byte> lower, byte byteIndex);
public static Vector64<sbyte> ExtractVector64(Vector64<sbyte> upper, Vector64<sbyte> lower, byte byteIndex);
public static Vector64<short> ExtractVector64(Vector64<short> upper, Vector64<short> lower, byte byteIndex);
public static Vector64<ushort> ExtractVector64(Vector64<ushort> upper, Vector64<ushort> lower, byte byteIndex);
public static Vector64<int> ExtractVector64(Vector64<int> upper, Vector64<int> lower, byte byteIndex);
public static Vector64<uint> ExtractVector64(Vector64<uint> upper, Vector64<uint> lower, byte byteIndex);
public static Vector128<byte> ExtractVector128(Vector128<byte> upper, Vector128<byte> lower, byte byteIndex);
public static Vector128<sbyte> ExtractVector128(Vector128<sbyte> upper, Vector128<sbyte> lower, byte byteIndex);
public static Vector128<short> ExtractVector128(Vector128<short> upper, Vector128<short> lower, byte byteIndex);
public static Vector128<ushort> ExtractVector128(Vector128<ushort> upper, Vector128<ushort> lower, byte byteIndex);
public static Vector128<int> ExtractVector128(Vector128<int> upper, Vector128<int> lower, byte byteIndex);
public static Vector128<uint> ExtractVector128(Vector128<uint> upper, Vector128<uint> lower, byte byteIndex);
public static Vector128<long> ExtractVector128(Vector128<long> upper, Vector128<long> lower, byte byteIndex);
public static Vector128<ulong> ExtractVector128(Vector128<ulong> upper, Vector128<ulong> lower, byte byteIndex);
public static Vector128<float> ExtractVector128(Vector128<float> upper, Vector128<float> lower, byte byteIndex);
public static Vector128<double> ExtractVector128(Vector128<double> upper, Vector128<double> lower, byte byteIndex);
/// <summary>
/// Vector max numeric
/// Corresponds to vector forms of ARM64 FMAXNM
/// </summary>
public static Vector64<float> MaxNumber(Vector64<float> left, Vector64<float> right);
public static Vector128<float> MaxNumber(Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector max pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 SMAXP, UMAXP, and FMAXP
/// </summary>
public static Vector64<byte> MaxPairwise(Vector64<byte> left, Vector64<byte> right);
public static Vector64<sbyte> MaxPairwise(Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector64<ushort> MaxPairwise(Vector64<ushort> left, Vector64<ushort> right);
public static Vector64<short> MaxPairwise(Vector64<short> left, Vector64<short> right);
public static Vector64<uint> MaxPairwise(Vector64<uint> left, Vector64<uint> right);
public static Vector64<int> MaxPairwise(Vector64<int> left, Vector64<int> right);
public static Vector64<float> MaxPairwise(Vector64<float> left, Vector64<float> right);
/// <summary>
/// Vector min numeric
/// Corresponds to vector forms of ARM64 FMINNM
/// </summary>
public static Vector64<float> MinNumber(Vector64<float> left, Vector64<float> right);
public static Vector128<float> MinNumber(Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector min pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 SMINP, UMINP, and FMINP
/// </summary>
public static Vector64<byte> MinPairwise(Vector64<byte> left, Vector64<byte> right);
public static Vector64<sbyte> MinPairwise(Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector64<ushort> MinPairwise(Vector64<ushort> left, Vector64<ushort> right);
public static Vector64<short> MinPairwise(Vector64<short> left, Vector64<short> right);
public static Vector64<uint> MinPairwise(Vector64<uint> left, Vector64<uint> right);
public static Vector64<int> MinPairwise(Vector64<int> left, Vector64<int> right);
public static Vector64<float> MinPairwise(Vector64<float> left, Vector64<float> right);
/// <summary>
/// Vector multiply add
///
/// For each element result[elem] = addend[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 MLA
/// </summary>
public static Vector64<byte> MultiplyAdd(Vector64<byte> addend, Vector64<byte> left, Vector64<byte> right);
public static Vector64<sbyte> MultiplyAdd(Vector64<sbyte> addend, Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector64<ushort> MultiplyAdd(Vector64<ushort> addend, Vector64<ushort> left, Vector64<ushort> right);
public static Vector64<short> MultiplyAdd(Vector64<short> addend, Vector64<short> left, Vector64<short> right);
public static Vector64<uint> MultiplyAdd(Vector64<uint> addend, Vector64<uint> left, Vector64<uint> right);
public static Vector64<int> MultiplyAdd(Vector64<int> addend, Vector64<int> left, Vector64<int> right);
public static Vector128<byte> MultiplyAdd(Vector128<byte> addend, Vector128<byte> left, Vector128<byte> right);
public static Vector128<sbyte> MultiplyAdd(Vector128<sbyte> addend, Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> MultiplyAdd(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<short> MultiplyAdd(Vector128<short> addend, Vector128<short> left, Vector128<short> right);
public static Vector128<uint> MultiplyAdd(Vector128<uint> addend, Vector128<uint> left, Vector128<uint> right);
public static Vector128<int> MultiplyAdd(Vector128<int> addend, Vector128<int> left, Vector128<int> right);
/// <summary>
/// Vector multiply add by element
///
/// For each element result[elem] = addend[elem] + left[elem] * right[rightIndex]
///
/// Corresponds to vector forms of ARM64 MLA
/// </summary>
public static Vector64<byte> MultiplyAddBySelectedScalar(Vector64<byte> addend, Vector64<byte> left, Vector64<byte> right, byte rightIndex);
public static Vector64<sbyte> MultiplyAddBySelectedScalar(Vector64<sbyte> addend, Vector64<sbyte> left, Vector64<sbyte> right, byte rightIndex);
public static Vector64<ushort> MultiplyAddBySelectedScalar(Vector64<ushort> addend, Vector64<ushort> left, Vector64<ushort> right, byte rightIndex);
public static Vector64<short> MultiplyAddBySelectedScalar(Vector64<short> addend, Vector64<short> left, Vector64<short> right, byte rightIndex);
public static Vector64<uint> MultiplyAddBySelectedScalar(Vector64<uint> addend, Vector64<uint> left, Vector64<uint> right, byte rightIndex);
public static Vector64<int> MultiplyAddBySelectedScalar(Vector64<int> addend, Vector64<int> left, Vector64<int> right, byte rightIndex);
public static Vector128<byte> MultiplyAddBySelectedScalar(Vector128<byte> addend, Vector128<byte> left, Vector128<byte> right, byte rightIndex);
public static Vector128<sbyte> MultiplyAddBySelectedScalar(Vector128<sbyte> addend, Vector128<sbyte> left, Vector128<sbyte> right, byte rightIndex);
public static Vector128<ushort> MultiplyAddBySelectedScalar(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
public static Vector128<short> MultiplyAddBySelectedScalar(Vector128<short> addend, Vector128<short> left, Vector128<short> right, byte rightIndex);
public static Vector128<uint> MultiplyAddBySelectedScalar(Vector128<uint> addend, Vector128<uint> left, Vector128<uint> right, byte rightIndex);
public static Vector128<int> MultiplyAddBySelectedScalar(Vector128<int> addend, Vector128<int> left, Vector128<int> right, byte rightIndex);
/// <summary>
/// Vector multiply subtract
///
/// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 MLS
/// </summary>
public static Vector64<byte> MultiplySubtract(Vector64<byte> minuend, Vector64<byte> left, Vector64<byte> right);
public static Vector64<sbyte> MultiplySubtract(Vector64<sbyte> minuend, Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector64<ushort> MultiplySubtract(Vector64<ushort> minuend, Vector64<ushort> left, Vector64<ushort> right);
public static Vector64<short> MultiplySubtract(Vector64<short> minuend, Vector64<short> left, Vector64<short> right);
public static Vector64<uint> MultiplySubtract(Vector64<uint> minuend, Vector64<uint> left, Vector64<uint> right);
public static Vector64<int> MultiplySubtract(Vector64<int> minuend, Vector64<int> left, Vector64<int> right);
public static Vector128<byte> MultiplySubtract(Vector128<byte> minuend, Vector128<byte> left, Vector128<byte> right);
public static Vector128<sbyte> MultiplySubtract(Vector128<sbyte> minuend, Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> MultiplySubtract(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<short> MultiplySubtract(Vector128<short> minuend, Vector128<short> left, Vector128<short> right);
public static Vector128<uint> MultiplySubtract(Vector128<uint> minuend, Vector128<uint> left, Vector128<uint> right);
public static Vector128<int> MultiplySubtract(Vector128<int> minuend, Vector128<int> left, Vector128<int> right);
/// <summary>
/// Vector multiply subtract by element
///
/// For each element result[elem] = minuend[elem] - left[elem] * right[rightIndex]
///
/// Corresponds to vector forms of ARM64 MLS
/// </summary>
public static Vector64<byte> MultiplySubtractBySelectedScalar(Vector64<byte> minuend, Vector64<byte> left, Vector64<byte> right, byte rightIndex);
public static Vector64<sbyte> MultiplySubtractBySelectedScalar(Vector64<sbyte> minuend, Vector64<sbyte> left, Vector64<sbyte> right, byte rightIndex);
public static Vector64<ushort> MultiplySubtractBySelectedScalar(Vector64<ushort> minuend, Vector64<ushort> left, Vector64<ushort> right, byte rightIndex);
public static Vector64<short> MultiplySubtractBySelectedScalar(Vector64<short> minuend, Vector64<short> left, Vector64<short> right, byte rightIndex);
public static Vector64<uint> MultiplySubtractBySelectedScalar(Vector64<uint> minuend, Vector64<uint> left, Vector64<uint> right, byte rightIndex);
public static Vector64<int> MultiplySubtractBySelectedScalar(Vector64<int> minuend, Vector64<int> left, Vector64<int> right, byte rightIndex);
public static Vector128<byte> MultiplySubtractBySelectedScalar(Vector128<byte> minuend, Vector128<byte> left, Vector128<byte> right, byte rightIndex);
public static Vector128<sbyte> MultiplySubtractBySelectedScalar(Vector128<sbyte> minuend, Vector128<sbyte> left, Vector128<sbyte> right, byte rightIndex);
public static Vector128<ushort> MultiplySubtractBySelectedScalar(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
public static Vector128<short> MultiplySubtractBySelectedScalar(Vector128<short> minuend, Vector128<short> left, Vector128<short> right, byte rightIndex);
public static Vector128<uint> MultiplySubtractBySelectedScalar(Vector128<uint> minuend, Vector128<uint> left, Vector128<uint> right, byte rightIndex);
public static Vector128<int> MultiplySubtractBySelectedScalar(Vector128<int> minuend, Vector128<int> left, Vector128<int> right, byte rightIndex);
/// <summary>
/// Vector fused multiply add
///
/// For each element result[elem] = addend[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLA
/// </summary>
public static Vector64<float> FusedMultiplyAdd(Vector64<float> addend, Vector64<float> left, Vector64<float> right);
public static Vector128<float> FusedMultiplyAdd(Vector128<float> addend, Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector fused multiply subtract
///
/// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLS
/// </summary>
public static Vector64<float> FusedMultiplySubtract(Vector64<float> minuend, Vector64<float> left, Vector64<float> right);
public static Vector128<float> FusedMultiplySubtract(Vector128<float> minuend, Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector polynomial multiply
/// Corresponds to vector forms of ARM64 PMUL
/// </summary>
public static Vector64<byte> PolynomialMultiply(Vector64<byte> left, Vector64<byte> right);
public static Vector64<sbyte> PolynomialMultiply(Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector128<byte> PolynomialMultiply(Vector128<byte> left, Vector128<byte> right);
public static Vector128<sbyte> PolynomialMultiply(Vector128<sbyte> left, Vector128<sbyte> right);
/// <summary>
/// Vector reciprocal estimate
///
/// See FRECPE docs
///
/// Corresponds to vector forms of ARM64 FRECPE
/// </summary>
public static Vector64<float> ReciprocalEstimate(Vector64<float> value);
public static Vector128<float> ReciprocalEstimate(Vector128<float> value);
/// <summary>
/// Vector reciprocal step
///
/// See FRECPS docs
///
/// Corresponds to vector forms of ARM64 FRECPS
/// </summary>
public static Vector64<float> ReciprocalStep(Vector64<float> left, Vector64<float> right);
public static Vector128<float> ReciprocalStep(Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector reciprocal square root estimate
///
/// See FRSQRTE docs
///
/// Corresponds to vector forms of ARM64 FRSQRTE
/// </summary>
public static Vector64<float> ReciprocalSquareRootEstimate(Vector64<float> value);
public static Vector128<float> ReciprocalSquareRootEstimate(Vector128<float> value);
/// <summary>
/// Vector reciprocal square root step
///
/// See FRSQRTS docs
///
/// Corresponds to vector forms of ARM64 FRSQRTS
/// </summary>
public static Vector64<float> ReciprocalSquareRootStep(Vector64<float> left, Vector64<float> right);
public static Vector128<float> ReciprocalSquareRootStep(Vector128<float> left, Vector128<float> right);
public partial class Arm64
{
/// <summary>
/// Vector CompareGreaterThanOrEqual
/// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
/// Corresponds to vector forms of ARM64 FACGE
/// </summary>
public static Vector128<double> AbsoluteCompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector CompareGreaterThan
///
/// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
///
/// Corresponds to vector forms of ARM64 FACGT
/// </summary>
public static Vector128<double> AbsoluteCompareGreaterThan(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector absolute difference
/// Corresponds to vector forms of ARM64 SABD, UABD, and FABD
/// </summary>
public static Vector128<double> AbsoluteDifference(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector add pairwise
/// For each byte result[byte] = 2*byte < result.Length ? (left[2*byte] + left[2*byte + 1]) : (right[2*byte - result.Length] + right[2*byte + 1 - result.Length])
/// Corresponds to vector forms of ARM64 ADDP, and FADDP
/// </summary>
public static Vector128<byte> AddPairwise(Vector128<byte> left, Vector128<byte> right);
public static Vector128<sbyte> AddPairwise(Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> AddPairwise(Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<short> AddPairwise(Vector128<short> left, Vector128<short> right);
public static Vector128<uint> AddPairwise(Vector128<uint> left, Vector128<uint> right);
public static Vector128<int> AddPairwise(Vector128<int> left, Vector128<int> right);
public static Vector128<long> AddPairwise(Vector128<long> left, Vector128<long> right);
public static Vector128<ulong> AddPairwise(Vector128<ulong> left, Vector128<ulong> right);
public static Vector128<float> AddPairwise(Vector128<float> left, Vector128<float> right);
public static Vector128<double> AddPairwise(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector extract from pair of vectors
/// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
///
/// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
///
/// Corresponds to vector forms of ARM64 EXT
/// </summary>
public static Vector128<double> ExtractVector(Vector128<double> left, Vector128<double> right, byte index);
/// <summary>
/// Vector add across vector elements
/// Corresponds to vector forms of ARM64 ADDV
/// </summary>
public static Vector64<byte> AddAcross(Vector64<byte> value);
public static Vector64<sbyte> AddAcross(Vector64<sbyte> value);
public static Vector64<ushort> AddAcross(Vector64<ushort> value);
public static Vector64<short> AddAcross(Vector64<short> value);
public static Vector64<uint> AddAcross(Vector64<uint> value);
public static Vector64<int> AddAcross(Vector64<int> value);
public static Vector64<byte> AddAcross(Vector128<byte> value);
public static Vector64<sbyte> AddAcross(Vector128<sbyte> value);
public static Vector64<ushort> AddAcross(Vector128<ushort> value);
public static Vector64<short> AddAcross(Vector128<short> value);
public static Vector64<uint> AddAcross(Vector128<uint> value);
public static Vector64<int> AddAcross(Vector128<int> value);
/// <summary>
/// Vector max numeric
/// Corresponds to vector forms of ARM64 FMAXNM
/// </summary>
public static Vector128<double> MaxNumber(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector max numeric pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 FMAXNMP
/// </summary>
public static Vector64<float> MaxNumberPairwise(Vector64<float> left, Vector64<float> right);
public static Vector128<float> MaxNumberPairwise(Vector128<float> left, Vector128<float> right);
public static Vector128<double> MaxNumberPairwise(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector max numeric across
///
/// result = max(value[0], ... , value[length -1])
///
/// Corresponds to vector forms of ARM64 FMAXNMV
/// </summary>
public static Vector64<float> MaxNumberAcross(Vector128<float> value);
/// <summary>
/// Vector max pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 SMAXP, UMAXP, and FMAXP
/// </summary>
public static Vector128<byte> MaxPairwise(Vector128<byte> left, Vector128<byte> right);
public static Vector128<sbyte> MaxPairwise(Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> MaxPairwise(Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<short> MaxPairwise(Vector128<short> left, Vector128<short> right);
public static Vector128<uint> MaxPairwise(Vector128<uint> left, Vector128<uint> right);
public static Vector128<int> MaxPairwise(Vector128<int> left, Vector128<int> right);
public static Vector128<float> MaxPairwise(Vector128<float> left, Vector128<float> right);
public static Vector128<double> MaxPairwise(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector max across
///
/// result = max(value[0], ... , value[value.Length - 1])
///
/// Corresponds to vector forms of ARM64 SMAXV, UMAXV, and FMAXV
///
/// NOTE(review): confirm that SMAXV/UMAXV/FMAXV have an encoding for every element
/// arrangement listed below (in particular the two-element and 64-bit-element overloads).
/// </summary>
/// <param name="value">Vector whose elements are reduced to a single maximum.</param>
/// <returns>A Vector64 carrying the reduction result (presumably in element 0 -- TODO confirm).</returns>
public static Vector64<byte> MaxAcross(Vector64<byte> value);
public static Vector64<sbyte> MaxAcross(Vector64<sbyte> value);
public static Vector64<ushort> MaxAcross(Vector64<ushort> value);
public static Vector64<short> MaxAcross(Vector64<short> value);
public static Vector64<uint> MaxAcross(Vector64<uint> value);
public static Vector64<int> MaxAcross(Vector64<int> value);
public static Vector64<float> MaxAcross(Vector64<float> value);
public static Vector64<byte> MaxAcross(Vector128<byte> value);
public static Vector64<sbyte> MaxAcross(Vector128<sbyte> value);
public static Vector64<ushort> MaxAcross(Vector128<ushort> value);
public static Vector64<short> MaxAcross(Vector128<short> value);
public static Vector64<uint> MaxAcross(Vector128<uint> value);
public static Vector64<int> MaxAcross(Vector128<int> value);
public static Vector64<ulong> MaxAcross(Vector128<ulong> value);
public static Vector64<long> MaxAcross(Vector128<long> value);
public static Vector64<float> MaxAcross(Vector128<float> value);
public static Vector64<double> MaxAcross(Vector128<double> value);
// Not reviewed:
//
// ///
// /// Vector min numeric
// /// Corresponds to vector forms of ARM64 FMINNM
// ///
// public static Vector128
//
// ///
// /// Vector min numeric pairwise
// ///
// /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
// ///
// /// Corresponds to vector forms of ARM64 FMINNMP
// ///
// public static Vector64
// public static Vector128
// public static Vector128
//
// ///
// /// Vector min numeric across
// ///
// /// result = min(value[0], ... , value[length -1])
// ///
// /// Corresponds to vector forms of ARM64 FMINNMV
// ///
// public static float MinNumberAcross(Vector128
//
// ///
// /// Vector min pairwise
// ///
// /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
// ///
// /// Corresponds to vector forms of ARM64 SMINP, UMINP, and FMINP
// ///
// public static Vector128
// public static Vector128
// public static Vector128
// public static Vector128
// public static Vector128
// public static Vector128
// public static Vector128
// public static Vector128
//
// ///
// /// Vector min across
// ///
// /// result = min(value[0], ... , value[length -1])
// ///
// /// Corresponds to vector forms of ARM64 SMINV, UMINV, and FMINV
// ///
// public static byte MinAcross(Vector64
// public static sbyte MinAcross(Vector64
// public static ushort MinAcross(Vector64
// public static short MinAcross(Vector64
// public static uint MinAcross(Vector64
// public static int MinAcross(Vector64
// public static float MinAcross(Vector64
// public static byte MinAcross(Vector128
// public static sbyte MinAcross(Vector128
// public static ushort MinAcross(Vector128
// public static short MinAcross(Vector128
// public static uint MinAcross(Vector128
// public static int MinAcross(Vector128
// public static float MinAcross(Vector128
// public static double MinAcross(Vector128
//
// ///
// /// Vector fused multiply add
// ///
// /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
// ///
// /// Corresponds to vector forms of ARM64 FMLA
// ///
// public static Vector128
//
// ///
// /// Vector fused multiply add by element
// ///
// /// For each element result[elem] = acc[elem] + left[elem] * right
// ///
// /// Corresponds to vector forms of ARM64 FMLA
// ///
// public static Vector64
// public static Vector128
// public static Vector128
//
// ///
// /// Vector fused multiply subtract
// ///
// /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
// ///
// /// Corresponds to vector forms of ARM64 FMLS
// ///
// public static Vector128
//
// ///
// /// Vector fused multiply subtract by element
// ///
// /// For each element result[elem] = acc[elem] - left[elem] * right
// ///
// /// Corresponds to vector forms of ARM64 FMLS
// ///
// public static Vector64
// public static Vector128
// public static Vector128
//
// ///
// /// Vector multiply extend
// ///
// /// For each element result[elem] = left[elem] * right[elem]
// /// Handle extend special cases zero and infinite. FMULX
// ///
// /// Corresponds to vector forms of ARM64 FMULX
// ///
// public static Vector64
// public static Vector128
// public static Vector128
//
// ///
// /// Vector multiply extend by element
// ///
// /// For each element result[elem] = left[elem] * right
// /// Handle extend special cases zero and infinite. FMULX
// ///
// /// Corresponds to vector forms of ARM64 FMULX
// ///
// public static Vector64
// public static Vector128
// public static Vector128
//
// /// Vector reciprocal estimate
// ///
// /// See FRECPE docs
// ///
// /// Corresponds to vector forms of ARM64 FRECPE
// ///
// public static Vector128
//
// ///
// /// Vector reciprocal step
// ///
// /// See FRECPS docs
// ///
// /// Corresponds to vector forms of ARM64 FRECPS
// ///
// public static Vector128
//
// ///
// /// Vector reciprocal square root estimate
// ///
// /// See FRSQRTE docs
// ///
// /// Corresponds to vector forms of ARM64 FRSQRTE
// ///
// public static Vector128
//
// ///
// /// Vector reciprocal square root step
// ///
// /// See FRSQRTS docs
// ///
// /// Corresponds to vector forms of ARM64 FRSQRTS
// ///
// public static Vector128
//
// ///
// /// Vector reverse byte bits
// /// Corresponds to vector forms of ARM64 RBIT
// ///
// public static Vector64
// public static Vector64
// public static Vector128
// public static Vector128
}
}
}
```

* `MaxPairwise`/`MinPairwise` should also include forms for `Vector128`
* Should the parameters of `ReciprocalStep` and `ReciprocalSquareRootStep` be something more specific than `left` and `right`?
* `ReverseElementBytes` needs more work, as well as clarity on support `Arm32` because it's not going to be implemented for .NET 5
* `ExtractVector` shouldn't have overloads for `float` and `double`, it could end up silently modifying/normalizing/corrupting the floating point types
* `MaxNumericPairwiseScalar` should be `MaxNumberPairwiseScalar`. Some folks raised concerns around `PairwiseScalar` being confusing, but it matches the ISA name and we can't think of a better name

```C#
namespace System.Runtime.Intrinsics.Arm
{
public partial class AdvSimd
{
/// <summary>
/// Vector CompareGreaterThanOrEqual
///
/// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
///
/// Corresponds to vector forms of ARM64 FACGE
/// </summary>
/// <param name="left">Vector whose element magnitudes are compared against right.</param>
/// <param name="right">Vector whose element magnitudes are compared against left.</param>
/// <returns>Per-element mask: all bits set where |left| >= |right|, otherwise zero.</returns>
public static Vector64<float> AbsoluteCompareGreaterThanOrEqual(Vector64<float> left, Vector64<float> right);
public static Vector128<float> AbsoluteCompareGreaterThanOrEqual(Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector CompareGreaterThan
///
/// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
///
/// Corresponds to vector forms of ARM64 FACGT
/// </summary>
/// <param name="left">Vector whose element magnitudes are compared against right.</param>
/// <param name="right">Vector whose element magnitudes are compared against left.</param>
/// <returns>Per-element mask: all bits set where |left| > |right|, otherwise zero.</returns>
public static Vector64<float> AbsoluteCompareGreaterThan(Vector64<float> left, Vector64<float> right);
public static Vector128<float> AbsoluteCompareGreaterThan(Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector absolute difference
///
/// For each element result[elem] = | left[elem] - right[elem] |
///
/// The signed integer overloads return the unsigned vector type of the same element width.
///
/// Corresponds to vector forms of ARM64 SABD, UABD, and FABD
/// </summary>
/// <param name="left">First operand of the difference.</param>
/// <param name="right">Second operand of the difference.</param>
public static Vector64<byte> AbsoluteDifference(Vector64<byte> left, Vector64<byte> right);
public static Vector64<byte> AbsoluteDifference(Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector64<ushort> AbsoluteDifference(Vector64<ushort> left, Vector64<ushort> right);
public static Vector64<ushort> AbsoluteDifference(Vector64<short> left, Vector64<short> right);
public static Vector64<uint> AbsoluteDifference(Vector64<uint> left, Vector64<uint> right);
public static Vector64<uint> AbsoluteDifference(Vector64<int> left, Vector64<int> right);
public static Vector64<float> AbsoluteDifference(Vector64<float> left, Vector64<float> right);
public static Vector128<byte> AbsoluteDifference(Vector128<byte> left, Vector128<byte> right);
public static Vector128<byte> AbsoluteDifference(Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> AbsoluteDifference(Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<ushort> AbsoluteDifference(Vector128<short> left, Vector128<short> right);
public static Vector128<uint> AbsoluteDifference(Vector128<uint> left, Vector128<uint> right);
public static Vector128<uint> AbsoluteDifference(Vector128<int> left, Vector128<int> right);
public static Vector128<float> AbsoluteDifference(Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector absolute difference add
///
/// For each element result[elem] = addend[elem] + | left[elem] - right[elem] |
///
/// The signed integer overloads return the unsigned vector type of the same element width.
///
/// Corresponds to vector forms of ARM64 SABA, UABA
/// </summary>
/// <param name="addend">Accumulator vector the absolute difference is added to.</param>
/// <param name="left">First operand of the difference.</param>
/// <param name="right">Second operand of the difference.</param>
public static Vector64<byte> AbsoluteDifferenceAdd(Vector64<byte> addend, Vector64<byte> left, Vector64<byte> right);
public static Vector64<byte> AbsoluteDifferenceAdd(Vector64<sbyte> addend, Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector64<ushort> AbsoluteDifferenceAdd(Vector64<ushort> addend, Vector64<ushort> left, Vector64<ushort> right);
public static Vector64<ushort> AbsoluteDifferenceAdd(Vector64<short> addend, Vector64<short> left, Vector64<short> right);
public static Vector64<uint> AbsoluteDifferenceAdd(Vector64<uint> addend, Vector64<uint> left, Vector64<uint> right);
public static Vector64<uint> AbsoluteDifferenceAdd(Vector64<int> addend, Vector64<int> left, Vector64<int> right);
public static Vector128<byte> AbsoluteDifferenceAdd(Vector128<byte> addend, Vector128<byte> left, Vector128<byte> right);
public static Vector128<byte> AbsoluteDifferenceAdd(Vector128<sbyte> addend, Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<short> addend, Vector128<short> left, Vector128<short> right);
public static Vector128<uint> AbsoluteDifferenceAdd(Vector128<uint> addend, Vector128<uint> left, Vector128<uint> right);
public static Vector128<uint> AbsoluteDifferenceAdd(Vector128<int> addend, Vector128<int> left, Vector128<int> right);
/// <summary>
/// Vector add pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
///
/// The lower half of the result is built from adjacent pairs of left, the upper half from adjacent pairs of right.
///
/// Corresponds to vector forms of ARM64 ADDP, and FADDP
/// </summary>
/// <param name="left">Vector supplying the pairs for the lower half of the result.</param>
/// <param name="right">Vector supplying the pairs for the upper half of the result.</param>
public static Vector64<byte> AddPairwise(Vector64<byte> left, Vector64<byte> right) ;
public static Vector64<sbyte> AddPairwise(Vector64<sbyte> left, Vector64<sbyte> right) ;
public static Vector64<ushort> AddPairwise(Vector64<ushort> left, Vector64<ushort> right) ;
public static Vector64<short> AddPairwise(Vector64<short> left, Vector64<short> right) ;
public static Vector64<int> AddPairwise(Vector64<int> left, Vector64<int> right) ;
public static Vector64<uint> AddPairwise(Vector64<uint> left, Vector64<uint> right) ;
public static Vector64<float> AddPairwise(Vector64<float> left, Vector64<float> right) ;
/// <summary>
/// Vector extract from pair of vectors
///
/// For each byte result[byte] = byte + byteIndex < result.Length ? upper[byte + byteIndex] : lower[byte + byteIndex - result.Length]
///
/// Note: byteIndex must be a JIT time const expression which can be used to populate the literal immediate field
///
/// NOTE(review): the formula's operands are mapped by position onto (upper, lower, byteIndex);
/// confirm that the upper/lower naming matches the EXT operand order.
///
/// Corresponds to vector forms of ARM64 EXT
/// </summary>
/// <param name="upper">First source vector.</param>
/// <param name="lower">Second source vector.</param>
/// <param name="byteIndex">Constant offset at which extraction starts.</param>
public static Vector64<byte> ExtractVector64(Vector64<byte> upper, Vector64<byte> lower, byte byteIndex);
public static Vector64<sbyte> ExtractVector64(Vector64<sbyte> upper, Vector64<sbyte> lower, byte byteIndex);
public static Vector64<short> ExtractVector64(Vector64<short> upper, Vector64<short> lower, byte byteIndex);
public static Vector64<ushort> ExtractVector64(Vector64<ushort> upper, Vector64<ushort> lower, byte byteIndex);
public static Vector64<int> ExtractVector64(Vector64<int> upper, Vector64<int> lower, byte byteIndex);
public static Vector64<uint> ExtractVector64(Vector64<uint> upper, Vector64<uint> lower, byte byteIndex);
public static Vector128<byte> ExtractVector128(Vector128<byte> upper, Vector128<byte> lower, byte byteIndex);
public static Vector128<sbyte> ExtractVector128(Vector128<sbyte> upper, Vector128<sbyte> lower, byte byteIndex);
public static Vector128<short> ExtractVector128(Vector128<short> upper, Vector128<short> lower, byte byteIndex);
public static Vector128<ushort> ExtractVector128(Vector128<ushort> upper, Vector128<ushort> lower, byte byteIndex);
public static Vector128<int> ExtractVector128(Vector128<int> upper, Vector128<int> lower, byte byteIndex);
public static Vector128<uint> ExtractVector128(Vector128<uint> upper, Vector128<uint> lower, byte byteIndex);
public static Vector128<long> ExtractVector128(Vector128<long> upper, Vector128<long> lower, byte byteIndex);
public static Vector128<ulong> ExtractVector128(Vector128<ulong> upper, Vector128<ulong> lower, byte byteIndex);
public static Vector128<float> ExtractVector128(Vector128<float> upper, Vector128<float> lower, byte byteIndex);
public static Vector128<double> ExtractVector128(Vector128<double> upper, Vector128<double> lower, byte byteIndex);
/// <summary>
/// Vector max numeric
///
/// For each element result[elem] = max(left[elem], right[elem]); see the FMAXNM description for how NaN operands are treated.
///
/// Corresponds to vector forms of ARM64 FMAXNM
/// </summary>
/// <param name="left">First vector to compare.</param>
/// <param name="right">Second vector to compare.</param>
public static Vector64<float> MaxNumber(Vector64<float> left, Vector64<float> right);
public static Vector128<float> MaxNumber(Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector max pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// The lower half of the result is built from adjacent pairs of left, the upper half from adjacent pairs of right.
///
/// Corresponds to vector forms of ARM64 SMAXP, UMAXP, and FMAXP
/// </summary>
/// <param name="left">Vector supplying the pairs for the lower half of the result.</param>
/// <param name="right">Vector supplying the pairs for the upper half of the result.</param>
public static Vector64<byte> MaxPairwise(Vector64<byte> left, Vector64<byte> right);
public static Vector64<sbyte> MaxPairwise(Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector64<ushort> MaxPairwise(Vector64<ushort> left, Vector64<ushort> right);
public static Vector64<short> MaxPairwise(Vector64<short> left, Vector64<short> right);
public static Vector64<uint> MaxPairwise(Vector64<uint> left, Vector64<uint> right);
public static Vector64<int> MaxPairwise(Vector64<int> left, Vector64<int> right);
public static Vector64<float> MaxPairwise(Vector64<float> left, Vector64<float> right);
/// <summary>
/// Vector min numeric
///
/// For each element result[elem] = min(left[elem], right[elem]); see the FMINNM description for how NaN operands are treated.
///
/// Corresponds to vector forms of ARM64 FMINNM
/// </summary>
/// <param name="left">First vector to compare.</param>
/// <param name="right">Second vector to compare.</param>
public static Vector64<float> MinNumber(Vector64<float> left, Vector64<float> right);
public static Vector128<float> MinNumber(Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector min pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// The lower half of the result is built from adjacent pairs of left, the upper half from adjacent pairs of right.
///
/// Corresponds to vector forms of ARM64 SMINP, UMINP, and FMINP
/// </summary>
/// <param name="left">Vector supplying the pairs for the lower half of the result.</param>
/// <param name="right">Vector supplying the pairs for the upper half of the result.</param>
public static Vector64<byte> MinPairwise(Vector64<byte> left, Vector64<byte> right);
public static Vector64<sbyte> MinPairwise(Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector64<ushort> MinPairwise(Vector64<ushort> left, Vector64<ushort> right);
public static Vector64<short> MinPairwise(Vector64<short> left, Vector64<short> right);
public static Vector64<uint> MinPairwise(Vector64<uint> left, Vector64<uint> right);
public static Vector64<int> MinPairwise(Vector64<int> left, Vector64<int> right);
public static Vector64<float> MinPairwise(Vector64<float> left, Vector64<float> right);
/// <summary>
/// Vector multiply add
///
/// For each element result[elem] = addend[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 MLA
/// </summary>
/// <param name="addend">Accumulator vector the product is added to.</param>
/// <param name="left">First factor of the product.</param>
/// <param name="right">Second factor of the product.</param>
public static Vector64<byte> MultiplyAdd(Vector64<byte> addend, Vector64<byte> left, Vector64<byte> right);
public static Vector64<sbyte> MultiplyAdd(Vector64<sbyte> addend, Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector64<ushort> MultiplyAdd(Vector64<ushort> addend, Vector64<ushort> left, Vector64<ushort> right);
public static Vector64<short> MultiplyAdd(Vector64<short> addend, Vector64<short> left, Vector64<short> right);
public static Vector64<uint> MultiplyAdd(Vector64<uint> addend, Vector64<uint> left, Vector64<uint> right);
public static Vector64<int> MultiplyAdd(Vector64<int> addend, Vector64<int> left, Vector64<int> right);
public static Vector128<byte> MultiplyAdd(Vector128<byte> addend, Vector128<byte> left, Vector128<byte> right);
public static Vector128<sbyte> MultiplyAdd(Vector128<sbyte> addend, Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> MultiplyAdd(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<short> MultiplyAdd(Vector128<short> addend, Vector128<short> left, Vector128<short> right);
public static Vector128<uint> MultiplyAdd(Vector128<uint> addend, Vector128<uint> left, Vector128<uint> right);
public static Vector128<int> MultiplyAdd(Vector128<int> addend, Vector128<int> left, Vector128<int> right);
/// <summary>
/// Vector multiply add by element
///
/// For each element result[elem] = addend[elem] + left[elem] * right[rightIndex]
///
/// Corresponds to vector forms of ARM64 MLA
/// </summary>
/// <param name="addend">Accumulator vector the product is added to.</param>
/// <param name="left">Vector whose elements are each multiplied by the selected scalar.</param>
/// <param name="right">Vector the scalar factor is selected from.</param>
/// <param name="rightIndex">Index of the element of right used as the scalar factor.</param>
public static Vector64<byte> MultiplyAddBySelectedScalar(Vector64<byte> addend, Vector64<byte> left, Vector64<byte> right, byte rightIndex);
public static Vector64<sbyte> MultiplyAddBySelectedScalar(Vector64<sbyte> addend, Vector64<sbyte> left, Vector64<sbyte> right, byte rightIndex);
public static Vector64<ushort> MultiplyAddBySelectedScalar(Vector64<ushort> addend, Vector64<ushort> left, Vector64<ushort> right, byte rightIndex);
public static Vector64<short> MultiplyAddBySelectedScalar(Vector64<short> addend, Vector64<short> left, Vector64<short> right, byte rightIndex);
public static Vector64<uint> MultiplyAddBySelectedScalar(Vector64<uint> addend, Vector64<uint> left, Vector64<uint> right, byte rightIndex);
public static Vector64<int> MultiplyAddBySelectedScalar(Vector64<int> addend, Vector64<int> left, Vector64<int> right, byte rightIndex);
public static Vector128<byte> MultiplyAddBySelectedScalar(Vector128<byte> addend, Vector128<byte> left, Vector128<byte> right, byte rightIndex);
public static Vector128<sbyte> MultiplyAddBySelectedScalar(Vector128<sbyte> addend, Vector128<sbyte> left, Vector128<sbyte> right, byte rightIndex);
public static Vector128<ushort> MultiplyAddBySelectedScalar(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
public static Vector128<short> MultiplyAddBySelectedScalar(Vector128<short> addend, Vector128<short> left, Vector128<short> right, byte rightIndex);
public static Vector128<uint> MultiplyAddBySelectedScalar(Vector128<uint> addend, Vector128<uint> left, Vector128<uint> right, byte rightIndex);
public static Vector128<int> MultiplyAddBySelectedScalar(Vector128<int> addend, Vector128<int> left, Vector128<int> right, byte rightIndex);
/// <summary>
/// Vector multiply subtract
///
/// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 MLS
/// </summary>
/// <param name="minuend">Accumulator vector the product is subtracted from.</param>
/// <param name="left">First factor of the product.</param>
/// <param name="right">Second factor of the product.</param>
public static Vector64<byte> MultiplySubtract(Vector64<byte> minuend, Vector64<byte> left, Vector64<byte> right);
public static Vector64<sbyte> MultiplySubtract(Vector64<sbyte> minuend, Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector64<ushort> MultiplySubtract(Vector64<ushort> minuend, Vector64<ushort> left, Vector64<ushort> right);
public static Vector64<short> MultiplySubtract(Vector64<short> minuend, Vector64<short> left, Vector64<short> right);
public static Vector64<uint> MultiplySubtract(Vector64<uint> minuend, Vector64<uint> left, Vector64<uint> right);
public static Vector64<int> MultiplySubtract(Vector64<int> minuend, Vector64<int> left, Vector64<int> right);
public static Vector128<byte> MultiplySubtract(Vector128<byte> minuend, Vector128<byte> left, Vector128<byte> right);
public static Vector128<sbyte> MultiplySubtract(Vector128<sbyte> minuend, Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> MultiplySubtract(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<short> MultiplySubtract(Vector128<short> minuend, Vector128<short> left, Vector128<short> right);
public static Vector128<uint> MultiplySubtract(Vector128<uint> minuend, Vector128<uint> left, Vector128<uint> right);
public static Vector128<int> MultiplySubtract(Vector128<int> minuend, Vector128<int> left, Vector128<int> right);
/// <summary>
/// Vector multiply subtract by element
///
/// For each element result[elem] = minuend[elem] - left[elem] * right[rightIndex]
///
/// Corresponds to vector forms of ARM64 MLS
/// </summary>
/// <param name="minuend">Accumulator vector the product is subtracted from.</param>
/// <param name="left">Vector whose elements are each multiplied by the selected scalar.</param>
/// <param name="right">Vector the scalar factor is selected from.</param>
/// <param name="rightIndex">Index of the element of right used as the scalar factor.</param>
public static Vector64<byte> MultiplySubtractBySelectedScalar(Vector64<byte> minuend, Vector64<byte> left, Vector64<byte> right, byte rightIndex);
public static Vector64<sbyte> MultiplySubtractBySelectedScalar(Vector64<sbyte> minuend, Vector64<sbyte> left, Vector64<sbyte> right, byte rightIndex);
public static Vector64<ushort> MultiplySubtractBySelectedScalar(Vector64<ushort> minuend, Vector64<ushort> left, Vector64<ushort> right, byte rightIndex);
public static Vector64<short> MultiplySubtractBySelectedScalar(Vector64<short> minuend, Vector64<short> left, Vector64<short> right, byte rightIndex);
public static Vector64<uint> MultiplySubtractBySelectedScalar(Vector64<uint> minuend, Vector64<uint> left, Vector64<uint> right, byte rightIndex);
public static Vector64<int> MultiplySubtractBySelectedScalar(Vector64<int> minuend, Vector64<int> left, Vector64<int> right, byte rightIndex);
public static Vector128<byte> MultiplySubtractBySelectedScalar(Vector128<byte> minuend, Vector128<byte> left, Vector128<byte> right, byte rightIndex);
public static Vector128<sbyte> MultiplySubtractBySelectedScalar(Vector128<sbyte> minuend, Vector128<sbyte> left, Vector128<sbyte> right, byte rightIndex);
public static Vector128<ushort> MultiplySubtractBySelectedScalar(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
public static Vector128<short> MultiplySubtractBySelectedScalar(Vector128<short> minuend, Vector128<short> left, Vector128<short> right, byte rightIndex);
public static Vector128<uint> MultiplySubtractBySelectedScalar(Vector128<uint> minuend, Vector128<uint> left, Vector128<uint> right, byte rightIndex);
public static Vector128<int> MultiplySubtractBySelectedScalar(Vector128<int> minuend, Vector128<int> left, Vector128<int> right, byte rightIndex);
/// <summary>
/// Vector fused multiply add
///
/// For each element result[elem] = addend[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLA
/// </summary>
/// <param name="addend">Accumulator vector the product is added to.</param>
/// <param name="left">First factor of the product.</param>
/// <param name="right">Second factor of the product.</param>
public static Vector64<float> FusedMultiplyAdd(Vector64<float> addend, Vector64<float> left, Vector64<float> right);
public static Vector128<float> FusedMultiplyAdd(Vector128<float> addend, Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector fused multiply subtract
///
/// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLS
/// </summary>
/// <param name="minuend">Accumulator vector the product is subtracted from.</param>
/// <param name="left">First factor of the product.</param>
/// <param name="right">Second factor of the product.</param>
public static Vector64<float> FusedMultiplySubtract(Vector64<float> minuend, Vector64<float> left, Vector64<float> right);
public static Vector128<float> FusedMultiplySubtract(Vector128<float> minuend, Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector polynomial multiply
///
/// Only 8-bit element overloads (byte and sbyte) are declared.
///
/// Corresponds to vector forms of ARM64 PMUL
/// </summary>
/// <param name="left">First polynomial operand vector.</param>
/// <param name="right">Second polynomial operand vector.</param>
public static Vector64<byte> PolynomialMultiply(Vector64<byte> left, Vector64<byte> right);
public static Vector64<sbyte> PolynomialMultiply(Vector64<sbyte> left, Vector64<sbyte> right);
public static Vector128<byte> PolynomialMultiply(Vector128<byte> left, Vector128<byte> right);
public static Vector128<sbyte> PolynomialMultiply(Vector128<sbyte> left, Vector128<sbyte> right);
/// <summary>
/// Vector reciprocal estimate
///
/// See FRECPE docs
///
/// Corresponds to vector forms of ARM64 FRECPE
/// </summary>
/// <param name="value">Vector whose per-element reciprocal is estimated.</param>
public static Vector64<float> ReciprocalEstimate(Vector64<float> value);
public static Vector128<float> ReciprocalEstimate(Vector128<float> value);
/// <summary>
/// Vector reciprocal step
///
/// See FRECPS docs
///
/// NOTE(review): the Arm reference describes FRECPS as 2.0 - left[elem] * right[elem] per element;
/// confirm against the instruction description before relying on it.
///
/// Corresponds to vector forms of ARM64 FRECPS
/// </summary>
/// <param name="left">First source vector.</param>
/// <param name="right">Second source vector.</param>
public static Vector64<float> ReciprocalStep(Vector64<float> left, Vector64<float> right);
public static Vector128<float> ReciprocalStep(Vector128<float> left, Vector128<float> right);
/// <summary>
/// Vector reciprocal square root estimate
///
/// See FRSQRTE docs
///
/// Corresponds to vector forms of ARM64 FRSQRTE
/// </summary>
/// <param name="value">Vector whose per-element reciprocal square root is estimated.</param>
public static Vector64<float> ReciprocalSquareRootEstimate(Vector64<float> value);
public static Vector128<float> ReciprocalSquareRootEstimate(Vector128<float> value);
/// <summary>
/// Vector reciprocal square root step
///
/// See FRSQRTS docs
///
/// NOTE(review): the Arm reference describes FRSQRTS as (3.0 - left[elem] * right[elem]) / 2.0 per element;
/// confirm against the instruction description before relying on it.
///
/// Corresponds to vector forms of ARM64 FRSQRTS
/// </summary>
/// <param name="left">First source vector.</param>
/// <param name="right">Second source vector.</param>
public static Vector64<float> ReciprocalSquareRootStep(Vector64<float> left, Vector64<float> right);
public static Vector128<float> ReciprocalSquareRootStep(Vector128<float> left, Vector128<float> right);
public partial class Arm64
{
/// <summary>
/// Vector CompareGreaterThanOrEqual
///
/// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
///
/// Corresponds to vector forms of ARM64 FACGE
/// </summary>
/// <param name="left">Vector whose element magnitudes are compared against right.</param>
/// <param name="right">Vector whose element magnitudes are compared against left.</param>
/// <returns>Per-element mask: all bits set where |left| >= |right|, otherwise zero.</returns>
public static Vector128<double> AbsoluteCompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector CompareGreaterThan
///
/// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
///
/// Corresponds to vector forms of ARM64 FACGT
/// </summary>
/// <param name="left">Vector whose element magnitudes are compared against right.</param>
/// <param name="right">Vector whose element magnitudes are compared against left.</param>
/// <returns>Per-element mask: all bits set where |left| > |right|, otherwise zero.</returns>
public static Vector128<double> AbsoluteCompareGreaterThan(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector absolute difference
///
/// For each element result[elem] = | left[elem] - right[elem] |
///
/// Corresponds to vector forms of ARM64 SABD, UABD, and FABD
/// (only a double overload is declared here, so presumably only FABD applies -- TODO confirm)
/// </summary>
/// <param name="left">First operand of the difference.</param>
/// <param name="right">Second operand of the difference.</param>
public static Vector128<double> AbsoluteDifference(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector add pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
///
/// The lower half of the result is built from adjacent pairs of left, the upper half from adjacent pairs of right.
///
/// Corresponds to vector forms of ARM64 ADDP, and FADDP
/// </summary>
/// <param name="left">Vector supplying the pairs for the lower half of the result.</param>
/// <param name="right">Vector supplying the pairs for the upper half of the result.</param>
public static Vector128<byte> AddPairwise(Vector128<byte> left, Vector128<byte> right);
public static Vector128<sbyte> AddPairwise(Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> AddPairwise(Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<short> AddPairwise(Vector128<short> left, Vector128<short> right);
public static Vector128<uint> AddPairwise(Vector128<uint> left, Vector128<uint> right);
public static Vector128<int> AddPairwise(Vector128<int> left, Vector128<int> right);
public static Vector128<long> AddPairwise(Vector128<long> left, Vector128<long> right);
public static Vector128<ulong> AddPairwise(Vector128<ulong> left, Vector128<ulong> right);
public static Vector128<float> AddPairwise(Vector128<float> left, Vector128<float> right);
public static Vector128<double> AddPairwise(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector extract from pair of vectors
///
/// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
///
/// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
///
/// NOTE(review): the non-Arm64 overloads in this listing are named ExtractVector64/ExtractVector128 and take
/// (upper, lower, byteIndex), while this one still uses ExtractVector(left, right, index) -- confirm which
/// naming is intended. The review notes in this thread also question float/double overloads of ExtractVector.
///
/// Corresponds to vector forms of ARM64 EXT
/// </summary>
/// <param name="left">First source vector.</param>
/// <param name="right">Second source vector.</param>
/// <param name="index">Constant offset at which extraction starts.</param>
public static Vector128<double> ExtractVector(Vector128<double> left, Vector128<double> right, byte index);
/// <summary>
/// Vector add across vector elements
///
/// result = value[0] + ... + value[value.Length - 1]
///
/// Corresponds to vector forms of ARM64 ADDV
/// </summary>
/// <param name="value">Vector whose elements are summed.</param>
/// <returns>A Vector64 carrying the sum (presumably in element 0 -- TODO confirm).</returns>
public static Vector64<byte> AddAcross(Vector64<byte> value);
public static Vector64<sbyte> AddAcross(Vector64<sbyte> value);
public static Vector64<ushort> AddAcross(Vector64<ushort> value);
public static Vector64<short> AddAcross(Vector64<short> value);
public static Vector64<uint> AddAcross(Vector64<uint> value);
public static Vector64<int> AddAcross(Vector64<int> value);
public static Vector64<byte> AddAcross(Vector128<byte> value);
public static Vector64<sbyte> AddAcross(Vector128<sbyte> value);
public static Vector64<ushort> AddAcross(Vector128<ushort> value);
public static Vector64<short> AddAcross(Vector128<short> value);
public static Vector64<uint> AddAcross(Vector128<uint> value);
public static Vector64<int> AddAcross(Vector128<int> value);
/// <summary>
/// Vector max numeric
///
/// For each element result[elem] = max(left[elem], right[elem]); see the FMAXNM description for how NaN operands are treated.
///
/// Corresponds to vector forms of ARM64 FMAXNM
/// </summary>
/// <param name="left">First vector to compare.</param>
/// <param name="right">Second vector to compare.</param>
public static Vector128<double> MaxNumber(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector max numeric pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// The lower half of the result is built from adjacent pairs of left, the upper half from adjacent pairs of right.
///
/// Corresponds to vector forms of ARM64 FMAXNMP
/// </summary>
/// <param name="left">Vector supplying the pairs for the lower half of the result.</param>
/// <param name="right">Vector supplying the pairs for the upper half of the result.</param>
public static Vector64<float> MaxNumberPairwise(Vector64<float> left, Vector64<float> right);
public static Vector128<float> MaxNumberPairwise(Vector128<float> left, Vector128<float> right);
public static Vector128<double> MaxNumberPairwise(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector max numeric across
///
/// result = max(value[0], ... , value[value.Length - 1])
///
/// Corresponds to vector forms of ARM64 FMAXNMV
/// </summary>
/// <param name="value">Vector whose elements are reduced to a single maximum.</param>
/// <returns>A Vector64 carrying the reduction result (presumably in element 0 -- TODO confirm).</returns>
public static Vector64<float> MaxNumberAcross(Vector128<float> value);
/// <summary>
/// Vector max pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// The lower half of the result is built from adjacent pairs of left, the upper half from adjacent pairs of right.
///
/// Corresponds to vector forms of ARM64 SMAXP, UMAXP, and FMAXP
/// </summary>
/// <param name="left">Vector supplying the pairs for the lower half of the result.</param>
/// <param name="right">Vector supplying the pairs for the upper half of the result.</param>
public static Vector128<byte> MaxPairwise(Vector128<byte> left, Vector128<byte> right);
public static Vector128<sbyte> MaxPairwise(Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> MaxPairwise(Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<short> MaxPairwise(Vector128<short> left, Vector128<short> right);
public static Vector128<uint> MaxPairwise(Vector128<uint> left, Vector128<uint> right);
public static Vector128<int> MaxPairwise(Vector128<int> left, Vector128<int> right);
public static Vector128<float> MaxPairwise(Vector128<float> left, Vector128<float> right);
public static Vector128<double> MaxPairwise(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector max across
///
/// result = max(value[0], ... , value[value.Length - 1])
///
/// Corresponds to vector forms of ARM64 SMAXV, UMAXV, and FMAXV
///
/// NOTE(review): confirm that SMAXV/UMAXV/FMAXV have an encoding for every element
/// arrangement listed below (in particular the two-element and 64-bit-element overloads).
/// </summary>
/// <param name="value">Vector whose elements are reduced to a single maximum.</param>
/// <returns>A Vector64 carrying the reduction result (presumably in element 0 -- TODO confirm).</returns>
public static Vector64<byte> MaxAcross(Vector64<byte> value);
public static Vector64<sbyte> MaxAcross(Vector64<sbyte> value);
public static Vector64<ushort> MaxAcross(Vector64<ushort> value);
public static Vector64<short> MaxAcross(Vector64<short> value);
public static Vector64<uint> MaxAcross(Vector64<uint> value);
public static Vector64<int> MaxAcross(Vector64<int> value);
public static Vector64<float> MaxAcross(Vector64<float> value);
public static Vector64<byte> MaxAcross(Vector128<byte> value);
public static Vector64<sbyte> MaxAcross(Vector128<sbyte> value);
public static Vector64<ushort> MaxAcross(Vector128<ushort> value);
public static Vector64<short> MaxAcross(Vector128<short> value);
public static Vector64<uint> MaxAcross(Vector128<uint> value);
public static Vector64<int> MaxAcross(Vector128<int> value);
public static Vector64<ulong> MaxAcross(Vector128<ulong> value);
public static Vector64<long> MaxAcross(Vector128<long> value);
public static Vector64<float> MaxAcross(Vector128<float> value);
public static Vector64<double> MaxAcross(Vector128<double> value);
// -------------------------------------------------
// Reviewed today:
// -------------------------------------------------
/// <summary>
/// Vector min numeric
///
/// For each element result[elem] = min(left[elem], right[elem]); see the FMINNM description for how NaN operands are treated.
///
/// Corresponds to vector forms of ARM64 FMINNM
/// </summary>
/// <param name="left">First vector to compare.</param>
/// <param name="right">Second vector to compare.</param>
public static Vector128<double> MinNumber(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector min numeric pairwise
///
/// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// The lower half of the result is built from adjacent pairs of left, the upper half from adjacent pairs of right.
///
/// Corresponds to vector forms of ARM64 FMINNMP
/// </summary>
/// <param name="left">Vector supplying the pairs for the lower half of the result.</param>
/// <param name="right">Vector supplying the pairs for the upper half of the result.</param>
public static Vector64<float> MinNumberPairwise(Vector64<float> left, Vector64<float> right);
public static Vector128<float> MinNumberPairwise(Vector128<float> left, Vector128<float> right);
public static Vector128<double> MinNumberPairwise(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector min numeric across
///
/// Reduces all elements of value to a single scalar minimum.
///
/// result = min(value[0], ... , value[length - 1])
///
/// Corresponds to vector forms of ARM64 FMINNMV
/// </summary>
public static float MinNumberAcross(Vector128<float> value);
/// <summary>
/// Vector min pairwise
///
/// Adjacent pairs of elements are reduced to their minimum; the lower half of the result
/// comes from pairs in left, the upper half from pairs in right.
///
/// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
///
/// Corresponds to vector forms of ARM64 SMINP, UMINP, and FMINP
/// </summary>
public static Vector128<byte> MinPairwise(Vector128<byte> left, Vector128<byte> right);
public static Vector128<sbyte> MinPairwise(Vector128<sbyte> left, Vector128<sbyte> right);
public static Vector128<ushort> MinPairwise(Vector128<ushort> left, Vector128<ushort> right);
public static Vector128<short> MinPairwise(Vector128<short> left, Vector128<short> right);
public static Vector128<uint> MinPairwise(Vector128<uint> left, Vector128<uint> right);
public static Vector128<int> MinPairwise(Vector128<int> left, Vector128<int> right);
public static Vector128<float> MinPairwise(Vector128<float> left, Vector128<float> right);
public static Vector128<double> MinPairwise(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector min across
///
/// Reduces all elements of value to their minimum.
///
/// result = min(value[0], ... , value[length - 1])
///
/// Corresponds to vector forms of ARM64 SMINV, UMINV, and FMINV
/// </summary>
public static Vector64<byte> MinAcross(Vector64<byte> value);
public static Vector64<sbyte> MinAcross(Vector64<sbyte> value);
public static Vector64<ushort> MinAcross(Vector64<ushort> value);
public static Vector64<short> MinAcross(Vector64<short> value);
public static Vector64<uint> MinAcross(Vector64<uint> value);
public static Vector64<int> MinAcross(Vector64<int> value);
public static Vector64<float> MinAcross(Vector64<float> value);
public static Vector64<byte> MinAcross(Vector128<byte> value);
public static Vector64<sbyte> MinAcross(Vector128<sbyte> value);
public static Vector64<ushort> MinAcross(Vector128<ushort> value);
public static Vector64<short> MinAcross(Vector128<short> value);
public static Vector64<uint> MinAcross(Vector128<uint> value);
public static Vector64<int> MinAcross(Vector128<int> value);
public static Vector64<float> MinAcross(Vector128<float> value);
public static Vector64<double> MinAcross(Vector128<double> value);
/// <summary>
/// Vector fused multiply add
///
/// For each element result[elem] = addend[elem] + left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLA
/// </summary>
public static Vector128<double> FusedMultiplyAdd(Vector128<double> addend, Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector fused multiply add by element
///
/// Every element of left is multiplied by the same scalar right and accumulated into addend.
///
/// For each element result[elem] = addend[elem] + left[elem] * right
///
/// Corresponds to vector forms of ARM64 FMLA
/// </summary>
public static Vector64<float> FusedMultiplyAdd(Vector64<float> addend, Vector64<float> left, float right);
public static Vector128<float> FusedMultiplyAdd(Vector128<float> addend, Vector128<float> left, float right);
public static Vector128<double> FusedMultiplyAdd(Vector128<double> addend, Vector128<double> left, double right);
/// <summary>
/// Vector fused multiply subtract
///
/// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
///
/// Corresponds to vector forms of ARM64 FMLS
/// </summary>
public static Vector128<double> FusedMultiplySubtract(Vector128<double> minuend, Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector fused multiply subtract by element
///
/// Every element of left is multiplied by the same scalar right and subtracted from minuend.
///
/// For each element result[elem] = minuend[elem] - left[elem] * right
///
/// Corresponds to vector forms of ARM64 FMLS
/// </summary>
public static Vector64<float> FusedMultiplySubtract(Vector64<float> minuend, Vector64<float> left, float right);
public static Vector128<float> FusedMultiplySubtract(Vector128<float> minuend, Vector128<float> left, float right);
public static Vector128<double> FusedMultiplySubtract(Vector128<double> minuend, Vector128<double> left, double right);
/// <summary>
/// Vector multiply extend
///
/// For each element result[elem] = left[elem] * right[elem]
/// The "extended" form gives special handling to zero and infinite operands; see the FMULX documentation for the exact results.
///
/// Corresponds to vector forms of ARM64 FMULX
/// </summary>
public static Vector64<float> MultiplyExtended(Vector64<float> left, Vector64<float> right);
public static Vector128<float> MultiplyExtended(Vector128<float> left, Vector128<float> right);
public static Vector128<double> MultiplyExtended(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector multiply extend by element
///
/// For each element result[elem] = left[elem] * right[rightIndex]
/// The "extended" form gives special handling to zero and infinite operands; see the FMULX documentation for the exact results.
/// NOTE(review): rightIndex presumably has to be a JIT-time constant in range for the element count of right - confirm.
///
/// Corresponds to vector forms of ARM64 FMULX
/// </summary>
public static Vector64<float> MultiplyExtendedBySelectedScalar(Vector64<float> left, Vector64<float> right, byte rightIndex);
public static Vector128<float> MultiplyExtendedBySelectedScalar(Vector128<float> left, Vector128<float> right, byte rightIndex);
public static Vector128<double> MultiplyExtendedBySelectedScalar(Vector128<double> left, Vector128<double> right, byte rightIndex);
/// <summary>
/// Vector reciprocal estimate
///
/// See the FRECPE instruction documentation for the exact semantics.
///
/// Corresponds to vector forms of ARM64 FRECPE
/// </summary>
public static Vector128<double> ReciprocalEstimate(Vector128<double> value);
/// <summary>
/// Vector reciprocal step
///
/// See the FRECPS instruction documentation for the exact semantics.
///
/// Corresponds to vector forms of ARM64 FRECPS
/// </summary>
public static Vector128<double> ReciprocalStep(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector reciprocal square root estimate
///
/// See the FRSQRTE instruction documentation for the exact semantics.
///
/// Corresponds to vector forms of ARM64 FRSQRTE
/// </summary>
public static Vector128<double> ReciprocalSquareRootEstimate(Vector128<double> value);
/// <summary>
/// Vector reciprocal square root step
///
/// See the FRSQRTS instruction documentation for the exact semantics.
///
/// Corresponds to vector forms of ARM64 FRSQRTS
/// </summary>
public static Vector128<double> ReciprocalSquareRootStep(Vector128<double> left, Vector128<double> right);
/// <summary>
/// Vector reverse byte bits
///
/// Reverses the order of the bits within each 8-bit element of value.
///
/// Corresponds to vector forms of ARM64 RBIT
/// </summary>
public static Vector64<byte> ReverseElementBits(Vector64<byte> value);
public static Vector64<sbyte> ReverseElementBits(Vector64<sbyte> value);
public static Vector128<byte> ReverseElementBits(Vector128<byte> value);
public static Vector128<sbyte> ReverseElementBits(Vector128<sbyte> value);
}
}
}
```
I think we should reconsider the API design for `ExtractVector64` and `ExtractVector128`.
Instead of
1) forbidding floating-point types and
2) specifying a `byteIndex`,
we should follow the approach that the C++ intrinsics take: specify an `elementIndex` and have the JIT convert this element index to a byte-index immediate. This way we will not get a denormalized floating-point value as a result, and we will be on par with the C++ implementations.
For example, for `ExtractVector64(upper, lower, 1)`, where `upper` and `lower` are `Vector64<float>`, the JIT will emit `EXT Vd.8B, Vn.8B, Vm.8B, 4`.
I don't like the approach of taking an `elementIndex`. That artificially limits the usage of the intrinsics and prevents you from extracting an arbitrary 64-bit sequence.
There are separate intrinsics for extracting individual elements from a vector (https://github.com/dotnet/runtime/issues/24588), and users wanting to work with float can just use the zero-cost reinterpret-cast APIs (`.AsInt32` and `.AsSingle`), which will force them to rationalize the denormal scenario and take it into consideration.
Another question concerns `FusedMultiplyAddBySelectedScalar` and `FusedMultiplySubtractBySelectedScalar`.
In C++ there exist
float32x2_t vfma_lane_f32 (float32x2_t a, float32x2_t b, float32x2_t v, const int lane)
and
float32x2_t vfma_laneq_f32 (float32x2_t a, float32x2_t b, float32x4_t v, const int lane)
Shouldn't we follow the same approach, i.e. have multiple overloads such as:
public static Vector64<float> FusedMultiplyAddBySelectedScalar(Vector64<float> addend, Vector64<float> left, Vector64<float> right, byte rightIndex);
public static Vector64<float> FusedMultiplyAddBySelectedScalar(Vector64<float> addend, Vector64<float> left, Vector128<float> right, byte rightIndex);
Alternatively, we can have `right` be `Vector128<T>` no matter the size of `addend` and `left`, and upcast `Vector64<T>` to `Vector128<T>` if needed.
The current design, however, in which we match the sizes of `left`, `right` and `addend`, doesn't seem to be practical.
> I don't like the approach of taking an `elementIndex`. That artificially limits the usage of the intrinsics and prevents you from extracting an arbitrary 64-bit sequence.
> There are separate intrinsics for extracting individual elements from a vector (#24588), and users wanting to work with float can just use the zero-cost reinterpret-cast APIs (`.AsInt32` and `.AsSingle`), which will force them to rationalize the denormal scenario and take it into consideration.
If a user wants to extract an arbitrary 8/16-byte sequence, why not convert both operands to `Vector64<byte>`/`Vector128<byte>` using `As<byte>()`?
Shouldn't we follow the same approach, i.e. have multiple overloads such as
The current design, however, when we match the sizes of left,right and addend doesn't seem to be practical.
It doesn't look like the underlying instruction encoding requires they all be the same size and so I would guess this is done because C++ doesn't have an easy way to convert from V128<T>
to V64<T>
.
Maybe @TamarChristinaArm has a better idea of why the split exists?
> If a user wants to extract an arbitrary 8/16-byte sequence, why not convert both operands to `Vector64<byte>`/`Vector128<byte>` using `As<byte>()`?
They could also do that, but the underlying instruction actually operates on byteIndex
and we have tended away from adding abstractions of the instructions so far.
They could also do that, but the underlying instruction actually operates on
byteIndex
and we have tended away from adding abstractions of the instructions so far.
Well, this is true on Arm64
for EXT <Vd>.<T>, <Vn>.<T>, <Vm>.<T>, #<index>
where
<index>
is indeed a byte index. By the way, <T>
can only be 8B
or 16B
that kind of suggests you are working on byte sequences.
However, on Arm32 VEXT (multibyte elements) VEXT.<size> {<Dd>,} <Dn>, <Dm>, #<imm>
is a pseudo-instruction that translates by assembler to VEXT (byte elements) VEXT.8 {<Dd>,} <Dn>, <Dm>, #<imm*(size/8)>
, i.e. #<imm>
is an element index.
Both instructions (VEXT
on ARM32 and EXT
on ARM64) operate identically. They are similar to orr
or other logical operations. That is, the instruction encoding only takes 8B
/16B
but it isn't doing something that is logically byte only and will be frequently used for non byte operations.
We can always create an issue and re-discuss ExtractVector64
/ExtractVector128
again on Tuesday, bringing up the C++ difference and whether having it operate on element by default is better (with requesting users to downcast if they want byte sequences instead).
Shouldn't we follow the same approach, i.e. have multiple overloads such as
> The current design, however, in which we match the sizes of `left`, `right` and `addend`, doesn't seem to be practical.
> It doesn't look like the underlying instruction encoding requires they all be the same size, and so I would guess this is done because C++ doesn't have an easy way to convert from `V128<T>` to `V64<T>`.
Maybe @TamarChristinaArm has a better idea of why the split exists?
For such intrinsics the split is always four ways; the location of the `q` in the name denotes which components are 128 bits.
So for, e.g., the `float` case we have
vfma_lane_f32
vfmaq_lane_f32
vfma_laneq_f32
vfmaq_laneq_f32
precisely because, as you said, the instruction doesn't require them all to be the same size.
This convention (partially) holds for newer ISAs such as MVE
[1] and SVE
[2] as well, though in those cases we also have completely overload driven instances as well. e.g. svmla_lane
for SVE will do the normal overloading you would expect in C++ (and in C using C11's _Generic
extension).
We do have a way to convert from V128<T>
to V64<T>
i.e. vget_low
but they're not zero cost abstractions. So we prefer to provide the overloads.
[1] https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
[2] https://static.docs.arm.com/100987/0000/acle_sve_100987_0000_00_en.pdf
The following APIs are still to be implemented:
namespace System.Runtime.Intrinsics.Arm
{
public static class AdvSimd
{
/// <summary>
/// Vector extract from pair of vectors
/// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
///
/// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
///
/// NOTE(review): the formula above is written in terms of left/right/index, but the parameters below are named
/// upper/lower/byteIndex - confirm which operand corresponds to which before finalizing the documentation.
/// NOTE(review): the Vector64 overloads omit float while the Vector128 overloads include float and double - confirm this is intentional.
///
/// Corresponds to vector forms of ARM64 EXT
/// </summary>
public static Vector64<byte> ExtractVector64(Vector64<byte> upper, Vector64<byte> lower, byte byteIndex);
public static Vector64<sbyte> ExtractVector64(Vector64<sbyte> upper, Vector64<sbyte> lower, byte byteIndex);
public static Vector64<short> ExtractVector64(Vector64<short> upper, Vector64<short> lower, byte byteIndex);
public static Vector64<ushort> ExtractVector64(Vector64<ushort> upper, Vector64<ushort> lower, byte byteIndex);
public static Vector64<int> ExtractVector64(Vector64<int> upper, Vector64<int> lower, byte byteIndex);
public static Vector64<uint> ExtractVector64(Vector64<uint> upper, Vector64<uint> lower, byte byteIndex);
public static Vector128<byte> ExtractVector128(Vector128<byte> upper, Vector128<byte> lower, byte byteIndex);
public static Vector128<sbyte> ExtractVector128(Vector128<sbyte> upper, Vector128<sbyte> lower, byte byteIndex);
public static Vector128<short> ExtractVector128(Vector128<short> upper, Vector128<short> lower, byte byteIndex);
public static Vector128<ushort> ExtractVector128(Vector128<ushort> upper, Vector128<ushort> lower, byte byteIndex);
public static Vector128<int> ExtractVector128(Vector128<int> upper, Vector128<int> lower, byte byteIndex);
public static Vector128<uint> ExtractVector128(Vector128<uint> upper, Vector128<uint> lower, byte byteIndex);
public static Vector128<long> ExtractVector128(Vector128<long> upper, Vector128<long> lower, byte byteIndex);
public static Vector128<ulong> ExtractVector128(Vector128<ulong> upper, Vector128<ulong> lower, byte byteIndex);
public static Vector128<float> ExtractVector128(Vector128<float> upper, Vector128<float> lower, byte byteIndex);
public static Vector128<double> ExtractVector128(Vector128<double> upper, Vector128<double> lower, byte byteIndex);
/// <summary>
/// Vector multiply add by element
///
/// For each element result[elem] = addend[elem] + left[elem] * right[rightIndex]
///
/// NOTE(review): confirm that MLA (by element) is available for 8-bit elements; the byte/sbyte overloads
/// may not map to a single instruction.
///
/// Corresponds to vector forms of ARM64 MLA
/// </summary>
public static Vector64<byte> MultiplyAddBySelectedScalar(Vector64<byte> addend, Vector64<byte> left, Vector64<byte> right, byte rightIndex);
public static Vector64<sbyte> MultiplyAddBySelectedScalar(Vector64<sbyte> addend, Vector64<sbyte> left, Vector64<sbyte> right, byte rightIndex);
public static Vector64<ushort> MultiplyAddBySelectedScalar(Vector64<ushort> addend, Vector64<ushort> left, Vector64<ushort> right, byte rightIndex);
public static Vector64<short> MultiplyAddBySelectedScalar(Vector64<short> addend, Vector64<short> left, Vector64<short> right, byte rightIndex);
public static Vector64<uint> MultiplyAddBySelectedScalar(Vector64<uint> addend, Vector64<uint> left, Vector64<uint> right, byte rightIndex);
public static Vector64<int> MultiplyAddBySelectedScalar(Vector64<int> addend, Vector64<int> left, Vector64<int> right, byte rightIndex);
public static Vector128<byte> MultiplyAddBySelectedScalar(Vector128<byte> addend, Vector128<byte> left, Vector128<byte> right, byte rightIndex);
public static Vector128<sbyte> MultiplyAddBySelectedScalar(Vector128<sbyte> addend, Vector128<sbyte> left, Vector128<sbyte> right, byte rightIndex);
public static Vector128<ushort> MultiplyAddBySelectedScalar(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
public static Vector128<short> MultiplyAddBySelectedScalar(Vector128<short> addend, Vector128<short> left, Vector128<short> right, byte rightIndex);
public static Vector128<uint> MultiplyAddBySelectedScalar(Vector128<uint> addend, Vector128<uint> left, Vector128<uint> right, byte rightIndex);
public static Vector128<int> MultiplyAddBySelectedScalar(Vector128<int> addend, Vector128<int> left, Vector128<int> right, byte rightIndex);
/// <summary>
/// Vector multiply subtract by element
///
/// For each element result[elem] = minuend[elem] - left[elem] * right[rightIndex]
///
/// NOTE(review): confirm that MLS (by element) is available for 8-bit elements; the byte/sbyte overloads
/// may not map to a single instruction.
///
/// Corresponds to vector forms of ARM64 MLS
/// </summary>
public static Vector64<byte> MultiplySubtractBySelectedScalar(Vector64<byte> minuend, Vector64<byte> left, Vector64<byte> right, byte rightIndex);
public static Vector64<sbyte> MultiplySubtractBySelectedScalar(Vector64<sbyte> minuend, Vector64<sbyte> left, Vector64<sbyte> right, byte rightIndex);
public static Vector64<ushort> MultiplySubtractBySelectedScalar(Vector64<ushort> minuend, Vector64<ushort> left, Vector64<ushort> right, byte rightIndex);
public static Vector64<short> MultiplySubtractBySelectedScalar(Vector64<short> minuend, Vector64<short> left, Vector64<short> right, byte rightIndex);
public static Vector64<uint> MultiplySubtractBySelectedScalar(Vector64<uint> minuend, Vector64<uint> left, Vector64<uint> right, byte rightIndex);
public static Vector64<int> MultiplySubtractBySelectedScalar(Vector64<int> minuend, Vector64<int> left, Vector64<int> right, byte rightIndex);
public static Vector128<byte> MultiplySubtractBySelectedScalar(Vector128<byte> minuend, Vector128<byte> left, Vector128<byte> right, byte rightIndex);
public static Vector128<sbyte> MultiplySubtractBySelectedScalar(Vector128<sbyte> minuend, Vector128<sbyte> left, Vector128<sbyte> right, byte rightIndex);
public static Vector128<ushort> MultiplySubtractBySelectedScalar(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
public static Vector128<short> MultiplySubtractBySelectedScalar(Vector128<short> minuend, Vector128<short> left, Vector128<short> right, byte rightIndex);
public static Vector128<uint> MultiplySubtractBySelectedScalar(Vector128<uint> minuend, Vector128<uint> left, Vector128<uint> right, byte rightIndex);
public static Vector128<int> MultiplySubtractBySelectedScalar(Vector128<int> minuend, Vector128<int> left, Vector128<int> right, byte rightIndex);
/// <summary>
/// Nested class holding the AdvSimd APIs listed here that are specific to ARM64.
/// </summary>
public static class Arm64
{
/// <summary>
/// Vector extract from pair of vectors
/// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
///
/// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
///
/// NOTE(review): the enclosing AdvSimd listing also declares ExtractVector128(Vector128<double>, ...) with
/// upper/lower/byteIndex parameter names - confirm which class should own the double overload and align the naming.
///
/// Corresponds to vector forms of ARM64 EXT
/// </summary>
public static Vector128<double> ExtractVector128(Vector128<double> left, Vector128<double> right, byte index);
/// <summary>
/// Vector multiply extend by element
///
/// For each element result[elem] = left[elem] * right[rightIndex]
/// The "extended" form gives special handling to zero and infinite operands; see the FMULX documentation for the exact results.
///
/// Corresponds to vector forms of ARM64 FMULX
/// </summary>
public static Vector64<float> MultiplyExtendedBySelectedScalar(Vector64<float> left, Vector64<float> right, byte rightIndex);
public static Vector128<float> MultiplyExtendedBySelectedScalar(Vector128<float> left, Vector128<float> right, byte rightIndex);
public static Vector128<double> MultiplyExtendedBySelectedScalar(Vector128<double> left, Vector128<double> right, byte rightIndex);
}
}
}
The following APIs still need to be investigated and brought back for review (I will open a new issue for them):
namespace System.Runtime.Intrinsics.Arm
{
public static class AdvSimd
{
/// <summary>
/// Vector reverse element bytes
///
/// Reverses the order of the bytes within each element of value.
/// NOTE(review): presumably REV16 serves the 16-bit overloads, REV32 the 32-bit ones and REV64 the 64-bit ones - confirm.
///
/// Corresponds to vector forms of ARM64 REV16, REV32, REV64
/// </summary>
public static Vector64<ushort> ReverseElementBytes(Vector64<ushort> value) { throw null; }
public static Vector64<short> ReverseElementBytes(Vector64<short> value) { throw null; }
public static Vector64<uint> ReverseElementBytes(Vector64<uint> value) { throw null; }
public static Vector64<int> ReverseElementBytes(Vector64<int> value) { throw null; }
public static Vector64<float> ReverseElementBytes(Vector64<float> value) { throw null; }
public static Vector128<ushort> ReverseElementBytes(Vector128<ushort> value) { throw null; }
public static Vector128<short> ReverseElementBytes(Vector128<short> value) { throw null; }
public static Vector128<uint> ReverseElementBytes(Vector128<uint> value) { throw null; }
public static Vector128<int> ReverseElementBytes(Vector128<int> value) { throw null; }
public static Vector128<ulong> ReverseElementBytes(Vector128<ulong> value) { throw null; }
public static Vector128<long> ReverseElementBytes(Vector128<long> value) { throw null; }
public static Vector128<float> ReverseElementBytes(Vector128<float> value) { throw null; }
}
}
The following APIs are still to be implemented:
@tannergooding Also Fused_MLA/MLS_BySelectedScalar
And we need to add/propose MultiplyBySelectedScalar - mul
also has by element form.
Also Fused_MLA/MLS_BySelectedScalar
And we need to add/propose MultiplyBySelectedScalar - mul also has by element form.
The former haven't been proposed yet either (I don't see them listed anywhere above). I'm adding them to https://github.com/dotnet/runtime/issues/33683
Most helpful comment
I can modify the original comment on any of these and you shouldn't need to worry about it 😄
I can also handle ensuring this gets a slot on the API review schedule, etc.