Runtime: API Proposal: More SIMD HW Intrinsics

Created on 25 Jan 2018 · 77 Comments · Source: dotnet/runtime

@tannergooding: Updated to match https://github.com/dotnet/corefx/issues/26581#issuecomment-539217015. The previous version is available in the comment history.

```C#
namespace System.Runtime.Intrinsics.Arm
{
public static class AdvSimd
{
/// <summary>
/// Availability flag for the intrinsics declared on this class (proposal stub: the getter is a placeholder that throws)
/// </summary>
public static bool IsSupported => throw null;

    /// <summary>
    /// Vector absolute compare greater than or equal
    ///
    /// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
    ///
    /// Corresponds to vector forms of ARM64 FACGE
    /// </summary>
    public static Vector64<float>  AbsoluteCompareGreaterThanOrEqual(Vector64<float>  left, Vector64<float>  right) => throw null;
    public static Vector128<float> AbsoluteCompareGreaterThanOrEqual(Vector128<float> left, Vector128<float> right) => throw null;

    /// <summary>
    /// Vector absolute compare greater than
    ///
    /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
    ///
    /// Corresponds to vector forms of ARM64 FACGT
    /// </summary>
    public static Vector64<float>  AbsoluteCompareGreaterThan(Vector64<float>  left, Vector64<float>  right) => throw null;
    public static Vector128<float> AbsoluteCompareGreaterThan(Vector128<float> left, Vector128<float> right) => throw null;

    /// <summary>
    /// Vector absolute difference
    ///
    /// Signed integer inputs produce a result of the same-width unsigned element type.
    ///
    /// Corresponds to vector forms of ARM64 SABD, UABD and FABD
    /// </summary>
    public static Vector64<byte>    AbsoluteDifference(Vector64<byte>    left, Vector64<byte>    right) => throw null;
    public static Vector64<byte>    AbsoluteDifference(Vector64<sbyte>   left, Vector64<sbyte>   right) => throw null;
    public static Vector64<ushort>  AbsoluteDifference(Vector64<ushort>  left, Vector64<ushort>  right) => throw null;
    public static Vector64<ushort>  AbsoluteDifference(Vector64<short>   left, Vector64<short>   right) => throw null;
    public static Vector64<uint>    AbsoluteDifference(Vector64<uint>    left, Vector64<uint>    right) => throw null;
    public static Vector64<uint>    AbsoluteDifference(Vector64<int>     left, Vector64<int>     right) => throw null;
    public static Vector64<float>   AbsoluteDifference(Vector64<float>   left, Vector64<float>   right) => throw null;
    public static Vector128<byte>   AbsoluteDifference(Vector128<byte>   left, Vector128<byte>   right) => throw null;
    public static Vector128<byte>   AbsoluteDifference(Vector128<sbyte>  left, Vector128<sbyte>  right) => throw null;
    public static Vector128<ushort> AbsoluteDifference(Vector128<ushort> left, Vector128<ushort> right) => throw null;
    public static Vector128<ushort> AbsoluteDifference(Vector128<short>  left, Vector128<short>  right) => throw null;
    public static Vector128<uint>   AbsoluteDifference(Vector128<uint>   left, Vector128<uint>   right) => throw null;
    public static Vector128<uint>   AbsoluteDifference(Vector128<int>    left, Vector128<int>    right) => throw null;
    public static Vector128<float>  AbsoluteDifference(Vector128<float>  left, Vector128<float>  right) => throw null;

    /// <summary>
    /// Vector absolute difference and accumulate
    ///
    /// For each element result[elem] = acc[elem] + | left[elem] - right[elem] |
    ///
    /// Corresponds to vector forms of ARM64 SABA, UABA
    /// </summary>
    // NOTE(review): the signed overloads take a signed accumulator but return the unsigned element type — confirm this is intended.
    public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<byte>    acc, Vector64<byte>    left, Vector64<byte>    right) => throw null;
    public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<sbyte>   acc, Vector64<sbyte>   left, Vector64<sbyte>   right) => throw null;
    public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<ushort>  acc, Vector64<ushort>  left, Vector64<ushort>  right) => throw null;
    public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<short>   acc, Vector64<short>   left, Vector64<short>   right) => throw null;
    public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<uint>    acc, Vector64<uint>    left, Vector64<uint>    right) => throw null;
    public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<int>     acc, Vector64<int>     left, Vector64<int>     right) => throw null;
    public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<byte>   acc, Vector128<byte>   left, Vector128<byte>   right) => throw null;
    public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<sbyte>  acc, Vector128<sbyte>  left, Vector128<sbyte>  right) => throw null;
    public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) => throw null;
    public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<short>  acc, Vector128<short>  left, Vector128<short>  right) => throw null;
    public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<uint>   acc, Vector128<uint>   left, Vector128<uint>   right) => throw null;
    public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<int>    acc, Vector128<int>    left, Vector128<int>    right) => throw null;

    /// <summary>
    /// Vector add pairwise
    ///
    /// For each element result[elem] = 2*elem &lt; result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
    ///
    /// Corresponds to vector forms of ARM64 ADDP and FADDP
    /// </summary>
    // Declared as plain overloads: a built-in type keyword cannot be used as a generic type parameter name.
    public static Vector64<byte>   AddPairwise(Vector64<byte>   left, Vector64<byte>   right) { throw null; }
    public static Vector64<sbyte>  AddPairwise(Vector64<sbyte>  left, Vector64<sbyte>  right) { throw null; }
    public static Vector64<ushort> AddPairwise(Vector64<ushort> left, Vector64<ushort> right) { throw null; }
    public static Vector64<short>  AddPairwise(Vector64<short>  left, Vector64<short>  right) { throw null; }
    public static Vector64<int>    AddPairwise(Vector64<int>    left, Vector64<int>    right) { throw null; }
    public static Vector64<uint>   AddPairwise(Vector64<uint>   left, Vector64<uint>   right) { throw null; }
    public static Vector64<float>  AddPairwise(Vector64<float>  left, Vector64<float>  right) { throw null; }

    /// <summary>
    /// Vector extract from pair of vectors
    ///
    /// For each byte result[byte] = byte + index &lt; result.Length ? left[byte + index] : right[byte + index - result.Length]
    ///
    /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
    ///
    /// Corresponds to vector forms of ARM64 EXT
    /// </summary>
    // Declared as plain overloads: a built-in type keyword cannot be used as a generic type parameter name.
    public static Vector64<byte>    ExtractVector(Vector64<byte>    left, Vector64<byte>    right, byte index) { throw null; }
    public static Vector64<sbyte>   ExtractVector(Vector64<sbyte>   left, Vector64<sbyte>   right, byte index) { throw null; }
    public static Vector64<short>   ExtractVector(Vector64<short>   left, Vector64<short>   right, byte index) { throw null; }
    public static Vector64<ushort>  ExtractVector(Vector64<ushort>  left, Vector64<ushort>  right, byte index) { throw null; }
    public static Vector64<int>     ExtractVector(Vector64<int>     left, Vector64<int>     right, byte index) { throw null; }
    public static Vector64<uint>    ExtractVector(Vector64<uint>    left, Vector64<uint>    right, byte index) { throw null; }

    public static Vector128<byte>   ExtractVector(Vector128<byte>   left, Vector128<byte>   right, byte index) { throw null; }
    public static Vector128<sbyte>  ExtractVector(Vector128<sbyte>  left, Vector128<sbyte>  right, byte index) { throw null; }
    public static Vector128<short>  ExtractVector(Vector128<short>  left, Vector128<short>  right, byte index) { throw null; }
    public static Vector128<ushort> ExtractVector(Vector128<ushort> left, Vector128<ushort> right, byte index) { throw null; }
    public static Vector128<int>    ExtractVector(Vector128<int>    left, Vector128<int>    right, byte index) { throw null; }
    public static Vector128<uint>   ExtractVector(Vector128<uint>   left, Vector128<uint>   right, byte index) { throw null; }
    public static Vector128<long>   ExtractVector(Vector128<long>   left, Vector128<long>   right, byte index) { throw null; }
    public static Vector128<ulong>  ExtractVector(Vector128<ulong>  left, Vector128<ulong>  right, byte index) { throw null; }
    // This overload was previously tagged <double> although it operates on float vectors.
    public static Vector128<float>  ExtractVector(Vector128<float>  left, Vector128<float>  right, byte index) { throw null; }

    /// <summary>
    /// Vector maximum number
    ///
    /// Corresponds to vector forms of ARM64 FMAXNM
    /// </summary>
    public static Vector64<float>  MaxNumeric(Vector64<float>  left, Vector64<float>  right) => throw null;
    public static Vector128<float> MaxNumeric(Vector128<float> left, Vector128<float> right) => throw null;

    /// <summary>
    /// Vector maximum pairwise
    ///
    /// For each element result[elem] = 2*elem &lt; result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
    ///
    /// Corresponds to vector forms of ARM64 SMAXP, UMAXP and FMAXP
    /// </summary>
    public static Vector64<byte>   MaxPairwise(Vector64<byte>   left, Vector64<byte>   right) => throw null;
    public static Vector64<sbyte>  MaxPairwise(Vector64<sbyte>  left, Vector64<sbyte>  right) => throw null;
    public static Vector64<ushort> MaxPairwise(Vector64<ushort> left, Vector64<ushort> right) => throw null;
    public static Vector64<short>  MaxPairwise(Vector64<short>  left, Vector64<short>  right) => throw null;
    public static Vector64<uint>   MaxPairwise(Vector64<uint>   left, Vector64<uint>   right) => throw null;
    public static Vector64<int>    MaxPairwise(Vector64<int>    left, Vector64<int>    right) => throw null;
    public static Vector64<float>  MaxPairwise(Vector64<float>  left, Vector64<float>  right) => throw null;

    /// <summary>
    /// Vector minimum number
    ///
    /// Corresponds to vector forms of ARM64 FMINNM
    /// </summary>
    public static Vector64<float>  MinNumeric(Vector64<float>  left, Vector64<float>  right) => throw null;
    public static Vector128<float> MinNumeric(Vector128<float> left, Vector128<float> right) => throw null;

    /// <summary>
    /// Vector minimum pairwise
    ///
    /// For each element result[elem] = 2*elem &lt; result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
    ///
    /// Corresponds to vector forms of ARM64 SMINP, UMINP and FMINP
    /// </summary>
    public static Vector64<byte>   MinPairwise(Vector64<byte>   left, Vector64<byte>   right) => throw null;
    public static Vector64<sbyte>  MinPairwise(Vector64<sbyte>  left, Vector64<sbyte>  right) => throw null;
    public static Vector64<ushort> MinPairwise(Vector64<ushort> left, Vector64<ushort> right) => throw null;
    public static Vector64<short>  MinPairwise(Vector64<short>  left, Vector64<short>  right) => throw null;
    public static Vector64<uint>   MinPairwise(Vector64<uint>   left, Vector64<uint>   right) => throw null;
    public static Vector64<int>    MinPairwise(Vector64<int>    left, Vector64<int>    right) => throw null;
    public static Vector64<float>  MinPairwise(Vector64<float>  left, Vector64<float>  right) => throw null;

    /// <summary>
    /// Vector multiply-accumulate
    ///
    /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
    ///
    /// Corresponds to vector forms of ARM64 MLA
    /// </summary>
    public static Vector64<byte>    MultiplyAdd(Vector64<byte>    acc, Vector64<byte>    left, Vector64<byte>    right) => throw null;
    public static Vector64<sbyte>   MultiplyAdd(Vector64<sbyte>   acc, Vector64<sbyte>   left, Vector64<sbyte>   right) => throw null;
    public static Vector64<ushort>  MultiplyAdd(Vector64<ushort>  acc, Vector64<ushort>  left, Vector64<ushort>  right) => throw null;
    public static Vector64<short>   MultiplyAdd(Vector64<short>   acc, Vector64<short>   left, Vector64<short>   right) => throw null;
    public static Vector64<uint>    MultiplyAdd(Vector64<uint>    acc, Vector64<uint>    left, Vector64<uint>    right) => throw null;
    public static Vector64<int>     MultiplyAdd(Vector64<int>     acc, Vector64<int>     left, Vector64<int>     right) => throw null;
    public static Vector128<byte>   MultiplyAdd(Vector128<byte>   acc, Vector128<byte>   left, Vector128<byte>   right) => throw null;
    public static Vector128<sbyte>  MultiplyAdd(Vector128<sbyte>  acc, Vector128<sbyte>  left, Vector128<sbyte>  right) => throw null;
    public static Vector128<ushort> MultiplyAdd(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) => throw null;
    public static Vector128<short>  MultiplyAdd(Vector128<short>  acc, Vector128<short>  left, Vector128<short>  right) => throw null;
    public static Vector128<uint>   MultiplyAdd(Vector128<uint>   acc, Vector128<uint>   left, Vector128<uint>   right) => throw null;
    public static Vector128<int>    MultiplyAdd(Vector128<int>    acc, Vector128<int>    left, Vector128<int>    right) => throw null;

    /// <summary>
    /// Vector multiply-accumulate by scalar
    ///
    /// For each element result[elem] = acc[elem] + left[elem] * right
    ///
    /// Corresponds to vector forms of ARM64 MLA
    /// </summary>
    public static Vector64<byte>    MultiplyAdd(Vector64<byte>    acc, Vector64<byte>    left, byte   right) => throw null;
    public static Vector64<sbyte>   MultiplyAdd(Vector64<sbyte>   acc, Vector64<sbyte>   left, sbyte  right) => throw null;
    public static Vector64<ushort>  MultiplyAdd(Vector64<ushort>  acc, Vector64<ushort>  left, ushort right) => throw null;
    public static Vector64<short>   MultiplyAdd(Vector64<short>   acc, Vector64<short>   left, short  right) => throw null;
    public static Vector64<uint>    MultiplyAdd(Vector64<uint>    acc, Vector64<uint>    left, uint   right) => throw null;
    public static Vector64<int>     MultiplyAdd(Vector64<int>     acc, Vector64<int>     left, int    right) => throw null;
    public static Vector128<byte>   MultiplyAdd(Vector128<byte>   acc, Vector128<byte>   left, byte   right) => throw null;
    public static Vector128<sbyte>  MultiplyAdd(Vector128<sbyte>  acc, Vector128<sbyte>  left, sbyte  right) => throw null;
    public static Vector128<ushort> MultiplyAdd(Vector128<ushort> acc, Vector128<ushort> left, ushort right) => throw null;
    public static Vector128<short>  MultiplyAdd(Vector128<short>  acc, Vector128<short>  left, short  right) => throw null;
    public static Vector128<uint>   MultiplyAdd(Vector128<uint>   acc, Vector128<uint>   left, uint   right) => throw null;
    public static Vector128<int>    MultiplyAdd(Vector128<int>    acc, Vector128<int>    left, int    right) => throw null;

    /// <summary>
    /// Vector multiply-subtract from accumulator
    ///
    /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
    ///
    /// Corresponds to vector forms of ARM64 MLS
    /// </summary>
    public static Vector64<byte>    MultiplySubtract(Vector64<byte>    acc, Vector64<byte>    left, Vector64<byte>    right) => throw null;
    public static Vector64<sbyte>   MultiplySubtract(Vector64<sbyte>   acc, Vector64<sbyte>   left, Vector64<sbyte>   right) => throw null;
    public static Vector64<ushort>  MultiplySubtract(Vector64<ushort>  acc, Vector64<ushort>  left, Vector64<ushort>  right) => throw null;
    public static Vector64<short>   MultiplySubtract(Vector64<short>   acc, Vector64<short>   left, Vector64<short>   right) => throw null;
    public static Vector64<uint>    MultiplySubtract(Vector64<uint>    acc, Vector64<uint>    left, Vector64<uint>    right) => throw null;
    public static Vector64<int>     MultiplySubtract(Vector64<int>     acc, Vector64<int>     left, Vector64<int>     right) => throw null;
    public static Vector128<byte>   MultiplySubtract(Vector128<byte>   acc, Vector128<byte>   left, Vector128<byte>   right) => throw null;
    public static Vector128<sbyte>  MultiplySubtract(Vector128<sbyte>  acc, Vector128<sbyte>  left, Vector128<sbyte>  right) => throw null;
    public static Vector128<ushort> MultiplySubtract(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) => throw null;
    public static Vector128<short>  MultiplySubtract(Vector128<short>  acc, Vector128<short>  left, Vector128<short>  right) => throw null;
    public static Vector128<uint>   MultiplySubtract(Vector128<uint>   acc, Vector128<uint>   left, Vector128<uint>   right) => throw null;
    public static Vector128<int>    MultiplySubtract(Vector128<int>    acc, Vector128<int>    left, Vector128<int>    right) => throw null;

    /// <summary>
    /// Vector multiply-subtract from accumulator by scalar
    ///
    /// For each element result[elem] = acc[elem] - left[elem] * right
    ///
    /// Corresponds to vector forms of ARM64 MLS
    /// </summary>
    public static Vector64<byte>    MultiplySubtract(Vector64<byte>    acc, Vector64<byte>    left, byte   right) => throw null;
    public static Vector64<sbyte>   MultiplySubtract(Vector64<sbyte>   acc, Vector64<sbyte>   left, sbyte  right) => throw null;
    public static Vector64<ushort>  MultiplySubtract(Vector64<ushort>  acc, Vector64<ushort>  left, ushort right) => throw null;
    public static Vector64<short>   MultiplySubtract(Vector64<short>   acc, Vector64<short>   left, short  right) => throw null;
    public static Vector64<uint>    MultiplySubtract(Vector64<uint>    acc, Vector64<uint>    left, uint   right) => throw null;
    public static Vector64<int>     MultiplySubtract(Vector64<int>     acc, Vector64<int>     left, int    right) => throw null;
    public static Vector128<byte>   MultiplySubtract(Vector128<byte>   acc, Vector128<byte>   left, byte   right) => throw null;
    public static Vector128<sbyte>  MultiplySubtract(Vector128<sbyte>  acc, Vector128<sbyte>  left, sbyte  right) => throw null;
    public static Vector128<ushort> MultiplySubtract(Vector128<ushort> acc, Vector128<ushort> left, ushort right) => throw null;
    public static Vector128<short>  MultiplySubtract(Vector128<short>  acc, Vector128<short>  left, short  right) => throw null;
    public static Vector128<uint>   MultiplySubtract(Vector128<uint>   acc, Vector128<uint>   left, uint   right) => throw null;
    public static Vector128<int>    MultiplySubtract(Vector128<int>    acc, Vector128<int>    left, int    right) => throw null;

    /// <summary>
    /// Vector fused multiply-add to accumulator
    ///
    /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
    ///
    /// Corresponds to vector forms of ARM64 FMLA
    /// </summary>
    public static Vector64<float>  FusedMultiplyAdd(Vector64<float>  acc, Vector64<float>  left, Vector64<float>  right) => throw null;
    public static Vector128<float> FusedMultiplyAdd(Vector128<float> acc, Vector128<float> left, Vector128<float> right) => throw null;

    /// <summary>
    /// Vector fused multiply-subtract from accumulator
    ///
    /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
    ///
    /// Corresponds to vector forms of ARM64 FMLS
    /// </summary>
    public static Vector64<float>  FusedMultiplySubtract(Vector64<float>  acc, Vector64<float>  left, Vector64<float>  right) => throw null;
    public static Vector128<float> FusedMultiplySubtract(Vector128<float> acc, Vector128<float> left, Vector128<float> right) => throw null;


    /// <summary>
    /// Vector polynomial multiply
    ///
    /// Corresponds to vector forms of ARM64 PMUL
    /// </summary>
    public static Vector64<byte>   PolynomialMultiply(Vector64<byte>   left, Vector64<byte>   right) => throw null;
    public static Vector64<sbyte>  PolynomialMultiply(Vector64<sbyte>  left, Vector64<sbyte>  right) => throw null;
    public static Vector128<byte>  PolynomialMultiply(Vector128<byte>  left, Vector128<byte>  right) => throw null;
    public static Vector128<sbyte> PolynomialMultiply(Vector128<sbyte> left, Vector128<sbyte> right) => throw null;

    /// <summary>
    /// Vector reciprocal estimate
    ///
    /// See FRECPE docs
    ///
    /// Corresponds to vector forms of ARM64 FRECPE
    /// </summary>
    public static Vector64<float>  ReciprocalEstimate(Vector64<float>  value) => throw null;
    public static Vector128<float> ReciprocalEstimate(Vector128<float> value) => throw null;

    /// <summary>
    /// Vector reciprocal step
    ///
    /// See FRECPS docs
    ///
    /// Corresponds to vector forms of ARM64 FRECPS
    /// </summary>
    // NOTE(review): the vector form of FRECPS takes two vector operands and no immediate — confirm whether `index` belongs here.
    public static Vector64<float>  ReciprocalStep(Vector64<float>  left, Vector64<float>  right, byte index) => throw null;
    public static Vector128<float> ReciprocalStep(Vector128<float> left, Vector128<float> right, byte index) => throw null;

    /// <summary>
    /// Vector reciprocal square root estimate
    ///
    /// See FRSQRTE docs
    ///
    /// Corresponds to vector forms of ARM64 FRSQRTE
    /// </summary>
    public static Vector64<float>  ReciprocalSquareRootEstimate(Vector64<float>  value) => throw null;
    public static Vector128<float> ReciprocalSquareRootEstimate(Vector128<float> value) => throw null;

    /// <summary>
    /// Vector reciprocal square root step
    ///
    /// See FRSQRTS docs
    ///
    /// Corresponds to vector forms of ARM64 FRSQRTS
    /// </summary>
    // Named "Step" to match the FRSQRTS operation documented above; the previous name
    // (ReciprocalSquareRootEstimate) duplicated the FRSQRTE intrinsic's name.
    // NOTE(review): the vector form of FRSQRTS takes two vector operands and no immediate — confirm whether `index` belongs here.
    public static Vector64<float>   ReciprocalSquareRootStep(Vector64<float>   left, Vector64<float>   right, byte index) { throw null; }
    public static Vector128<float>  ReciprocalSquareRootStep(Vector128<float>  left, Vector128<float>  right, byte index) { throw null; }

    /// <summary>
    /// Vector reverse bytes within each element
    ///
    /// Corresponds to vector forms of ARM64 REV16, REV32, REV64
    /// </summary>
    public static Vector64<ushort>  ReverseElementBytes(Vector64<ushort>  value) => throw null;
    public static Vector64<short>   ReverseElementBytes(Vector64<short>   value) => throw null;
    public static Vector64<uint>    ReverseElementBytes(Vector64<uint>    value) => throw null;
    public static Vector64<int>     ReverseElementBytes(Vector64<int>     value) => throw null;
    public static Vector64<float>   ReverseElementBytes(Vector64<float>   value) => throw null;
    public static Vector128<ushort> ReverseElementBytes(Vector128<ushort> value) => throw null;
    public static Vector128<short>  ReverseElementBytes(Vector128<short>  value) => throw null;
    public static Vector128<uint>   ReverseElementBytes(Vector128<uint>   value) => throw null;
    public static Vector128<int>    ReverseElementBytes(Vector128<int>    value) => throw null;
    public static Vector128<ulong>  ReverseElementBytes(Vector128<ulong>  value) => throw null;
    public static Vector128<long>   ReverseElementBytes(Vector128<long>   value) => throw null;
    public static Vector128<float>  ReverseElementBytes(Vector128<float>  value) => throw null;

    /// <summary>
    /// Floating-point multiply-accumulate intrinsics mapped to ARM32 VMLA / VMLS (proposal stubs)
    /// </summary>
    public static class Arm32
    {
        public static bool IsSupported => throw null;

        /// <summary>
        /// Vector multiply-accumulate
        ///
        /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM32 VMLA
        /// </summary>
        public static Vector64<float>  MultiplyAdd(Vector64<float>  acc, Vector64<float>  left, Vector64<float>  right) => throw null;
        public static Vector128<float> MultiplyAdd(Vector128<float> acc, Vector128<float> left, Vector128<float> right) => throw null;

        /// <summary>
        /// Vector multiply-accumulate by scalar
        ///
        /// For each element result[elem] = acc[elem] + left[elem] * right
        ///
        /// Corresponds to vector forms of ARM32 VMLA
        /// </summary>
        public static Vector64<float>  MultiplyAdd(Vector64<float>  acc, Vector64<float>  left, float right) => throw null;
        public static Vector128<float> MultiplyAdd(Vector128<float> acc, Vector128<float> left, float right) => throw null;

        /// <summary>
        /// Vector multiply-subtract from accumulator
        ///
        /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM32 VMLS
        /// </summary>
        public static Vector64<float>  MultiplySubtract(Vector64<float>  acc, Vector64<float>  left, Vector64<float>  right) => throw null;
        public static Vector128<float> MultiplySubtract(Vector128<float> acc, Vector128<float> left, Vector128<float> right) => throw null;

        /// <summary>
        /// Vector multiply-subtract from accumulator by scalar
        ///
        /// For each element result[elem] = acc[elem] - left[elem] * right
        ///
        /// Corresponds to vector forms of ARM32 VMLS
        /// </summary>
        public static Vector64<float>  MultiplySubtract(Vector64<float>  acc, Vector64<float>  left, float right) => throw null;
        public static Vector128<float> MultiplySubtract(Vector128<float> acc, Vector128<float> left, float right) => throw null;
    }

    public static class Arm64
    {
        /// <summary>
        /// Availability flag for the intrinsics declared on this class (proposal stub: the getter is a placeholder that throws)
        /// </summary>
        public static bool IsSupported => throw null;

        /// <summary>
        /// Vector absolute compare greater than or equal
        ///
        /// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
        ///
        /// Corresponds to vector forms of ARM64 FACGE
        /// </summary>
        public static Vector128<double> AbsoluteCompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right) => throw null;

        /// <summary>
        /// Vector absolute compare greater than
        ///
        /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
        ///
        /// Corresponds to vector forms of ARM64 FACGT
        /// </summary>
        public static Vector128<double> AbsoluteCompareGreaterThan(Vector128<double> left, Vector128<double> right) => throw null;

        /// <summary>
        /// Vector absolute difference (double-precision)
        ///
        /// Corresponds to the vector form of ARM64 FABD
        /// </summary>
        public static Vector128<double> AbsoluteDifference(Vector128<double> left, Vector128<double> right) => throw null;

        /// <summary>
        /// Vector add pairwise
        ///
        /// For each element result[elem] = 2*elem &lt; result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 ADDP and FADDP
        /// </summary>
        // Each overload returns the same vector type it accepts; there is no type parameter T in scope.
        // NOTE(review): no Vector128<int> / Vector128<uint> overloads are listed here — confirm whether that omission is intended.
        public static Vector128<byte>   AddPairwise(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<sbyte>  AddPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
        public static Vector128<ushort> AddPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
        public static Vector128<short>  AddPairwise(Vector128<short>  left, Vector128<short>  right) { throw null; }
        public static Vector128<long>   AddPairwise(Vector128<long>   left, Vector128<long>   right) { throw null; }
        public static Vector128<ulong>  AddPairwise(Vector128<ulong>  left, Vector128<ulong>  right) { throw null; }
        public static Vector128<float>  AddPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
        public static Vector128<double> AddPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

        /// <summary>
        /// Vector extract from pair of vectors
        ///
        /// For each byte result[byte] = byte + index &lt; result.Length ? left[byte + index] : right[byte + index - result.Length]
        ///
        /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
        ///
        /// Corresponds to vector forms of ARM64 EXT
        /// </summary>
        // Declared as a plain overload: a built-in type keyword cannot be used as a generic type parameter name.
        public static Vector128<double> ExtractVector(Vector128<double> left, Vector128<double> right, byte index) { throw null; }

        /// <summary>
        /// Vector add across all elements, producing a scalar of the element type
        ///
        /// Corresponds to vector forms of ARM64 ADDV
        /// </summary>
        public static byte   AddAcross(Vector64<byte>    value) => throw null;
        public static sbyte  AddAcross(Vector64<sbyte>   value) => throw null;
        public static ushort AddAcross(Vector64<ushort>  value) => throw null;
        public static short  AddAcross(Vector64<short>   value) => throw null;
        public static uint   AddAcross(Vector64<uint>    value) => throw null;
        public static int    AddAcross(Vector64<int>     value) => throw null;
        public static byte   AddAcross(Vector128<byte>   value) => throw null;
        public static sbyte  AddAcross(Vector128<sbyte>  value) => throw null;
        public static ushort AddAcross(Vector128<ushort> value) => throw null;
        public static short  AddAcross(Vector128<short>  value) => throw null;
        public static uint   AddAcross(Vector128<uint>   value) => throw null;
        public static int    AddAcross(Vector128<int>    value) => throw null;

        /// <summary>
        /// Vector maximum number (double-precision)
        ///
        /// Corresponds to vector forms of ARM64 FMAXNM
        /// </summary>
        public static Vector128<double> MaxNumeric(Vector128<double> left, Vector128<double> right) => throw null;

        /// <summary>
        /// Vector maximum number pairwise
        ///
        /// For each element result[elem] = 2*elem &lt; result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 FMAXNMP
        /// </summary>
        public static Vector64<float>   MaxNumericPairwise(Vector64<float>   left, Vector64<float>   right) => throw null;
        public static Vector128<float>  MaxNumericPairwise(Vector128<float>  left, Vector128<float>  right) => throw null;
        public static Vector128<double> MaxNumericPairwise(Vector128<double> left, Vector128<double> right) => throw null;

        /// <summary>
        /// Vector maximum number across all elements
        ///
        /// result = max(value[0], ... , value[length - 1])
        ///
        /// Corresponds to vector forms of ARM64 FMAXNMV
        /// </summary>
        public static float MaxNumericAcross(Vector128<float> value) => throw null;

        /// <summary>
        /// Vector maximum pairwise
        ///
        /// For each element result[elem] = 2*elem &lt; result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 SMAXP, UMAXP and FMAXP
        /// </summary>
        public static Vector128<byte>   MaxPairwise(Vector128<byte>   left, Vector128<byte>   right) => throw null;
        public static Vector128<sbyte>  MaxPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right) => throw null;
        public static Vector128<ushort> MaxPairwise(Vector128<ushort> left, Vector128<ushort> right) => throw null;
        public static Vector128<short>  MaxPairwise(Vector128<short>  left, Vector128<short>  right) => throw null;
        public static Vector128<uint>   MaxPairwise(Vector128<uint>   left, Vector128<uint>   right) => throw null;
        public static Vector128<int>    MaxPairwise(Vector128<int>    left, Vector128<int>    right) => throw null;
        public static Vector128<float>  MaxPairwise(Vector128<float>  left, Vector128<float>  right) => throw null;
        public static Vector128<double> MaxPairwise(Vector128<double> left, Vector128<double> right) => throw null;

        /// <summary>
        /// Vector max across
        ///
        /// result = max(value[0], ... , value[length - 1])
        ///
        /// Reduces every element of the input vector to a single scalar of the element type.
        ///
        /// NOTE(review): the Vector128 ulong/long overloads below have no MinAcross counterpart
        /// in this listing; presumably one of the two lists is wrong - confirm against the
        /// SMAXV/UMAXV element sizes.
        ///
        /// Corresponds to vector forms of ARM64 SMAXV, UMAXV & FMAXV
        /// </summary>
        public static byte   MaxAcross(Vector64<byte>    value) { throw null; }
        public static sbyte  MaxAcross(Vector64<sbyte>   value) { throw null; }
        public static ushort MaxAcross(Vector64<ushort>  value) { throw null; }
        public static short  MaxAcross(Vector64<short>   value) { throw null; }
        public static uint   MaxAcross(Vector64<uint>    value) { throw null; }
        public static int    MaxAcross(Vector64<int>     value) { throw null; }
        public static float  MaxAcross(Vector64<float>   value) { throw null; }
        public static byte   MaxAcross(Vector128<byte>   value) { throw null; }
        public static sbyte  MaxAcross(Vector128<sbyte>  value) { throw null; }
        public static ushort MaxAcross(Vector128<ushort> value) { throw null; }
        public static short  MaxAcross(Vector128<short>  value) { throw null; }
        public static uint   MaxAcross(Vector128<uint>   value) { throw null; }
        public static int    MaxAcross(Vector128<int>    value) { throw null; }
        public static ulong  MaxAcross(Vector128<ulong>  value) { throw null; }
        public static long   MaxAcross(Vector128<long>   value) { throw null; }
        public static float  MaxAcross(Vector128<float>  value) { throw null; }
        public static double MaxAcross(Vector128<double> value) { throw null; }

        /// <summary>
        /// Vector min numeric
        ///
        /// Takes two vectors and returns a vector of the same type; see FMINNM docs
        /// for the exact per-element semantics.
        ///
        /// Corresponds to vector forms of ARM64 FMINNM
        /// </summary>
        public static Vector128<double> MinNumeric(Vector128<double> left, Vector128<double> right) { throw null; }

        /// <summary>
        /// Vector min numeric pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// That is, the low half of the result holds the minima of adjacent pairs of left,
        /// and the high half holds the minima of adjacent pairs of right.
        ///
        /// Corresponds to vector forms of ARM64 FMINNMP
        /// </summary>
        public static Vector64<float>   MinNumericPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  MinNumericPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
        public static Vector128<double> MinNumericPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

        /// <summary>
        /// Vector min numeric across
        ///
        /// result = min(value[0], ... , value[length - 1])
        ///
        /// Reduces every element of the input vector to a single scalar result.
        ///
        /// Corresponds to vector forms of ARM64 FMINNMV
        /// </summary>
        public static float  MinNumericAcross(Vector128<float>  value) { throw null; }

        /// <summary>
        /// Vector min pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// That is, the low half of the result holds the minima of adjacent pairs of left,
        /// and the high half holds the minima of adjacent pairs of right.
        ///
        /// Corresponds to vector forms of ARM64 SMINP, UMINP & FMINP
        /// </summary>
        public static Vector128<byte>   MinPairwise(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<sbyte>  MinPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
        public static Vector128<ushort> MinPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
        public static Vector128<short>  MinPairwise(Vector128<short>  left, Vector128<short>  right) { throw null; }
        public static Vector128<uint>   MinPairwise(Vector128<uint>   left, Vector128<uint>   right) { throw null; }
        public static Vector128<int>    MinPairwise(Vector128<int>    left, Vector128<int>    right) { throw null; }
        public static Vector128<float>  MinPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
        public static Vector128<double> MinPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

        /// <summary>
        /// Vector min across
        ///
        /// result = min(value[0], ... , value[length - 1])
        ///
        /// Reduces every element of the input vector to a single scalar of the element type.
        ///
        /// Corresponds to vector forms of ARM64 SMINV, UMINV & FMINV
        /// </summary>
        public static byte   MinAcross(Vector64<byte>    value) { throw null; }
        public static sbyte  MinAcross(Vector64<sbyte>   value) { throw null; }
        public static ushort MinAcross(Vector64<ushort>  value) { throw null; }
        public static short  MinAcross(Vector64<short>   value) { throw null; }
        public static uint   MinAcross(Vector64<uint>    value) { throw null; }
        public static int    MinAcross(Vector64<int>     value) { throw null; }
        public static float  MinAcross(Vector64<float>   value) { throw null; }
        public static byte   MinAcross(Vector128<byte>   value) { throw null; }
        public static sbyte  MinAcross(Vector128<sbyte>  value) { throw null; }
        public static ushort MinAcross(Vector128<ushort> value) { throw null; }
        public static short  MinAcross(Vector128<short>  value) { throw null; }
        public static uint   MinAcross(Vector128<uint>   value) { throw null; }
        public static int    MinAcross(Vector128<int>    value) { throw null; }
        public static float  MinAcross(Vector128<float>  value) { throw null; }
        public static double MinAcross(Vector128<double> value) { throw null; }

        /// <summary>
        /// Vector fused multiply add
        ///
        /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
        ///
        /// acc is the accumulator; left and right are the two vector multiplicands.
        ///
        /// Corresponds to vector forms of ARM64 FMLA
        /// </summary>
        public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, Vector128<double> right) { throw null; }

        /// <summary>
        /// Vector fused multiply add by element
        ///
        /// For each element result[elem] = acc[elem] + left[elem] * right
        ///
        /// acc is the accumulator, left the vector multiplicand, and right the scalar
        /// multiplier applied to every element; its type matches the vector's element type.
        ///
        /// Corresponds to vector forms of ARM64 FMLA
        /// </summary>
        public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
        public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }
        public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, double  right) { throw null; }

        /// <summary>
        /// Vector fused multiply subtract
        ///
        /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
        ///
        /// acc is the accumulator; left and right are the two vector multiplicands.
        ///
        /// Corresponds to vector forms of ARM64 FMLS
        /// </summary>
        public static Vector128<double>  FusedMultiplySubtract(Vector128<double>  acc, Vector128<double>  left, Vector128<double>  right) { throw null; }

        /// <summary>
        /// Vector fused multiply subtract by element
        ///
        /// For each element result[elem] = acc[elem] - left[elem] * right
        ///
        /// acc is the accumulator, left the vector multiplicand, and right the scalar
        /// multiplier applied to every element; its type matches the vector's element type.
        ///
        /// Corresponds to vector forms of ARM64 FMLS
        /// </summary>
        public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
        public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }
        public static Vector128<double> FusedMultiplySubtract(Vector128<double> acc, Vector128<double> left, double  right) { throw null; }

        /// <summary>
        /// Vector multiply extend
        ///
        /// For each element result[elem] = left[elem] * right[elem]
        /// Handles the extended special cases of zero and infinity; see FMULX docs.
        ///
        /// Corresponds to vector forms of ARM64 FMULX
        /// </summary>
        public static Vector64<float>   MultiplyExtend(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  MultiplyExtend(Vector128<float>  left, Vector128<float>  right) { throw null; }
        public static Vector128<double> MultiplyExtend(Vector128<double> left, Vector128<double> right) { throw null; }

        /// <summary>
        /// Vector multiply extend by element
        ///
        /// For each element result[elem] = left[elem] * right
        /// Handles the extended special cases of zero and infinity; see FMULX docs.
        ///
        /// right is a scalar of the vector's element type, applied to every element.
        ///
        /// Corresponds to vector forms of ARM64 FMULX
        /// </summary>
        public static Vector64<float>   MultiplyExtend(Vector64<float>   left, float  right) { throw null; }
        public static Vector128<float>  MultiplyExtend(Vector128<float>  left, float  right) { throw null; }
        public static Vector128<double> MultiplyExtend(Vector128<double> left, double right) { throw null; }

        /// <summary>
        /// Vector reciprocal estimate
        ///
        /// See FRECPE docs
        ///
        /// Corresponds to vector forms of ARM64 FRECPE
        /// </summary>
        public static Vector128<double> ReciprocalEstimate(Vector128<double> value) { throw null; }

        /// <summary>
        /// Vector reciprocal step
        ///
        /// See FRECPS docs
        ///
        /// NOTE(review): nothing in this listing explains the index parameter; it looks
        /// copied from an immediate-taking intrinsic - confirm whether FRECPS needs it.
        ///
        /// Corresponds to vector forms of ARM64 FRECPS
        /// </summary>
        public static Vector128<double> ReciprocalStep(Vector128<double> left, Vector128<double> right, byte index) { throw null; }

        /// <summary>
        /// Vector reciprocal square root estimate
        ///
        /// Takes a single vector and returns a vector of the same type.
        ///
        /// See FRSQRTE docs
        ///
        /// Corresponds to vector forms of ARM64 FRSQRTE
        /// </summary>
        public static Vector128<double> ReciprocalSquareRootEstimate(Vector128<double> value) { throw null; }

        /// <summary>
        /// Vector reciprocal square root step
        ///
        /// See FRSQRTS docs
        ///
        /// NOTE(review): nothing in this listing explains the index parameter; it is kept
        /// unchanged here - confirm whether FRSQRTS needs it.
        ///
        /// Corresponds to vector forms of ARM64 FRSQRTS
        /// </summary>
        public static Vector128<double> ReciprocalSquareRootStep(Vector128<double> left, Vector128<double> right, byte index) { throw null; }

        /// <summary>
        /// Vector reverse byte bits
        ///
        /// Reverses the bit order within each byte element; only byte and sbyte
        /// vectors are covered.
        ///
        /// Corresponds to vector forms of ARM64 RBIT
        /// </summary>
        public static Vector64<byte>    ReverseElementBits(Vector64<byte>    value) { throw null; }
        public static Vector64<sbyte>   ReverseElementBits(Vector64<sbyte>   value) { throw null; }
        public static Vector128<byte>   ReverseElementBits(Vector128<byte>   value) { throw null; }
        public static Vector128<sbyte>  ReverseElementBits(Vector128<sbyte>  value) { throw null; }
    }

}
}
```

api-approved arch-arm64 area-System.Runtime.Intrinsics

Source

sdmaclea

Most helpful comment

If you comment on what needs changed, I am happy to update.

I can modify the original comment on any of these and you shouldn't need to worry about it 😄

I can also handle ensuring this gets a slot on the API review schedule, etc.

tannergooding on 7 Oct 2019

👍2 🚀1 🎉1

All 77 comments

This is the next wave of SIMD instructions which I plan to implement

@CarolEidt @RussKeldorph @eerhardt PTAL
@tannergooding @4creators @fiigii @dotnet/arm64-contrib @dotnet/jit-contrib

sdmaclea on 25 Jan 2018

This introduces more complicated intrinsic overloads. For instance

Multiply(Vector64<float>, Vector64<float>)
Multiply(Vector64<float>, float)

This introduces some implementation complexity. lookupHWIntrinsic must check Method arguments.

It may make the API less safe.

The second form represents multiply by vector element. It could be renamed to

MultiplyByElement(Vector64<float>, float)

There are other cases Extract, MultiplyAdd ...

Opinions?

sdmaclea on 25 Jan 2018

Looks like X86 intrinsics is using MultiplyScalar(Vector64<float>, float) for Arm64's
MultiplyByElement(Vector64<float>, float). I'm OK with renaming.

sdmaclea on 30 Jan 2018

Looks like X86 intrinsics is using MultiplyScalar(Vector64<float>, float) for Arm64's
MultiplyByElement(Vector64<float>, float). I'm OK with renaming.

X86 does not have MultiplyScalar(Vector64<float>, float). In Intel HW intrinsics, Scalar means operating over Vector128<T> but only computing the first element.

fiigii on 30 Jan 2018

👍2

Hi, I'm wondering what the status of this proposal is?

TamarChristinaArm on 7 Oct 2019

@TamarChristinaArm When I moved to Microsoft, I stopped championing this. In my opinion it was in good shape when I left it.

The proposal is probably a little out of date. When some of the other intrinsic API were approved, the namespace changed.

If someone was motivated to implement this and we had consensus, the next step would be to mark this as API ready for review and have an API design review.

sdmaclea on 7 Oct 2019

In my opinion it was in good shape when I left it.

@sdmaclea I agree. I have some of this implemented, as I was going off a different list, but it would be best just to get these approved.

The namespace change shouldn't affect this much so I think it's fine to review as is. The other approved APIs need slight changes due to the namespaces too but it's easier to have them all approved as is (as it's mostly about the intrinsics themselves no? the namespace they end up in is determined by the ISA).

If someone was motivated to implement this and we had consensus, the next step would be to mark this as API ready for review and have an API design review.

How should I go about this? Do I just add the label?

TamarChristinaArm on 7 Oct 2019

I doubt you can add the label. I'll add it. I just tried but GitHub seems to have issues at the moment.

sdmaclea on 7 Oct 2019

The namespace change shouldn't affect this much so I think it's fine to review as is.

Part of this is trivial and just involves updating the class name and namespace name.

The more difficult part comes from pulling out what is shared vs what is ARM64 specific; which needs to be done anyways.

I'm fine with marking this ready-for-review, but I'd like to see us get the proposal updated before it is reviewed, if possible. It tends to make the entire process easier and is ultimately part of implementing it anyways.

tannergooding on 7 Oct 2019

👍1

@terrajobst Can we schedule this for API review? Can we include Arm on the call?

sdmaclea on 7 Oct 2019

@tannergooding @TamarChristinaArm My head is not in this space at the moment. I am happy to play admin, but I can't drive this.

If you comment on what needs changed, I am happy to update.

Either way marking ready for review seems fine. It will take at least a week to schedule the review. We should clean up as quickly as practical.

sdmaclea on 7 Oct 2019

I'll post the changes required to adhere to dotnet/corefx#37199 today. I've already started on it.

TamarChristinaArm on 7 Oct 2019

If you comment on what needs changed, I am happy to update.

I can modify the original comment on any of these and you shouldn't need to worry about it 😄

I can also handle ensuring this gets a slot on the API review schedule, etc.

tannergooding on 7 Oct 2019

👍2 🚀1 🎉1

Extract in this proposal uses the same name for the intrinsics as in dotnet/runtime#24588. There's no clash because the overloads are different, but they are completely different intrinsics. Should this one instead be something like ExtractVector?

TamarChristinaArm on 7 Oct 2019

👍1

I think ExtractVector makes sense, given my understanding of the API.

tannergooding on 7 Oct 2019

👍1

new list below

changes:

drop Multiply (already implemented in dotnet/runtime#24588)
rename Extract to ExtractVector
expand AddPairwise away from generics
expand ExtractVector away from generics
drop unsupported overloads, e.g. from MaxNumericAcross.
updated some comments
remove FRECPX as that has no vector versions
Separate shared and A64 only intrinsics
Separate MLA/MLS from FMA/FMS

I believe this is the full rewritten list, I've asked a question internally about MLA and will update this tomorrow if needed when I get an answer.

namespace System.Runtime.Intrinsics.Arm
{
    public static class Simd
    {
        /// <summary>
        /// Stub getter for the API listing: the body only throws.
        /// NOTE(review): presumably reports whether these intrinsics are available on the current hardware - confirm.
        /// </summary>
        public static bool IsSupported { get { throw null; } }

        /// <summary>
        /// Vector absolute compare greater than or equal
        ///
        /// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
        ///
        /// The comparison is on magnitudes; a true lane is all ones (~0), a false lane is 0.
        ///
        /// Corresponds to vector forms of ARM64 FACGE
        /// </summary>
        public static Vector64<float>   AbsoluteCompareGreaterThanOrEqual(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  AbsoluteCompareGreaterThanOrEqual(Vector128<float>  left, Vector128<float>  right) { throw null; }

        /// <summary>
        /// Vector absolute compare greater than
        ///
        /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
        ///
        /// The comparison is on magnitudes; a true lane is all ones (~0), a false lane is 0.
        ///
        /// Corresponds to vector forms of ARM64 FACGT
        /// </summary>
        public static Vector64<float>   AbsoluteCompareGreaterThan(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  AbsoluteCompareGreaterThan(Vector128<float>  left, Vector128<float>  right) { throw null; }

        /// <summary>
        /// Vector absolute difference
        ///
        /// Signed integer inputs return the unsigned vector type of the same width;
        /// unsigned and float inputs return their own type.
        ///
        /// Corresponds to vector forms of ARM64 SABD, UABD & FABD
        /// </summary>
        public static Vector64<byte>    AbsoluteDifference(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<byte>    AbsoluteDifference(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  AbsoluteDifference(Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<ushort>  AbsoluteDifference(Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    AbsoluteDifference(Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<uint>    AbsoluteDifference(Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector64<float>   AbsoluteDifference(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<byte>   AbsoluteDifference(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<byte>   AbsoluteDifference(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
        public static Vector128<ushort> AbsoluteDifference(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
        public static Vector128<ushort> AbsoluteDifference(Vector128<short>  left, Vector128<short>  right) { throw null; }
        public static Vector128<uint>   AbsoluteDifference(Vector128<uint>   left, Vector128<uint>   right) { throw null; }
        public static Vector128<uint>   AbsoluteDifference(Vector128<int>    left, Vector128<int>    right) { throw null; }
        public static Vector128<float>  AbsoluteDifference(Vector128<float>  left, Vector128<float>  right) { throw null; }

        /// <summary>
        /// Vector absolute difference add
        ///
        /// For each element result[elem] = acc[elem] + | left[elem] - right[elem] |
        ///
        /// acc is the accumulator the absolute difference is added to.
        ///
        /// NOTE(review): the signed overloads take a signed acc but return the unsigned
        /// vector type - confirm this is the intended result type for SABA.
        ///
        /// Corresponds to vector forms of ARM64 SABA, UABA
        /// </summary>
        public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<byte>    acc, Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<sbyte>   acc, Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<ushort>  acc, Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<short>   acc, Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<uint>    acc, Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<int>     acc, Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<byte>   acc, Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<sbyte>  acc, Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
        public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
        public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<short>  acc, Vector128<short>  left, Vector128<short>  right) { throw null; }
        public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<uint>   acc, Vector128<uint>   left, Vector128<uint>   right) { throw null; }
        public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<int>    acc, Vector128<int>    left, Vector128<int>    right) { throw null; }

        /// <summary>
        /// Vector add pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
        ///
        /// That is, the low half of the result holds the sums of adjacent pairs of left,
        /// and the high half holds the sums of adjacent pairs of right.
        ///
        /// Corresponds to vector forms of ARM64 ADDP & FADDP
        /// </summary>
        public static Vector64<byte>    AddPairwise(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   AddPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  AddPairwise(Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<short>   AddPairwise(Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<int>     AddPairwise(Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector64<uint>    AddPairwise(Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<float>   AddPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }

        /// <summary>
        /// Vector extract from pair of vectors
        ///
        /// For each element result[elem] = elem + index < result.Length ? left[elem + index] : right[elem + index - result.Length]
        ///
        /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
        ///
        /// NOTE(review): the formula was originally written per byte; for wider element
        /// types confirm whether index counts elements or bytes.
        ///
        /// Corresponds to vector forms of ARM64 EXT
        /// </summary>
        public static Vector64<byte>    ExtractVector(Vector64<byte>    left, Vector64<byte>    right, byte index) { throw null; }
        public static Vector64<sbyte>   ExtractVector(Vector64<sbyte>   left, Vector64<sbyte>   right, byte index) { throw null; }
        public static Vector64<short>   ExtractVector(Vector64<short>   left, Vector64<short>   right, byte index) { throw null; }
        public static Vector64<ushort>  ExtractVector(Vector64<ushort>  left, Vector64<ushort>  right, byte index) { throw null; }
        public static Vector64<int>     ExtractVector(Vector64<int>     left, Vector64<int>     right, byte index) { throw null; }
        public static Vector64<uint>    ExtractVector(Vector64<uint>    left, Vector64<uint>    right, byte index) { throw null; }

        public static Vector128<byte>   ExtractVector(Vector128<byte>   left, Vector128<byte>   right, byte index) { throw null; }
        public static Vector128<sbyte>  ExtractVector(Vector128<sbyte>  left, Vector128<sbyte>  right, byte index) { throw null; }
        public static Vector128<short>  ExtractVector(Vector128<short>  left, Vector128<short>  right, byte index) { throw null; }
        public static Vector128<ushort> ExtractVector(Vector128<ushort> left, Vector128<ushort> right, byte index) { throw null; }
        public static Vector128<int>    ExtractVector(Vector128<int>    left, Vector128<int>    right, byte index) { throw null; }
        public static Vector128<uint>   ExtractVector(Vector128<uint>   left, Vector128<uint>   right, byte index) { throw null; }
        public static Vector128<long>   ExtractVector(Vector128<long>   left, Vector128<long>   right, byte index) { throw null; }
        public static Vector128<ulong>  ExtractVector(Vector128<ulong>  left, Vector128<ulong>  right, byte index) { throw null; }
        public static Vector128<float>  ExtractVector(Vector128<float>  left, Vector128<float>  right, byte index) { throw null; }

        /// <summary>
        /// Vector max numeric
        ///
        /// Takes two vectors and returns a vector of the same type; see FMAXNM docs
        /// for the exact per-element semantics.
        ///
        /// Corresponds to vector forms of ARM64 FMAXNM
        /// </summary>
        public static Vector64<float>   MaxNumeric(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  MaxNumeric(Vector128<float>  left, Vector128<float>  right) { throw null; }

        /// <summary>
        /// Vector max pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// That is, the low half of the result holds the maxima of adjacent pairs of left,
        /// and the high half holds the maxima of adjacent pairs of right.
        ///
        /// Corresponds to vector forms of ARM64 SMAXP, UMAXP & FMAXP
        /// </summary>
        public static Vector64<byte>    MaxPairwise(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   MaxPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  MaxPairwise(Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<short>   MaxPairwise(Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    MaxPairwise(Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<int>     MaxPairwise(Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector64<float>   MaxPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }

        /// <summary>
        /// Vector min numeric
        ///
        /// Takes two vectors and returns a vector of the same type; see FMINNM docs
        /// for the exact per-element semantics.
        ///
        /// Corresponds to vector forms of ARM64 FMINNM
        /// </summary>
        public static Vector64<float>   MinNumeric(Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  MinNumeric(Vector128<float>  left, Vector128<float>  right) { throw null; }

        /// <summary>
        /// Vector min pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// That is, the low half of the result holds the minima of adjacent pairs of left,
        /// and the high half holds the minima of adjacent pairs of right.
        ///
        /// Corresponds to vector forms of ARM64 SMINP, UMINP & FMINP
        /// </summary>
        public static Vector64<byte>    MinPairwise(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   MinPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  MinPairwise(Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<short>   MinPairwise(Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    MinPairwise(Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<int>     MinPairwise(Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector64<float>   MinPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }

        /// <summary>
        /// Vector multiply add
        ///
        /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
        ///
        /// acc is the accumulator; left and right are the two vector multiplicands.
        /// Only integer element types are listed in this group.
        ///
        /// Corresponds to vector forms of ARM64 MLA
        /// </summary>
        public static Vector64<byte>    MultiplyAdd(Vector64<byte>    acc, Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   MultiplyAdd(Vector64<sbyte>   acc, Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  MultiplyAdd(Vector64<ushort>  acc, Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<short>   MultiplyAdd(Vector64<short>   acc, Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    MultiplyAdd(Vector64<uint>    acc, Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<int>     MultiplyAdd(Vector64<int>     acc, Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector128<byte>   MultiplyAdd(Vector128<byte>   acc, Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<sbyte>  MultiplyAdd(Vector128<sbyte>  acc, Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
        public static Vector128<ushort> MultiplyAdd(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
        public static Vector128<short>  MultiplyAdd(Vector128<short>  acc, Vector128<short>  left, Vector128<short>  right) { throw null; }
        public static Vector128<uint>   MultiplyAdd(Vector128<uint>   acc, Vector128<uint>   left, Vector128<uint>   right) { throw null; }
        public static Vector128<int>    MultiplyAdd(Vector128<int>    acc, Vector128<int>    left, Vector128<int>    right) { throw null; }

        /// <summary>
        /// Vector multiply add by element
        ///
        /// For each element result[elem] = acc[elem] + left[elem] * right
        ///
        /// acc is the accumulator, left the vector multiplicand, and right a scalar of
        /// the vector's element type applied to every element.
        ///
        /// Corresponds to vector forms of ARM64 MLA
        /// </summary>
        public static Vector64<byte>    MultiplyAdd(Vector64<byte>    acc, Vector64<byte>    left, byte    right) { throw null; }
        public static Vector64<sbyte>   MultiplyAdd(Vector64<sbyte>   acc, Vector64<sbyte>   left, sbyte   right) { throw null; }
        public static Vector64<ushort>  MultiplyAdd(Vector64<ushort>  acc, Vector64<ushort>  left, ushort  right) { throw null; }
        public static Vector64<short>   MultiplyAdd(Vector64<short>   acc, Vector64<short>   left, short   right) { throw null; }
        public static Vector64<uint>    MultiplyAdd(Vector64<uint>    acc, Vector64<uint>    left, uint    right) { throw null; }
        public static Vector64<int>     MultiplyAdd(Vector64<int>     acc, Vector64<int>     left, int     right) { throw null; }
        public static Vector128<byte>   MultiplyAdd(Vector128<byte>   acc, Vector128<byte>   left, byte    right) { throw null; }
        public static Vector128<sbyte>  MultiplyAdd(Vector128<sbyte>  acc, Vector128<sbyte>  left, sbyte   right) { throw null; }
        public static Vector128<ushort> MultiplyAdd(Vector128<ushort> acc, Vector128<ushort> left, ushort  right) { throw null; }
        public static Vector128<short>  MultiplyAdd(Vector128<short>  acc, Vector128<short>  left, short   right) { throw null; }
        public static Vector128<uint>   MultiplyAdd(Vector128<uint>   acc, Vector128<uint>   left, uint    right) { throw null; }
        public static Vector128<int>    MultiplyAdd(Vector128<int>    acc, Vector128<int>    left, int     right) { throw null; }

        /// <summary>
        /// Vector multiply subtract
        ///
        /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
        ///
        /// acc is the accumulator; left and right are the two vector multiplicands.
        /// Only integer element types are listed in this group.
        ///
        /// Corresponds to vector forms of ARM64 MLS
        /// </summary>
        public static Vector64<byte>    MultiplySubtract(Vector64<byte>    acc, Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   MultiplySubtract(Vector64<sbyte>   acc, Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector64<ushort>  MultiplySubtract(Vector64<ushort>  acc, Vector64<ushort>  left, Vector64<ushort>  right) { throw null; }
        public static Vector64<short>   MultiplySubtract(Vector64<short>   acc, Vector64<short>   left, Vector64<short>   right) { throw null; }
        public static Vector64<uint>    MultiplySubtract(Vector64<uint>    acc, Vector64<uint>    left, Vector64<uint>    right) { throw null; }
        public static Vector64<int>     MultiplySubtract(Vector64<int>     acc, Vector64<int>     left, Vector64<int>     right) { throw null; }
        public static Vector128<byte>   MultiplySubtract(Vector128<byte>   acc, Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<sbyte>  MultiplySubtract(Vector128<sbyte>  acc, Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
        public static Vector128<ushort> MultiplySubtract(Vector128<ushort> acc, Vector128<ushort> left, Vector128<ushort> right) { throw null; }
        public static Vector128<short>  MultiplySubtract(Vector128<short>  acc, Vector128<short>  left, Vector128<short>  right) { throw null; }
        public static Vector128<uint>   MultiplySubtract(Vector128<uint>   acc, Vector128<uint>   left, Vector128<uint>   right) { throw null; }
        public static Vector128<int>    MultiplySubtract(Vector128<int>    acc, Vector128<int>    left, Vector128<int>    right) { throw null; }

        /// <summary>
        /// Vector multiply subtract by element
        ///
        /// For each element result[elem] = acc[elem] - left[elem] * right
        ///
        /// Corresponds to vector forms of ARM64 MLS
        /// </summary>
        /// <param name="acc">Vector the products are subtracted from.</param>
        /// <param name="left">Vector whose elements are each multiplied by <paramref name="right"/>.</param>
        /// <param name="right">Scalar factor applied to every element of <paramref name="left"/>.</param>
        /// <returns>Vector holding acc[elem] - left[elem] * right for each element.</returns>
        public static Vector64<byte>    MultiplySubtract(Vector64<byte>    acc, Vector64<byte>    left, byte    right) { throw null; }
        public static Vector64<sbyte>   MultiplySubtract(Vector64<sbyte>   acc, Vector64<sbyte>   left, sbyte   right) { throw null; }
        public static Vector64<ushort>  MultiplySubtract(Vector64<ushort>  acc, Vector64<ushort>  left, ushort  right) { throw null; }
        public static Vector64<short>   MultiplySubtract(Vector64<short>   acc, Vector64<short>   left, short   right) { throw null; }
        public static Vector64<uint>    MultiplySubtract(Vector64<uint>    acc, Vector64<uint>    left, uint    right) { throw null; }
        public static Vector64<int>     MultiplySubtract(Vector64<int>     acc, Vector64<int>     left, int     right) { throw null; }
        public static Vector128<byte>   MultiplySubtract(Vector128<byte>   acc, Vector128<byte>   left, byte    right) { throw null; }
        public static Vector128<sbyte>  MultiplySubtract(Vector128<sbyte>  acc, Vector128<sbyte>  left, sbyte   right) { throw null; }
        public static Vector128<ushort> MultiplySubtract(Vector128<ushort> acc, Vector128<ushort> left, ushort  right) { throw null; }
        public static Vector128<short>  MultiplySubtract(Vector128<short>  acc, Vector128<short>  left, short   right) { throw null; }
        public static Vector128<uint>   MultiplySubtract(Vector128<uint>   acc, Vector128<uint>   left, uint    right) { throw null; }
        public static Vector128<int>    MultiplySubtract(Vector128<int>    acc, Vector128<int>    left, int     right) { throw null; }

        /// <summary>
        /// Vector fused multiply add
        ///
        /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 FMLA
        /// </summary>
        /// <param name="acc">Vector the element-wise product is added to.</param>
        /// <param name="left">First vector of factors.</param>
        /// <param name="right">Second vector of factors.</param>
        /// <returns>Vector holding acc[elem] + left[elem] * right[elem] for each element.</returns>
        public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   acc, Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  acc, Vector128<float>  left, Vector128<float>  right) { throw null; }

        /// <summary>
        /// Vector fused multiply subtract
        ///
        /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
        ///
        /// Corresponds to vector forms of ARM64 FMLS
        /// </summary>
        /// <param name="acc">Vector the element-wise product is subtracted from.</param>
        /// <param name="left">First vector of factors.</param>
        /// <param name="right">Second vector of factors.</param>
        /// <returns>Vector holding acc[elem] - left[elem] * right[elem] for each element.</returns>
        public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   acc, Vector64<float>   left, Vector64<float>   right) { throw null; }
        public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  acc, Vector128<float>  left, Vector128<float>  right) { throw null; }


        /// <summary>
        /// Vector polynomial multiply
        ///
        /// Corresponds to vector forms of ARM64 PMUL
        /// </summary>
        /// <param name="left">First source vector.</param>
        /// <param name="right">Second source vector.</param>
        /// <returns>Vector of the same type as the operands holding the polynomial-multiply result.</returns>
        public static Vector64<byte>    PolynomialMultiply(Vector64<byte>    left, Vector64<byte>    right) { throw null; }
        public static Vector64<sbyte>   PolynomialMultiply(Vector64<sbyte>   left, Vector64<sbyte>   right) { throw null; }
        public static Vector128<byte>   PolynomialMultiply(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
        public static Vector128<sbyte>  PolynomialMultiply(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }

        /// <summary>
        /// Vector reciprocal estimate
        ///
        /// See FRECPE docs
        ///
        /// Corresponds to vector forms of ARM64 FRECPE
        /// </summary>
        /// <param name="value">Source vector.</param>
        public static Vector64<float>   ReciprocalEstimate(Vector64<float>   value) { throw null; }
        public static Vector128<float>  ReciprocalEstimate(Vector128<float>  value) { throw null; }

        /// <summary>
        /// Vector reciprocal step
        ///
        /// See FRECPS docs
        ///
        /// Corresponds to vector forms of ARM64 FRECPS
        /// </summary>
        /// <param name="left">First source vector.</param>
        /// <param name="right">Second source vector.</param>
        /// <param name="index">NOTE(review): the role of this parameter is not described in this proposal; confirm against the FRECPS documentation whether it is needed.</param>
        public static Vector64<float>   ReciprocalStep(Vector64<float>   left, Vector64<float>   right, byte index) { throw null; }
        public static Vector128<float>  ReciprocalStep(Vector128<float>  left, Vector128<float>  right, byte index) { throw null; }

        /// <summary>
        /// Vector reciprocal square root estimate
        ///
        /// See FRSQRTE docs
        ///
        /// Corresponds to vector forms of ARM64 FRSQRTE
        /// </summary>
        /// <param name="value">Source vector.</param>
        public static Vector64<float>   ReciprocalSquareRootEstimate(Vector64<float>   value) { throw null; }
        public static Vector128<float>  ReciprocalSquareRootEstimate(Vector128<float>  value) { throw null; }

        /// <summary>
        /// Vector reciprocal square root step
        ///
        /// See FRSQRTS docs
        ///
        /// Corresponds to vector forms of ARM64 FRSQRTS
        /// </summary>
        /// <param name="left">First source vector.</param>
        /// <param name="right">Second source vector.</param>
        /// <param name="index">NOTE(review): the role of this parameter is not described in this proposal; confirm against the FRSQRTS documentation whether it is needed.</param>
        // Named "Step" (not "Estimate") so the FRSQRTS mapping mirrors ReciprocalStep/FRECPS
        // and is not confused with the FRSQRTE estimate overloads.
        public static Vector64<float>   ReciprocalSquareRootStep(Vector64<float>   left, Vector64<float>   right, byte index) { throw null; }
        public static Vector128<float>  ReciprocalSquareRootStep(Vector128<float>  left, Vector128<float>  right, byte index) { throw null; }

        /// <summary>
        /// Vector reverse element bytes
        ///
        /// Corresponds to vector forms of ARM64 REV16, REV32, REV64
        /// </summary>
        /// <param name="value">Source vector.</param>
        /// <returns>Vector of the same type as <paramref name="value"/>.</returns>
        public static Vector64<ushort>  ReverseElementBytes(Vector64<ushort>  value) { throw null; }
        public static Vector64<short>   ReverseElementBytes(Vector64<short>   value) { throw null; }
        public static Vector64<uint>    ReverseElementBytes(Vector64<uint>    value) { throw null; }
        public static Vector64<int>     ReverseElementBytes(Vector64<int>     value) { throw null; }
        public static Vector64<float>   ReverseElementBytes(Vector64<float>   value) { throw null; }
        public static Vector128<ushort> ReverseElementBytes(Vector128<ushort> value) { throw null; }
        public static Vector128<short>  ReverseElementBytes(Vector128<short>  value) { throw null; }
        public static Vector128<uint>   ReverseElementBytes(Vector128<uint>   value) { throw null; }
        public static Vector128<int>    ReverseElementBytes(Vector128<int>    value) { throw null; }
        public static Vector128<ulong>  ReverseElementBytes(Vector128<ulong>  value) { throw null; }
        public static Vector128<long>   ReverseElementBytes(Vector128<long>   value) { throw null; }
        public static Vector128<float>  ReverseElementBytes(Vector128<float>  value) { throw null; }

        /// <summary>
        /// Single-precision multiply-accumulate members whose documentation maps them to the
        /// ARM32 VMLA and VMLS instructions.
        /// </summary>
        public static class Arm32
        {
            public static bool IsSupported { get { throw null; } }

            /// <summary>
            /// Vector multiply add
            ///
            /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
            ///
            /// Corresponds to vector forms of ARM32 VMLA
            /// </summary>
            /// <param name="acc">Vector the element-wise product is added to.</param>
            /// <param name="left">First vector of factors.</param>
            /// <param name="right">Second vector of factors.</param>
            public static Vector64<float>   MultiplyAdd(Vector64<float>   acc, Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MultiplyAdd(Vector128<float>  acc, Vector128<float>  left, Vector128<float>  right) { throw null; }

            /// <summary>
            /// Vector multiply add by element
            ///
            /// For each element result[elem] = acc[elem] + left[elem] * right
            ///
            /// Corresponds to vector forms of ARM32 VMLA
            /// </summary>
            /// <param name="acc">Vector the products are added to.</param>
            /// <param name="left">Vector whose elements are each multiplied by <paramref name="right"/>.</param>
            /// <param name="right">Scalar factor applied to every element of <paramref name="left"/>.</param>
            public static Vector64<float>   MultiplyAdd(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
            public static Vector128<float>  MultiplyAdd(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }

            /// <summary>
            /// Vector multiply subtract
            ///
            /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
            ///
            /// Corresponds to vector forms of ARM32 VMLS
            /// </summary>
            /// <param name="acc">Vector the element-wise product is subtracted from.</param>
            /// <param name="left">First vector of factors.</param>
            /// <param name="right">Second vector of factors.</param>
            public static Vector64<float>   MultiplySubtract(Vector64<float>   acc, Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MultiplySubtract(Vector128<float>  acc, Vector128<float>  left, Vector128<float>  right) { throw null; }

            /// <summary>
            /// Vector multiply subtract by element
            ///
            /// For each element result[elem] = acc[elem] - left[elem] * right
            ///
            /// Corresponds to vector forms of ARM32 VMLS
            /// </summary>
            /// <param name="acc">Vector the products are subtracted from.</param>
            /// <param name="left">Vector whose elements are each multiplied by <paramref name="right"/>.</param>
            /// <param name="right">Scalar factor applied to every element of <paramref name="left"/>.</param>
            public static Vector64<float>   MultiplySubtract(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
            public static Vector128<float>  MultiplySubtract(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }
        }

        /// <summary>
        /// Members proposed for the ARM64-specific surface. Each member's documentation names the
        /// ARM64 instruction(s) it corresponds to.
        /// </summary>
        public static class Arm64
        {
            public static bool IsSupported { get { throw null; } }

            /// <summary>
            /// Vector absolute compare greater than or equal
            ///
            /// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
            ///
            /// Corresponds to vector forms of ARM64 FACGE
            /// </summary>
            public static Vector128<double> AbsoluteCompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector absolute compare greater than
            ///
            /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
            ///
            /// Corresponds to vector forms of ARM64 FACGT
            /// </summary>
            public static Vector128<double> AbsoluteCompareGreaterThan(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector absolute difference
            ///
            /// Corresponds to vector forms of ARM64 SABD, UABD &amp; FABD
            /// </summary>
            public static Vector128<double> AbsoluteDifference(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector add pairwise
            ///
            /// For each element result[elem] = 2*elem &lt; result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 ADDP &amp; FADDP
            /// </summary>
            /// <param name="left">Vector whose adjacent pairs produce the lower half of the result.</param>
            /// <param name="right">Vector whose adjacent pairs produce the upper half of the result.</param>
            public static Vector128<byte>   AddPairwise(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<sbyte>  AddPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> AddPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<short>  AddPairwise(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<long>   AddPairwise(Vector128<long>   left, Vector128<long>   right) { throw null; }
            public static Vector128<ulong>  AddPairwise(Vector128<ulong>  left, Vector128<ulong>  right) { throw null; }
            public static Vector128<float>  AddPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> AddPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector extract from pair of vectors
            ///
            /// For each byte result[byte] = byte + index &lt; result.Length ? left[byte + index] : right[byte + index - result.Length]
            ///
            /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
            ///
            /// Corresponds to vector forms of ARM64 EXT
            /// </summary>
            /// <param name="left">Vector supplying the positions below result.Length - index.</param>
            /// <param name="right">Vector supplying the remaining positions.</param>
            /// <param name="index">Offset into the concatenated pair; must be a JIT-time constant.</param>
            public static Vector128<double> ExtractVector(Vector128<double> left, Vector128<double> right, byte index) { throw null; }

            /// <summary>
            /// Vector add across vector elements
            ///
            /// Corresponds to vector forms of ARM64 ADDV
            /// </summary>
            /// <param name="value">Vector whose elements are combined into a single scalar.</param>
            public static byte   AddAcross(Vector64<byte>    value) { throw null; }
            public static sbyte  AddAcross(Vector64<sbyte>   value) { throw null; }
            public static ushort AddAcross(Vector64<ushort>  value) { throw null; }
            public static short  AddAcross(Vector64<short>   value) { throw null; }
            public static uint   AddAcross(Vector64<uint>    value) { throw null; }
            public static int    AddAcross(Vector64<int>     value) { throw null; }
            public static byte   AddAcross(Vector128<byte>   value) { throw null; }
            public static sbyte  AddAcross(Vector128<sbyte>  value) { throw null; }
            public static ushort AddAcross(Vector128<ushort> value) { throw null; }
            public static short  AddAcross(Vector128<short>  value) { throw null; }
            public static uint   AddAcross(Vector128<uint>   value) { throw null; }
            public static int    AddAcross(Vector128<int>    value) { throw null; }

            /// <summary>
            /// Vector max numeric
            ///
            /// Corresponds to vector forms of ARM64 FMAXNM
            /// </summary>
            public static Vector128<double> MaxNumeric(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector max numeric pairwise
            ///
            /// For each element result[elem] = 2*elem &lt; result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 FMAXNMP
            /// </summary>
            public static Vector64<float>   MaxNumericPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MaxNumericPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MaxNumericPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector max numeric across
            ///
            /// result = max(value[0], ... , value[length - 1])
            ///
            /// Corresponds to vector forms of ARM64 FMAXNMV
            /// </summary>
            public static float  MaxNumericAcross(Vector128<float>  value) { throw null; }

            /// <summary>
            /// Vector max pairwise
            ///
            /// For each element result[elem] = 2*elem &lt; result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 SMAXP, UMAXP &amp; FMAXP
            /// </summary>
            public static Vector128<byte>   MaxPairwise(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<sbyte>  MaxPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> MaxPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<short>  MaxPairwise(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   MaxPairwise(Vector128<uint>   left, Vector128<uint>   right) { throw null; }
            public static Vector128<int>    MaxPairwise(Vector128<int>    left, Vector128<int>    right) { throw null; }
            public static Vector128<float>  MaxPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MaxPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector max across
            ///
            /// result = max(value[0], ... , value[length - 1])
            ///
            /// Corresponds to vector forms of ARM64 SMAXV, UMAXV &amp; FMAXV
            /// </summary>
            public static byte   MaxAcross(Vector64<byte>    value) { throw null; }
            public static sbyte  MaxAcross(Vector64<sbyte>   value) { throw null; }
            public static ushort MaxAcross(Vector64<ushort>  value) { throw null; }
            public static short  MaxAcross(Vector64<short>   value) { throw null; }
            public static uint   MaxAcross(Vector64<uint>    value) { throw null; }
            public static int    MaxAcross(Vector64<int>     value) { throw null; }
            public static float  MaxAcross(Vector64<float>   value) { throw null; }
            public static byte   MaxAcross(Vector128<byte>   value) { throw null; }
            public static sbyte  MaxAcross(Vector128<sbyte>  value) { throw null; }
            public static ushort MaxAcross(Vector128<ushort> value) { throw null; }
            public static short  MaxAcross(Vector128<short>  value) { throw null; }
            public static uint   MaxAcross(Vector128<uint>   value) { throw null; }
            public static int    MaxAcross(Vector128<int>    value) { throw null; }
            public static ulong  MaxAcross(Vector128<ulong>  value) { throw null; }
            public static long   MaxAcross(Vector128<long>   value) { throw null; }
            public static float  MaxAcross(Vector128<float>  value) { throw null; }
            public static double MaxAcross(Vector128<double> value) { throw null; }

            /// <summary>
            /// Vector min numeric
            ///
            /// Corresponds to vector forms of ARM64 FMINNM
            /// </summary>
            public static Vector128<double> MinNumeric(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector min numeric pairwise
            ///
            /// For each element result[elem] = 2*elem &lt; result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 FMINNMP
            /// </summary>
            public static Vector64<float>   MinNumericPairwise(Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MinNumericPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MinNumericPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector min numeric across
            ///
            /// result = min(value[0], ... , value[length - 1])
            ///
            /// Corresponds to vector forms of ARM64 FMINNMV
            /// </summary>
            public static float  MinNumericAcross(Vector128<float>  value) { throw null; }

            /// <summary>
            /// Vector min pairwise
            ///
            /// For each element result[elem] = 2*elem &lt; result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
            ///
            /// Corresponds to vector forms of ARM64 SMINP, UMINP &amp; FMINP
            /// </summary>
            public static Vector128<byte>   MinPairwise(Vector128<byte>   left, Vector128<byte>   right) { throw null; }
            public static Vector128<sbyte>  MinPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right) { throw null; }
            public static Vector128<ushort> MinPairwise(Vector128<ushort> left, Vector128<ushort> right) { throw null; }
            public static Vector128<short>  MinPairwise(Vector128<short>  left, Vector128<short>  right) { throw null; }
            public static Vector128<uint>   MinPairwise(Vector128<uint>   left, Vector128<uint>   right) { throw null; }
            public static Vector128<int>    MinPairwise(Vector128<int>    left, Vector128<int>    right) { throw null; }
            public static Vector128<float>  MinPairwise(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MinPairwise(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector min across
            ///
            /// result = min(value[0], ... , value[length - 1])
            ///
            /// Corresponds to vector forms of ARM64 SMINV, UMINV &amp; FMINV
            /// </summary>
            public static byte   MinAcross(Vector64<byte>    value) { throw null; }
            public static sbyte  MinAcross(Vector64<sbyte>   value) { throw null; }
            public static ushort MinAcross(Vector64<ushort>  value) { throw null; }
            public static short  MinAcross(Vector64<short>   value) { throw null; }
            public static uint   MinAcross(Vector64<uint>    value) { throw null; }
            public static int    MinAcross(Vector64<int>     value) { throw null; }
            public static float  MinAcross(Vector64<float>   value) { throw null; }
            public static byte   MinAcross(Vector128<byte>   value) { throw null; }
            public static sbyte  MinAcross(Vector128<sbyte>  value) { throw null; }
            public static ushort MinAcross(Vector128<ushort> value) { throw null; }
            public static short  MinAcross(Vector128<short>  value) { throw null; }
            public static uint   MinAcross(Vector128<uint>   value) { throw null; }
            public static int    MinAcross(Vector128<int>    value) { throw null; }
            public static float  MinAcross(Vector128<float>  value) { throw null; }
            public static double MinAcross(Vector128<double> value) { throw null; }

            /// <summary>
            /// Vector fused multiply add
            ///
            /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
            ///
            /// Corresponds to vector forms of ARM64 FMLA
            /// </summary>
            /// <param name="acc">Vector the element-wise product is added to.</param>
            public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector fused multiply add by element
            ///
            /// For each element result[elem] = acc[elem] + left[elem] * right
            ///
            /// Corresponds to vector forms of ARM64 FMLA
            /// </summary>
            /// <param name="acc">Vector the products are added to.</param>
            /// <param name="left">Vector whose elements are each multiplied by <paramref name="right"/>.</param>
            /// <param name="right">Scalar factor; its type matches the vector element type.</param>
            public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
            public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }
            public static Vector128<double> FusedMultiplyAdd(Vector128<double> acc, Vector128<double> left, double  right) { throw null; }

            /// <summary>
            /// Vector fused multiply subtract
            ///
            /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
            ///
            /// Corresponds to vector forms of ARM64 FMLS
            /// </summary>
            /// <param name="acc">Vector the element-wise product is subtracted from.</param>
            public static Vector128<double> FusedMultiplySubtract(Vector128<double> acc, Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector fused multiply subtract by element
            ///
            /// For each element result[elem] = acc[elem] - left[elem] * right
            ///
            /// Corresponds to vector forms of ARM64 FMLS
            /// </summary>
            /// <param name="acc">Vector the products are subtracted from.</param>
            /// <param name="left">Vector whose elements are each multiplied by <paramref name="right"/>.</param>
            /// <param name="right">Scalar factor; its type matches the vector element type.</param>
            public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   acc, Vector64<float>   left, float   right) { throw null; }
            public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  acc, Vector128<float>  left, float   right) { throw null; }
            public static Vector128<double> FusedMultiplySubtract(Vector128<double> acc, Vector128<double> left, double  right) { throw null; }

            /// <summary>
            /// Vector multiply extend
            ///
            /// For each element result[elem] = left[elem] * right[elem]
            /// Handle extend special cases zero and infinite.  FMULX
            ///
            /// Corresponds to vector forms of ARM64 FMULX
            /// </summary>
            public static Vector64<float>   MultiplyExtend(Vector64<float>   left, Vector64<float>   right) { throw null; }
            public static Vector128<float>  MultiplyExtend(Vector128<float>  left, Vector128<float>  right) { throw null; }
            public static Vector128<double> MultiplyExtend(Vector128<double> left, Vector128<double> right) { throw null; }

            /// <summary>
            /// Vector multiply extend by element
            ///
            /// For each element result[elem] = left[elem] * right
            /// Handle extend special cases zero and infinite.  FMULX
            ///
            /// Corresponds to vector forms of ARM64 FMULX
            /// </summary>
            public static Vector64<float>   MultiplyExtend(Vector64<float>   left, float  right) { throw null; }
            public static Vector128<float>  MultiplyExtend(Vector128<float>  left, float  right) { throw null; }
            public static Vector128<double> MultiplyExtend(Vector128<double> left, double right) { throw null; }

            /// <summary>
            /// Vector reciprocal estimate
            ///
            /// See FRECPE docs
            ///
            /// Corresponds to vector forms of ARM64 FRECPE
            /// </summary>
            public static Vector128<double> ReciprocalEstimate(Vector128<double> value) { throw null; }

            /// <summary>
            /// Vector reciprocal step
            ///
            /// See FRECPS docs
            ///
            /// Corresponds to vector forms of ARM64 FRECPS
            /// </summary>
            /// <param name="index">NOTE(review): the role of this parameter is not described in this proposal; confirm against the FRECPS documentation whether it is needed.</param>
            public static Vector128<double> ReciprocalStep(Vector128<double> left, Vector128<double> right, byte index) { throw null; }

            /// <summary>
            /// Vector reciprocal square root estimate
            ///
            /// See FRSQRTE docs
            ///
            /// Corresponds to vector forms of ARM64 FRSQRTE
            /// </summary>
            public static Vector128<double> ReciprocalSquareRootEstimate(Vector128<double> value) { throw null; }

            /// <summary>
            /// Vector reciprocal square root step
            ///
            /// See FRSQRTS docs
            ///
            /// Corresponds to vector forms of ARM64 FRSQRTS
            /// </summary>
            /// <param name="index">NOTE(review): the role of this parameter is not described in this proposal; confirm against the FRSQRTS documentation whether it is needed.</param>
            // Named "Step" (not "Estimate") so the FRSQRTS mapping mirrors ReciprocalStep/FRECPS.
            public static Vector128<double> ReciprocalSquareRootStep(Vector128<double> left, Vector128<double> right, byte index) { throw null; }

            /// <summary>
            /// Vector reverse byte bits
            ///
            /// Corresponds to vector forms of ARM64 RBIT
            /// </summary>
            public static Vector64<byte>    ReverseElementBits(Vector64<byte>    value) { throw null; }
            public static Vector64<sbyte>   ReverseElementBits(Vector64<sbyte>   value) { throw null; }
            public static Vector128<byte>   ReverseElementBits(Vector128<byte>   value) { throw null; }
            public static Vector128<sbyte>  ReverseElementBits(Vector128<sbyte>  value) { throw null; }
        }
  }
}

TamarChristinaArm on 7 Oct 2019

alright, that should be the final list.

TamarChristinaArm on 8 Oct 2019

Thanks @TamarChristinaArm.

I'll give this a look over either tonight or tomorrow and get the top post updated 😄

tannergooding on 8 Oct 2019

@tannergooding What is the next step here for getting this reviewed? Is it ready to go?
cc @TamarChristinaArm @echesakovMSFT @CarolEidt @sdmaclea

BruceForstall on 16 Oct 2019

What is the next step here for getting this reviewed? Is it ready to go?

I've updated the original post with @TamarChristinaArm's updated surface.

The next step is just ensuring we get a dedicated review session with @terrajobst. I'll bring it up again today and see if we can drive down a date.

tannergooding on 16 Oct 2019

I've asked a question internally about MLA and will update this tomorrow if needed when I get an answer.

@TamarChristinaArm, was this determining if VMLA (floating-point) for arm32 is a fused operation? I believe that ended up being my only question and if that dictated them needing to be separate.

If so, it doesn't look like A32 has its own "fused" operation and we should remove those APIs from the general list (in the proposed surface, they look duplicated in both AdvSimd and AdvSimd.Arm64).

tannergooding on 16 Oct 2019

I've asked a question internally about MLA and will update this tomorrow if needed when I get an answer.

@TamarChristinaArm, was this determining if VMLA (floating-point) for arm32 is a fused operation? I believe that ended up being my only question and if that dictated them needing to be separate.

If so, it doesn't look like A32 has its own "fused" operation and we should remove those APIs from the general list (in the proposed surface, they look duplicated in both AdvSimd and AdvSimd.Arm64).

That's indeed where things got a bit confusing: on A32 the fused version of the instruction is called VFMA, but it doesn't have a fused by-element version.

A64 however only has fused MLA, and so doesn't have the non-fused variant, but does have a by element version of the fused variant.

This is why the split in definition above. The MLA is still useful on A32 if you don't care about the rounding because it does have a by element version then.

TamarChristinaArm on 16 Oct 2019

👍1

I will take this issue if no one is working on it.

echesakovMSFT on 30 Jan 2020

I'm not working on it. I already did AddAcross and ReverseElementBits, since those intersected with my list, but I didn't work on the rest.

TamarChristinaArm on 30 Jan 2020

👍1

@TamarChristinaArm @tannergooding @CarolEidt

I have a question - for MaxNumericPairwise and MinNumericPairwise there are no overloads that operate on one vector (i.e. Vector64<float> MaxNumericPairwise(Vector64<float> value)) even though there is a C++ intrinsic float32_t vpmaxnms_f32 (float32x2_t a) that maps to the FMAXNMP Sd,Vn.2S instruction. The same applies to Vector128<double> MaxNumericPairwise(Vector128<double> value).

Is it intentional?

It might look odd if we had MaxNumericPairwise that has overloads with 1 and 2 operands. Should we instead add MaxNumericAcross(Vector64<float> value) that maps to FMAXNMP Sd,Vn.2S?

I think the same could be done for MaxAcross(Vector64<float> value) and FMAXP Sd,Vn.2S

echesakovMSFT on 30 Jan 2020

To me it seems like these should have the same name, and have overloads with one or two operands, since the fundamental operation is the same. It's a bit weird because the operation is always pairwise, but the number of operations & results isn't always consistent for the one-operand and two-operand case, if I read it correctly (i.e. the one-operand form always produces a single result, while the two-operand case always operates on each pair in the concatenated vector), but that's simply a characteristic of the architecture that we're exposing.

CarolEidt on 31 Jan 2020

@echesakovMSFT Thanks for reminding me, I was waiting for the API review to ask, but yes, I personally think the single-register pair versions should instead be under the reduction intrinsics.

So instead of having a single register MaxNumericPairwise(Vector64<float> value) it should be under MaxAcross.

In C we defined them under vmaxv (Which I didn't here waiting to solicit feedback) and we put them under a new made up intrinsics name since we couldn't overload it. To me it seems more natural to add these single register pairwise operations as reductions.

Like @CarolEidt mentioned the operations aren't exactly the same if we overload MaxNumericPairwise, and also I think we'd be breaking the convention we've used until now for the operations working on the scalar part of the SIMD file. So shouldn't the single register version be MaxNumericPairwiseScalar then?

Or do we want both like in C? overload the reductions and the pair instructions?

TamarChristinaArm on 31 Jan 2020

So shouldn't the single register version be MaxNumericPairwiseScalar then?

This is my understanding of the conventions we have followed thus far. The instruction is FMAXNMP (scalar) and functionally it is a scalar (for which we have always used the Scalar postfix in the name).

The confusion likely comes because, so far, scalar just means "lowest element" and so for something like Vector64<float> AddScalar(Vector64<float> lhs, Vector64<float> rhs), it adds Element 0 from lhs with Element 0 from rhs and returns it in Element 0 of the result.

In this case, the signature would be Vector64<float> MaxNumericPairwiseScalar(Vector64<float> value) which would operate on the scalar pair (Element 0 and Element 1) in value and return it in Element 0 of the result.
So, it is still scalar, it is just that we are "pair-wise" rather than "element-wise"

tannergooding on 31 Jan 2020

Also, to this note:

Vector64<float> AddScalar(Vector64<float> lhs, Vector64<float> rhs)

I believe the proposed signatures for things like the following are incorrect:

public static byte   MinAcross(Vector64<byte>    value) { throw null; }
public static sbyte  MinAcross(Vector64<sbyte>   value) { throw null; }

The instructions return the result in a SIMD&FP register, not in a general purpose register. So the result should remain Vector64<T>. The purpose of the scalar variants is to remove the need to continuously transition between "scalar" code and "vector" code. If it is an intrinsic that operates on or returns a SIMD register (even if it only treats that value as a scalar), it takes and returns a Vector*<T>.

On x86, this principle avoided the chance of the upper bits being trashed or lost
On ARM (since upper bits are zeroed), it just simplifies the overall logic and avoids cases like "you must go back to float for simple ops (like addition) but must use HWIntrinsics for other things (like reciprocal)" (and it maintains consistency with the x86 intrinsics from an API perspective)

tannergooding on 31 Jan 2020

I believe the proposed signatures for things like the following are incorrect:

Yes, the reductions need to be updated. They were based on my understanding at the time, when I thought you could assign types to multiple register classes.

That said I still think the scalar pairwise operations should be reductions. I think the fact that we exposed them as extra intrinsics as well in C was a mistake. But If we expose them as Scalar operations here I think they should also be an overload for the reductions.

Mistuke on 31 Jan 2020

I agree that functionally speaking it could be exposed as either Vector64<float> MaxNumericAcross(Vector64<float> value) or as Vector64<float> MaxNumericPairwiseScalar(Vector64<float> value).

Which is ultimately chosen will probably come down to API review. I have a slight preference for the latter since the actual instruction is FMAXNMP (scalar), but the former may be easier for some to understand.
I imagine with the former, a misunderstanding might be whether the result is only in element 0 or if it is duplicated throughout (but the same applies for the other "across" methods as well).

tannergooding on 31 Jan 2020

In my opinion, something that operates on multiple elements within a vector should not have the Scalar suffix. If that's the naming chosen, then I would reiterate my request for a clear explanation of what we mean by that suffix.

CarolEidt on 31 Jan 2020

👍2

I agree with Carol - in my opinion, exposing FMAXNMP Sd, Vn.2S as MaxNumericAcross(Vector64<float> value) would introduce less confusion than going with MaxNumericPairwiseScalar.

And as it was mentioned above we would need to rename all AddAcross and other reduction functions which wouldn't add a value to understanding what these functions are doing - it's already clear from the name.

I think we should use Scalar suffix only to distinguish overloads of an intrinsic that have same set of argument in its "Vector" version.

echesakovMSFT on 31 Jan 2020

If that's the naming chosen, then I would reiterate my request for a clear explanation of what we mean by that suffix.

Scalar means it operates on a single "element"; it's what an "element" is that can vary.

In the majority case an "element" is just float or int or double For example: Vector128<float> is a vector of float scalars.
In the pairwise case, the "element" is pairs of values (e.g. float, float). For example: Vector128<float> when talking about Pairwise, is a vector of (float, float) scalars.

It is still a scalar and the definition hasn't changed. It still only operates on one element (one pair).
It is how you have to infer this when referring to the actual ARM instructions (such as FMINP (scalar) which is Floating-point Minimum of Pair of elements (scalar)).

This never came up for x86 because they don't have horizontal (pairwise) instructions that only take one input.

tannergooding on 31 Jan 2020

If we had the proper support for it, the technically correct signature would be: Vector64<float> MaxNumericPairwiseScalar(Vector64<(float, float)> value); but that is a lot of complexity that is likely not worthwhile.

It's also worth noting users familiar with ARM assembly would likely not look for it under Across since there is no across instruction for it, even if they are functionally equivalent.

tannergooding on 31 Jan 2020

@tannergooding - I disagree. A pair is a vector not a scalar, albeit a smaller vector. I think the Scalar suffix is confusing enough without having it refer to pairs as well. I really think that we need some API design review of the naming conventions for this.

CarolEidt on 31 Jan 2020

👍2

Scalar suffix could be reserved for cases which consume a vector, but produce a scalar return value. As opposed to a vector<scalar> return value.

sdmaclea on 31 Jan 2020

If we had the proper support for it, the technically correct signature would be: Vector64<float> MaxNumericPairwiseScalar(Vector64<(float, float)> value); but that is a lot of complexity that is likely not worthwhile.

I disagree, if we had proper support I would have expected float MaxNumericPairwise(Vector64<float>) as its intention is to just add two "pairs" inside a single vector, or rather two adjacent entries. https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics?search=vpadds_f32

Broadly speaking the ISA splits instructions into three classes: Vector, Scalar and By Element.
So far we've adhered to adding the Scalar suffix to instructions that the ISA quite literally placed in the Scalar category, i.e. the ones where in the ArmARM it has (scalar) after the instruction.

I think that's the clearest distinction. I was mistaken before when I suggested renaming the reductions (deleted the comment since I accidentally made it from the wrong account).

It's also worth noting users familiar with ARM assembly would likely not look for it under Across since there is no across instruction for it, even if they are functionally equivalent.

All of these names already deviate somewhat from the mnemonic though. I thought the explicit intention was to make more descriptive names for them. I would expect a user to search the docs for the mnemonic they want to find what to use; isn't that partially why we're adding them to the docs?

TamarChristinaArm on 31 Jan 2020

Scalar suffix could be reserved for cases which consume a vector, but produce a scalar return value. As opposed to a vector<scalar> return value.

It is too late for that, x86 already has shipped and has things like Vector128<float> Sse.AddScalar(Vector128<float> left, Vector128<float> right)
On x86, this is important because x86 preserves the upper bits and returning or taking float would mean the entire value isn't preserved.

On ARM, the upper bits are zeroed, but the API design decisions would then be inconsistent.

I really think that we need some API design review of the naming conventions for this.

I agree.

A pair is a vector not a scalar, albeit a smaller vector. I think the Scalar suffix is confusing enough without having it refer to pairs as well.

I think that depends on how you look at it. We may be able to treat it as "across" in this particular case (Vector64<float>) but we wouldn't be able to treat it as across for Vector64<half> (the entire vector isn't consumed, just the lowest scalar pair)

tannergooding on 31 Jan 2020

I though the explicit intention was to make more descriptive names for them. I would expect a user to search the docs for the mnemonic they want to find what to use, isn't that partially why we're adding them to the docs

Yes, but we have also not had to deviate from the descriptions/names so far and have fallen back to them in past API reviews when naming concerns came up.

I believe the case of the half-precision variant for FMINNMP will likely be a big deciding factor in the name. It can't be named Across (or AcrossScalar) and has the same issue (it operates on a scalar pair and returns a scalar value).

tannergooding on 31 Jan 2020

I think that depends on how you look at it. We may be able to treat it as "across" in this particular case (Vector64<float>) but we wouldn't be able to treat it as across for Vector64<half> (the entire vector isn't consumed, just the lowest scalar pair)

true, FP16 will add a lot of confusion to this story. My preference is to have both. Is there any downside to that? Operationally and logically it's sound for float and double.

TamarChristinaArm on 31 Jan 2020

@TamarChristinaArm Just to confirm you mean both intrinsics
```c#
Vector64<float> MaxNumericAcross(Vector64<float> value)
Vector64<float> MaxNumericPairwiseScalar(Vector64<float> value)
```
mapping to FMAXNMP?

echesakovMSFT on 31 Jan 2020

@echesakovMSFT Yeah exactly. Though the first one should be MaxAcross to match the rest. (actually wonder if we need the Numeric in the pair operations but that's a different topic)

TamarChristinaArm on 31 Jan 2020

It can't be MaxAcross, the floating-point versions have both Max* and MaxNumeric* and they are different.
The former propagates NaN and the latter does not (each is compliant with a different IEEE 754 operation).

tannergooding on 31 Jan 2020

@TamarChristinaArm I thought MaxNumericAcross maps to FMAXNMV for Vector128<float> value? So it would be logical to name FMAXNMP the same way?

echesakovMSFT on 31 Jan 2020

@tannergooding

I believe the case of the half-precision variant for FMINNMP will likely be a big deciding factor in the name. It can't be named Across (or AcrossScalar) and has the same issue (it operates on a scalar pair and returns a scalar value).

I am not sure if this is true. I don't see fmaxnmp that operates on float16x2_t (is there even a type like this?) meaning it operates on full 64-bit vector register and returns scalar float16.

If I were to define one it would be
```c#
Vector64<float16> MaxNumericAcross(Vector64<float16> value)
```

echesakovMSFT on 31 Jan 2020

@echesakovMSFT @tannergooding You're right, I forgot that MaxNumericAcross maps to FMAXNMV.

So you were correct with

Vector64<float> MaxNumericAcross(Vector64<float> value)
Vector64<float> MaxNumericPairwiseScalar(Vector64<float> value)

MaxAcross should be FMAXV.

TamarChristinaArm on 31 Jan 2020

I am not sure if this is true. I don't see fmaxnmp that operates on float16x2_t (is there even a type like this?) meaning it operates on full 64-bit vector register and returns scalar float16.

No we don't have a float16x2_t but the size modifier on the instruction is .2H so it only consumes the lower two elements.

TamarChristinaArm on 31 Jan 2020

@echesakovMSFT, I don't believe it is on the ARM Neon Intrinsics page (possibly because it is an ARMv8.2 instruction).

It is, however, detailed in the architecture manual (including confirming the operation and instruction) and is different from the non-scalar version which operates on the full vector.

tannergooding on 31 Jan 2020

@TamarChristinaArm @tannergooding I see it now in the ISA manual, thanks

echesakovMSFT on 31 Jan 2020

@echesakovMSFT, I don't believe it is on the ARM Neon Intrinsics page (possibly because it is an ARMv8.2 instruction).

The neon pages go all the way to Armv8.6, I think it's just we never defined it for ACLE. not sure why not.

TamarChristinaArm on 31 Jan 2020

👍1

Another question. For multiply-add and fused multiply-add there are by element and vector forms. For example,
```c#
static Vector64<float> FusedMultiplyAdd(Vector64<float> acc, Vector64<float> left, Vector64<float> right);
static Vector64<float> FusedMultiplyAdd(Vector64<float> acc, Vector64<float> left, float right);
```
The last one is implemented with FMLA <Vd>.<T>, <Vn>.<T>, <Vm>.<Ts>[<index>] where index is 0, right?

echesakovMSFT on 31 Jan 2020

Would someone ever need the following one?
```c#
static Vector64<float> FusedMultiplyAddByElement(Vector64<float> acc, Vector64<float> left, Vector64<float> right, byte index);
```

echesakovMSFT on 1 Feb 2020

@echesakovMSFT, looks like it functionally does: op3[..] + op1[..] * op2[index] (where .. is from 0 to Count for the vector version and just for 0 for the scalar version).

This would be acc + left * right -- the encoded operands and the order listed in the above function differ

I would say it is a reasonable thing to need and avoids the need to permute the value throughout a register.
It would be particularly useful when you have several constants you are using and you want to reduce the total number of registers being consumed.

tannergooding on 1 Feb 2020

The last one is implemented with FMLA <Vd>.<T>, <Vn>.<T>, <Vm>.<Ts>[<index>] where index is 0, right?

Yup that's correct @echesakovMSFT

TamarChristinaArm on 3 Feb 2020

The last one is implemented with FMLA <Vd>.<T>, <Vn>.<T>, <Vm>.<Ts>[<index>] where index is 0, right?

Why are we exposing one that hardcodes the index as zero, rather than exposing the full functionality of the instruction (I don't see an instruction that does so, just the one that takes an index)?

tannergooding on 3 Feb 2020

The last one is implemented with FMLA <Vd>.<T>, <Vn>.<T>, <Vm>.<Ts>[<index>] where index is 0, right?

Why are we exposing one that hardcodes the index as zero, rather than exposing the full functionality of the instruction (I don't see an instruction that does so, just the one that takes an index)?

It's a convenience function for an often used function. It doesn't replace the index one it's just in addition to. At the time I wrote these I noticed we didn't expose any of the index variants so I didn't add them as I only updated the given list to fit the new scheme.

As for why it's useful? because otherwise you'd have to create the vector type first, insert the element and rely on the optimizers to optimize the vector creation away.

In C this would be the difference between

float32x4_t f(float32x4_t a, float32x4_t b, float32_t c)
{
  return vfmaq_n_f32 (a, b, c);
}

float32x4_t g(float32x4_t a, float32x4_t b, float32_t c)
{
  float32x4_t tmp;
  tmp = vsetq_lane_f32 (c, tmp, 0);
  return vfmaq_laneq_f32 (a, b, tmp, 0);
}

TamarChristinaArm on 3 Feb 2020

For the latter, we have Vector128.CreateScalarUnsafe which is basically treated as a "nop" for types (like float/double) which are already in the correct register type.

tannergooding on 3 Feb 2020

Right, but that's somewhat harder to find isn't it? Ultimately I also think intrinsics should make things easy and exposing that overload or any of the _n_ ones in ACLE make them easier to use.

TamarChristinaArm on 3 Feb 2020

Right, but that's somewhat harder to find isn't it

I think its just a question of getting familiar with the APIs. It's the only way to do such operations with the x86 APIs and is likely the pattern that will get the most scrutiny and optimizations (since it is more generally applicable and not specific to a particular intrinsic, instruction, or API).

I also think intrinsics should make things easy and exposing that overload or any of the _n_ ones in ACLE make them easier to use.

I don't disagree. There are likely several scenarios where helper methods for common patterns may be beneficial. However, we also have only exposed a very limited number of helper methods so far and it would need to be something we take through API review and consider separately from the mainline API (most of these helper methods are trivial for users to write in terms of the "core" API).

tannergooding on 3 Feb 2020

Fair enough.

I think its just a question of getting familiar with the APIs. It's the only way to do such operations with the x86 APIs and is likely the pattern that will get the most scrutiny and optimizations (since it is more generally applicable and not specific to a particular intrinsic, instruction, or API).

That does somewhat concern me. They are platform intrinsics after all, so whether or not something is available on x86 shouldn't factor in.

TamarChristinaArm on 3 Feb 2020

so whether or not something is available on x86 shouldn't factor in

x86 likewise has "scalar" instructions and could have exposed APIs that directly took a float; but it was determined to be better to have a pattern based approach around Vector128.CreateScalarUnsafe and Vector128.ToScalar which allows you to efficiently do this in a platform agnostic way. It significantly cut down on the number of overloads we needed to expose for these scalar APIs and still ensures efficient codegen.

There are a few platform agnostic helper methods located in the Vector64/Vector128/Vector256 classes (they exist separately from Vector64<T>/Vector128<T>/Vector256<T>). They expose a number of APIs which allow you to interact with the types even if hardware acceleration isn't available (which can be useful for a software fallback and debugging purposes). The APIs exposed are:

As and As* (e.g. AsByte, AsDouble, etc). These allow a reinterpret cast from a type T to a type U and are functionally a nop
Create. These allow constructing a vector of type T. There are both "broadcast" and per element initializers
CreateScalar. Allows constructing a vector of type T where the lowest element is set and the upper elements are 0
CreateScalarUnsafe. Allows constructing a vector of type T where the lowest element is set and the upper elements are non-deterministic (this allows conversion from float to Vector*<float> at zero cost since they both live in the same register kind; for example)
GetElement and WithElement. Allows getting/setting the given element of the vector (valid indices are from 0 to Count)
GetLower/GetUpper and WithLower/WithUpper. Allows getting/setting the upper/lower Vector (Vector128<T> is "comprised" of two Vector64<T>, for example)
ToScalar. Allows conversion from Vector128<T> to T. For types like float, this can be a nop.
ToVector128/ToVector256. Allows upcasting from Vector64<T>/Vector128<T> to Vector128<T>/Vector256<T> (respectively) with explicit zeroing of the upper bits
ToVector128Unsafe/ToVector256Unsafe. Allows upcasting from Vector64<T>/Vector128<T> to Vector128<T>/Vector256<T> (respectively) leaving the upper bits "non-deterministic" (allows a nop on some platforms)

These functions are common and necessary even for the software fallback case to be able to correctly interact with the types. They can also have varying implementations based on what ISAs are available (e.g. on x86, you may want to use broadcast, permute, or shuffle to create a vector with all elements set to a given value, depending on what hardware functionality is available).

tannergooding on 3 Feb 2020

As per https://github.com/dotnet/runtime/pull/31899#discussion_r376563233, we should discuss the ordering of parameters for FMA when those are reviewed.

tannergooding on 10 Feb 2020

The Min/MaxNumeric functions should probably be Min/MaxNumber to match the IEEE and instruction names (and the proposed names for the equivalent Math/MathF functions).

I think a couple of the *Estimate functions are meant to be *Step based on the underlying instruction name?

tannergooding on 13 Feb 2020

Check whether MaxPairwise/MinPairwise should also include forms for Vector128
Should the arguments for ReciprocalStep and ReciprocalSquareRootStep be something more specific than left and right?
Looks like ReverseElementBytes needs more work, as well as clarity on support
We skipped Arm32 because it's not going to be implemented for .NET 5
ExtractVector shouldn't have overloads for float and double, it could end up silently modifying/normalizing/corrupting the floating point types
MaxNumericPairwiseScalar should be MaxNumberPairwiseScalar. Some folks raised concerns around PairwiseScalar being confusing, but it matches the ISA name and we can't think of a better name
We didn't review all the APIs (I commented the ones below). Tanner will see whether they are just applying a pattern or are net-new APIs, in which case we'll take another look.

```C#
namespace System.Runtime.Intrinsics.Arm
{
public partial class AdvSimd
{
/// <summary>
/// Vector absolute compare greater than or equal
/// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
/// Corresponds to vector forms of ARM64 FACGE
/// </summary>
/// <param name="left">Vector whose element magnitudes form the left-hand side of the comparison.</param>
/// <param name="right">Vector whose element magnitudes form the right-hand side of the comparison.</param>
/// <returns>Per-element mask: ~0 where |left[elem]| >= |right[elem]|, otherwise 0.</returns>
public static Vector64<float>   AbsoluteCompareGreaterThanOrEqual(Vector64<float>   left, Vector64<float>   right);
public static Vector128<float>  AbsoluteCompareGreaterThanOrEqual(Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector absolute compare greater than
    ///
    /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
    /// (the magnitudes of the operands are compared; a matching element is set to all ones)
    ///
    /// Corresponds to vector forms of ARM64 FACGT
    /// </summary>
    /// <param name="left">Vector whose element magnitudes form the left-hand side of the comparison.</param>
    /// <param name="right">Vector whose element magnitudes form the right-hand side of the comparison.</param>
    /// <returns>Per-element mask: ~0 where |left[elem]| > |right[elem]|, otherwise 0.</returns>
    public static Vector64<float>   AbsoluteCompareGreaterThan(Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  AbsoluteCompareGreaterThan(Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector absolute difference
    ///
    /// Overloads are declared for 8-, 16- and 32-bit integer elements and for float.
    /// The overloads taking signed integer elements (sbyte, short, int) are declared to
    /// return the unsigned element type of the same width (byte, ushort, uint).
    ///
    /// Corresponds to vector forms of ARM64 SABD, UABD, and FABD
    /// </summary>
    /// <param name="left">First operand vector.</param>
    /// <param name="right">Second operand vector.</param>
    public static Vector64<byte>    AbsoluteDifference(Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<byte>    AbsoluteDifference(Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector64<ushort>  AbsoluteDifference(Vector64<ushort>  left, Vector64<ushort>  right);
    public static Vector64<ushort>  AbsoluteDifference(Vector64<short>   left, Vector64<short>   right);
    public static Vector64<uint>    AbsoluteDifference(Vector64<uint>    left, Vector64<uint>    right);
    public static Vector64<uint>    AbsoluteDifference(Vector64<int>     left, Vector64<int>     right);
    public static Vector64<float>   AbsoluteDifference(Vector64<float>   left, Vector64<float>   right);
    public static Vector128<byte>   AbsoluteDifference(Vector128<byte>   left, Vector128<byte>   right);
    public static Vector128<byte>   AbsoluteDifference(Vector128<sbyte>  left, Vector128<sbyte>  right);
    public static Vector128<ushort> AbsoluteDifference(Vector128<ushort> left, Vector128<ushort> right);
    public static Vector128<ushort> AbsoluteDifference(Vector128<short>  left, Vector128<short>  right);
    public static Vector128<uint>   AbsoluteDifference(Vector128<uint>   left, Vector128<uint>   right);
    public static Vector128<uint>   AbsoluteDifference(Vector128<int>    left, Vector128<int>    right);
    public static Vector128<float>  AbsoluteDifference(Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector absolute difference add
    ///
    /// For each element result[elem] = addend[elem] + | left[elem] - right[elem] |
    ///
    /// NOTE(review): the overloads taking signed elements (sbyte, short, int) are declared to
    /// return the unsigned element type even though the addend is signed - confirm this is intended.
    ///
    /// Corresponds to vector forms of ARM64 SABA, UABA
    /// </summary>
    /// <param name="addend">Accumulator vector the absolute difference is added to.</param>
    /// <param name="left">First operand of the absolute difference.</param>
    /// <param name="right">Second operand of the absolute difference.</param>
    public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right);
    public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right);
    public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right);
    public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right);
    public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right);
    public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right);
    public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right);
    public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right);
    public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right);
    public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right);

    /// <summary>
    /// Vector add pairwise
    /// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
    /// (adjacent pairs of left fill the lower half of the result, adjacent pairs of right fill the upper half)
    /// Corresponds to vector forms of ARM64 ADDP, and FADDP
    /// </summary>
    /// <param name="left">Vector whose adjacent element pairs produce the lower half of the result.</param>
    /// <param name="right">Vector whose adjacent element pairs produce the upper half of the result.</param>
    public static Vector64<byte>   AddPairwise(Vector64<byte>  left, Vector64<byte>  right) ;
    public static Vector64<sbyte>  AddPairwise(Vector64<sbyte>  left, Vector64<sbyte>  right) ;
    public static Vector64<ushort> AddPairwise(Vector64<ushort>  left, Vector64<ushort>  right) ;
    public static Vector64<short>  AddPairwise(Vector64<short>  left, Vector64<short>  right) ;
    public static Vector64<int>    AddPairwise(Vector64<int>  left, Vector64<int>  right) ;
    public static Vector64<uint>   AddPairwise(Vector64<uint>  left, Vector64<uint>  right) ;
    public static Vector64<float>  AddPairwise(Vector64<float>  left, Vector64<float>  right) ;

    /// <summary>
    /// Vector extract from pair of vectors
    /// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
    ///
    /// NOTE(review): the formula above uses left/right/index while the parameters are named
    /// upper/lower/byteIndex. Presumably index is byteIndex; which of upper/lower plays the
    /// role of left should be confirmed against the EXT operand order.
    ///
    /// Note: index (the byteIndex parameter) must be a JIT time const expression which can be used to populate the literal immediate field
    ///
    /// NOTE(review): ExtractVector128 declares float and double overloads while ExtractVector64
    /// declares no floating-point overloads - confirm this asymmetry is intended.
    ///
    /// Corresponds to vector forms of ARM64 EXT
    /// </summary>
    public static Vector64<byte>   ExtractVector64(Vector64<byte>  upper, Vector64<byte>  lower, byte byteIndex);
    public static Vector64<sbyte>  ExtractVector64(Vector64<sbyte>  upper, Vector64<sbyte>  lower, byte byteIndex);
    public static Vector64<short>  ExtractVector64(Vector64<short>  upper, Vector64<short>  lower, byte byteIndex);
    public static Vector64<ushort> ExtractVector64(Vector64<ushort>  upper, Vector64<ushort>  lower, byte byteIndex);
    public static Vector64<int>    ExtractVector64(Vector64<int>  upper, Vector64<int>  lower, byte byteIndex);
    public static Vector64<uint>   ExtractVector64(Vector64<uint>  upper, Vector64<uint>  lower, byte byteIndex);

    public static Vector128<byte>   ExtractVector128(Vector128<byte> upper, Vector128<byte> lower, byte byteIndex);
    public static Vector128<sbyte>  ExtractVector128(Vector128<sbyte> upper, Vector128<sbyte> lower, byte byteIndex);
    public static Vector128<short>  ExtractVector128(Vector128<short> upper, Vector128<short> lower, byte byteIndex);
    public static Vector128<ushort> ExtractVector128(Vector128<ushort> upper, Vector128<ushort> lower, byte byteIndex);
    public static Vector128<int>    ExtractVector128(Vector128<int> upper, Vector128<int> lower, byte byteIndex);
    public static Vector128<uint>   ExtractVector128(Vector128<uint> upper, Vector128<uint> lower, byte byteIndex);
    public static Vector128<long>   ExtractVector128(Vector128<long> upper, Vector128<long> lower, byte byteIndex);
    public static Vector128<ulong>  ExtractVector128(Vector128<ulong> upper, Vector128<ulong> lower, byte byteIndex);
    public static Vector128<float>  ExtractVector128(Vector128<float> upper, Vector128<float> lower, byte byteIndex);
    public static Vector128<double> ExtractVector128(Vector128<double> upper, Vector128<double> lower, byte byteIndex);

    /// <summary>
    /// Vector max number (per-element maximum as computed by FMAXNM)
    /// Corresponds to vector forms of ARM64 FMAXNM
    /// </summary>
    /// <param name="left">First operand vector.</param>
    /// <param name="right">Second operand vector.</param>
    public static Vector64<float>   MaxNumber(Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  MaxNumber(Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector max pairwise
    ///
    /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
    /// (adjacent pairs of left fill the lower half of the result, adjacent pairs of right fill the upper half)
    ///
    /// Corresponds to vector forms of ARM64 SMAXP, UMAXP, and FMAXP
    /// </summary>
    /// <param name="left">Vector whose adjacent element pairs produce the lower half of the result.</param>
    /// <param name="right">Vector whose adjacent element pairs produce the upper half of the result.</param>
    public static Vector64<byte>    MaxPairwise(Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<sbyte>   MaxPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector64<ushort>  MaxPairwise(Vector64<ushort>  left, Vector64<ushort>  right);
    public static Vector64<short>   MaxPairwise(Vector64<short>   left, Vector64<short>   right);
    public static Vector64<uint>    MaxPairwise(Vector64<uint>    left, Vector64<uint>    right);
    public static Vector64<int>     MaxPairwise(Vector64<int>     left, Vector64<int>     right);
    public static Vector64<float>   MaxPairwise(Vector64<float>   left, Vector64<float>   right);

    /// <summary>
    /// Vector min number (per-element minimum as computed by FMINNM)
    /// Corresponds to vector forms of ARM64 FMINNM
    /// </summary>
    /// <param name="left">First operand vector.</param>
    /// <param name="right">Second operand vector.</param>
    public static Vector64<float>   MinNumber(Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  MinNumber(Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector min pairwise
    ///
    /// Adjacent element pairs are reduced to their minimum; pairs taken from left fill the
    /// lower half of the result and pairs taken from right fill the upper half.
    ///
    /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
    ///
    /// Corresponds to vector forms of ARM64 SMINP, UMINP, and FMINP
    /// </summary>
    public static Vector64<byte>    MinPairwise(Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<sbyte>   MinPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector64<ushort>  MinPairwise(Vector64<ushort>  left, Vector64<ushort>  right);
    public static Vector64<short>   MinPairwise(Vector64<short>   left, Vector64<short>   right);
    public static Vector64<uint>    MinPairwise(Vector64<uint>    left, Vector64<uint>    right);
    public static Vector64<int>     MinPairwise(Vector64<int>     left, Vector64<int>     right);
    public static Vector64<float>   MinPairwise(Vector64<float>   left, Vector64<float>   right);

    /// <summary>
    /// Vector multiply add
    ///
    /// For each element result[elem] = addend[elem] + left[elem] * right[elem]
    ///
    /// Corresponds to vector forms of ARM64 MLA
    /// </summary>
    public static Vector64<byte>    MultiplyAdd(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<sbyte>   MultiplyAdd(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector64<ushort>  MultiplyAdd(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right);
    public static Vector64<short>   MultiplyAdd(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right);
    public static Vector64<uint>    MultiplyAdd(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right);
    public static Vector64<int>     MultiplyAdd(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right);
    public static Vector128<byte>   MultiplyAdd(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right);
    public static Vector128<sbyte>  MultiplyAdd(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right);
    public static Vector128<ushort> MultiplyAdd(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right);
    public static Vector128<short>  MultiplyAdd(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right);
    public static Vector128<uint>   MultiplyAdd(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right);
    public static Vector128<int>    MultiplyAdd(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right);

    /// <summary>
    /// Vector multiply add by element
    ///
    /// Every element of left is multiplied by the single element of right selected by rightIndex.
    ///
    /// For each element result[elem] = addend[elem] + left[elem] * right[rightIndex]
    ///
    /// Corresponds to vector forms of ARM64 MLA
    /// </summary>
    public static Vector64<byte>    MultiplyAddBySelectedScalar(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right, byte rightIndex);
    public static Vector64<sbyte>   MultiplyAddBySelectedScalar(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right, byte rightIndex);
    public static Vector64<ushort>  MultiplyAddBySelectedScalar(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right, byte rightIndex);
    public static Vector64<short>   MultiplyAddBySelectedScalar(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right, byte rightIndex);
    public static Vector64<uint>    MultiplyAddBySelectedScalar(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right, byte rightIndex);
    public static Vector64<int>     MultiplyAddBySelectedScalar(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right, byte rightIndex);
    public static Vector128<byte>   MultiplyAddBySelectedScalar(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right, byte rightIndex);
    public static Vector128<sbyte>  MultiplyAddBySelectedScalar(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right, byte rightIndex);
    public static Vector128<ushort> MultiplyAddBySelectedScalar(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
    public static Vector128<short>  MultiplyAddBySelectedScalar(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right, byte rightIndex);
    public static Vector128<uint>   MultiplyAddBySelectedScalar(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right, byte rightIndex);
    public static Vector128<int>    MultiplyAddBySelectedScalar(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right, byte rightIndex);

    /// <summary>
    /// Vector multiply subtract
    ///
    /// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
    ///
    /// Corresponds to vector forms of ARM64 MLS
    /// </summary>
    public static Vector64<byte>    MultiplySubtract(Vector64<byte>    minuend, Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<sbyte>   MultiplySubtract(Vector64<sbyte>   minuend, Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector64<ushort>  MultiplySubtract(Vector64<ushort>  minuend, Vector64<ushort>  left, Vector64<ushort>  right);
    public static Vector64<short>   MultiplySubtract(Vector64<short>   minuend, Vector64<short>   left, Vector64<short>   right);
    public static Vector64<uint>    MultiplySubtract(Vector64<uint>    minuend, Vector64<uint>    left, Vector64<uint>    right);
    public static Vector64<int>     MultiplySubtract(Vector64<int>     minuend, Vector64<int>     left, Vector64<int>     right);
    public static Vector128<byte>   MultiplySubtract(Vector128<byte>   minuend, Vector128<byte>   left, Vector128<byte>   right);
    public static Vector128<sbyte>  MultiplySubtract(Vector128<sbyte>  minuend, Vector128<sbyte>  left, Vector128<sbyte>  right);
    public static Vector128<ushort> MultiplySubtract(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right);
    public static Vector128<short>  MultiplySubtract(Vector128<short>  minuend, Vector128<short>  left, Vector128<short>  right);
    public static Vector128<uint>   MultiplySubtract(Vector128<uint>   minuend, Vector128<uint>   left, Vector128<uint>   right);
    public static Vector128<int>    MultiplySubtract(Vector128<int>    minuend, Vector128<int>    left, Vector128<int>    right);

    /// <summary>
    /// Vector multiply subtract by element
    ///
    /// Every element of left is multiplied by the single element of right selected by rightIndex.
    ///
    /// For each element result[elem] = minuend[elem] - left[elem] * right[rightIndex]
    ///
    /// Corresponds to vector forms of ARM64 MLS
    /// </summary>
    public static Vector64<byte>    MultiplySubtractBySelectedScalar(Vector64<byte>    minuend, Vector64<byte>    left, Vector64<byte>    right, byte rightIndex);
    public static Vector64<sbyte>   MultiplySubtractBySelectedScalar(Vector64<sbyte>   minuend, Vector64<sbyte>   left, Vector64<sbyte>   right, byte rightIndex);
    public static Vector64<ushort>  MultiplySubtractBySelectedScalar(Vector64<ushort>  minuend, Vector64<ushort>  left, Vector64<ushort>  right, byte rightIndex);
    public static Vector64<short>   MultiplySubtractBySelectedScalar(Vector64<short>   minuend, Vector64<short>   left, Vector64<short>   right, byte rightIndex);
    public static Vector64<uint>    MultiplySubtractBySelectedScalar(Vector64<uint>    minuend, Vector64<uint>    left, Vector64<uint>    right, byte rightIndex);
    public static Vector64<int>     MultiplySubtractBySelectedScalar(Vector64<int>     minuend, Vector64<int>     left, Vector64<int>     right, byte rightIndex);
    public static Vector128<byte>   MultiplySubtractBySelectedScalar(Vector128<byte>   minuend, Vector128<byte>   left, Vector128<byte>   right, byte rightIndex);
    public static Vector128<sbyte>  MultiplySubtractBySelectedScalar(Vector128<sbyte>  minuend, Vector128<sbyte>  left, Vector128<sbyte>  right, byte rightIndex);
    public static Vector128<ushort> MultiplySubtractBySelectedScalar(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
    public static Vector128<short>  MultiplySubtractBySelectedScalar(Vector128<short>  minuend, Vector128<short>  left, Vector128<short>  right, byte rightIndex);
    public static Vector128<uint>   MultiplySubtractBySelectedScalar(Vector128<uint>   minuend, Vector128<uint>   left, Vector128<uint>   right, byte rightIndex);
    public static Vector128<int>    MultiplySubtractBySelectedScalar(Vector128<int>    minuend, Vector128<int>    left, Vector128<int>    right, byte rightIndex);

    /// <summary>
    /// Vector fused multiply add
    ///
    /// For each element result[elem] = addend[elem] + left[elem] * right[elem]
    ///
    /// Corresponds to vector forms of ARM64 FMLA
    /// </summary>
    public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   addend, Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  addend, Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector fused multiply subtract
    ///
    /// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
    ///
    /// Corresponds to vector forms of ARM64 FMLS
    /// </summary>
    public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   minuend, Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  minuend, Vector128<float>  left, Vector128<float>  right);


    /// <summary>
    /// Vector polynomial multiply
    /// Corresponds to vector forms of ARM64 PMUL
    /// </summary>
    public static Vector64<byte>    PolynomialMultiply(Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<sbyte>   PolynomialMultiply(Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector128<byte>   PolynomialMultiply(Vector128<byte>   left, Vector128<byte>   right);
    public static Vector128<sbyte>  PolynomialMultiply(Vector128<sbyte>  left, Vector128<sbyte>  right);

    /// <summary>
    /// Vector reciprocal estimate
    ///
    /// See FRECPE docs
    ///
    /// Corresponds to vector forms of ARM64 FRECPE
    /// </summary>
    public static Vector64<float>   ReciprocalEstimate(Vector64<float>   value);
    public static Vector128<float>  ReciprocalEstimate(Vector128<float>  value);

    /// <summary>
    /// Vector reciprocal step
    ///
    /// See FRECPS docs
    ///
    /// Corresponds to vector forms of ARM64 FRECPS
    /// </summary>
    public static Vector64<float>   ReciprocalStep(Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  ReciprocalStep(Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector reciprocal square root estimate
    ///
    /// See FRSQRTE docs
    ///
    /// Corresponds to vector forms of ARM64 FRSQRTE
    /// </summary>
    public static Vector64<float>   ReciprocalSquareRootEstimate(Vector64<float>   value);
    public static Vector128<float>  ReciprocalSquareRootEstimate(Vector128<float>  value);

    /// <summary>
    /// Vector reciprocal square root step
    ///
    /// See FRSQRTS docs
    ///
    /// Corresponds to vector forms of ARM64 FRSQRTS
    /// </summary>
    public static Vector64<float>   ReciprocalSquareRootStep(Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  ReciprocalSquareRootStep(Vector128<float>  left, Vector128<float>  right);

    public partial class Arm64
    {
        /// <summary>
        /// Vector CompareGreaterThanOrEqual
        /// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
        /// Corresponds to vector forms of ARM64 FACGE
        /// </summary>
        public static Vector128<double> AbsoluteCompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector CompareGreaterThan
        ///
        /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
        ///
        /// Corresponds to vector forms of ARM64 FACGT
        /// </summary>
        public static Vector128<double> AbsoluteCompareGreaterThan(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector absolute difference
        /// Corresponds to the vector form of ARM64 FABD (the integer forms SABD and UABD do not apply to double elements)
        /// </summary>
        public static Vector128<double> AbsoluteDifference(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector add pairwise
        /// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
        /// Corresponds to vector forms of ARM64 ADDP, and FADDP
        /// </summary>
        public static Vector128<byte>   AddPairwise(Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<sbyte>  AddPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right);
        public static Vector128<ushort> AddPairwise(Vector128<ushort> left, Vector128<ushort> right);
        public static Vector128<short>  AddPairwise(Vector128<short>  left, Vector128<short>  right);
        public static Vector128<uint>   AddPairwise(Vector128<uint>   left, Vector128<uint>   right);
        public static Vector128<int>    AddPairwise(Vector128<int>    left, Vector128<int>    right);
        public static Vector128<long>   AddPairwise(Vector128<long>   left, Vector128<long>   right);
        public static Vector128<ulong>  AddPairwise(Vector128<ulong>  left, Vector128<ulong>  right);
        public static Vector128<float>  AddPairwise(Vector128<float>  left, Vector128<float>  right);
        public static Vector128<double> AddPairwise(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector extract from pair of vectors
        /// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
        ///
        /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
        ///
        /// Corresponds to vector forms of ARM64 EXT
        /// </summary>
        public static Vector128<double> ExtractVector(Vector128<double> left, Vector128<double> right, byte index);

        /// <summary>
        /// Vector add across vector elements
        /// Corresponds to vector forms of ARM64 ADDV
        /// </summary>
        public static Vector64<byte>   AddAcross(Vector64<byte>    value);
        public static Vector64<sbyte>  AddAcross(Vector64<sbyte>   value);
        public static Vector64<ushort> AddAcross(Vector64<ushort>  value);
        public static Vector64<short>  AddAcross(Vector64<short>   value);
        public static Vector64<uint>   AddAcross(Vector64<uint>    value);
        public static Vector64<int>    AddAcross(Vector64<int>     value);
        public static Vector64<byte>   AddAcross(Vector128<byte>   value);
        public static Vector64<sbyte>  AddAcross(Vector128<sbyte>  value);
        public static Vector64<ushort> AddAcross(Vector128<ushort> value);
        public static Vector64<short>  AddAcross(Vector128<short>  value);
        public static Vector64<uint>   AddAcross(Vector128<uint>   value);
        public static Vector64<int>    AddAcross(Vector128<int>    value);

        /// <summary>
        /// Vector max numeric
        /// Corresponds to vector forms of ARM64 FMAXNM
        /// </summary>
        public static Vector128<double> MaxNumber(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector max numeric pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 FMAXNMP
        /// </summary>
        public static Vector64<float>   MaxNumberPairwise(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  MaxNumberPairwise(Vector128<float>  left, Vector128<float>  right);
        public static Vector128<double> MaxNumberPairwise(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector max numeric across
        ///
        /// result = max(value[0], ... , value[length -1])
        ///
        /// Corresponds to vector forms of ARM64 FMAXNMV
        /// </summary>
        public static Vector64<float> MaxNumberAcross(Vector128<float>  value);

        /// <summary>
        /// Vector max pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 SMAXP, UMAXP, and FMAXP
        /// </summary>
        public static Vector128<byte>   MaxPairwise(Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<sbyte>  MaxPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right);
        public static Vector128<ushort> MaxPairwise(Vector128<ushort> left, Vector128<ushort> right);
        public static Vector128<short>  MaxPairwise(Vector128<short>  left, Vector128<short>  right);
        public static Vector128<uint>   MaxPairwise(Vector128<uint>   left, Vector128<uint>   right);
        public static Vector128<int>    MaxPairwise(Vector128<int>    left, Vector128<int>    right);
        public static Vector128<float>  MaxPairwise(Vector128<float>  left, Vector128<float>  right);
        public static Vector128<double> MaxPairwise(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector max across
        ///
        /// result = max(value[0], ... , value[length -1])
        ///
        /// Corresponds to vector forms of ARM64 SMAXV, UMAXV, and FMAXV
        /// </summary>
        public static Vector64<byte>   MaxAcross(Vector64<byte>    value);
        public static Vector64<sbyte>  MaxAcross(Vector64<sbyte>   value);
        public static Vector64<ushort> MaxAcross(Vector64<ushort>  value);
        public static Vector64<short>  MaxAcross(Vector64<short>   value);
        public static Vector64<uint>   MaxAcross(Vector64<uint>    value);
        public static Vector64<int>    MaxAcross(Vector64<int>     value);
        public static Vector64<float>  MaxAcross(Vector64<float>   value);
        public static Vector64<byte>   MaxAcross(Vector128<byte>   value);
        public static Vector64<sbyte>  MaxAcross(Vector128<sbyte>  value);
        public static Vector64<ushort> MaxAcross(Vector128<ushort> value);
        public static Vector64<short>  MaxAcross(Vector128<short>  value);
        public static Vector64<uint>   MaxAcross(Vector128<uint>   value);
        public static Vector64<int>    MaxAcross(Vector128<int>    value);
        public static Vector64<ulong>  MaxAcross(Vector128<ulong>  value);
        public static Vector64<long>   MaxAcross(Vector128<long>   value);
        public static Vector64<float>  MaxAcross(Vector128<float>  value);
        public static Vector64<double> MaxAcross(Vector128<double> value);

// Not reviewed:
//
// ///

// /// Vector min numeric
// /// Corresponds to vector forms of ARM64 FMINNM
// ///

// public static Vector128 MinNumber(Vector128 left, Vector128 right);
//
// ///

// /// Vector min numeric pairwise
// ///
// /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
// ///
// /// Corresponds to vector forms of ARM64 FMINNMP
// ///

// public static Vector64 MinNumberPairwise(Vector64 left, Vector64 right);
// public static Vector128 MinNumberPairwise(Vector128 left, Vector128 right);
// public static Vector128 MinNumberPairwise(Vector128 left, Vector128 right);
//
// ///

// /// Vector min numeric across
// ///
// /// result = min(value[0], ... , value[length -1])
// ///
// /// Corresponds to vector forms of ARM64 FMINNMV
// ///

// public static float MinNumberAcross(Vector128 value);
//
// ///

// /// Vector min pairwise
// ///
// /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
// ///
// /// Corresponds to vector forms of ARM64 SMINP, UMINP, and FMINP
// ///

// public static Vector128 MinPairwise(Vector128 left, Vector128 right);
// public static Vector128 MinPairwise(Vector128 left, Vector128 right);
// public static Vector128 MinPairwise(Vector128 left, Vector128 right);
// public static Vector128 MinPairwise(Vector128 left, Vector128 right);
// public static Vector128 MinPairwise(Vector128 left, Vector128 right);
// public static Vector128 MinPairwise(Vector128 left, Vector128 right);
// public static Vector128 MinPairwise(Vector128 left, Vector128 right);
// public static Vector128 MinPairwise(Vector128 left, Vector128 right);
//
// ///

// /// Vector min across
// ///
// /// result = min(value[0], ... , value[length -1])
// ///
// /// Corresponds to vector forms of ARM64 SMINV, UMINV, and FMINV
// ///

// public static byte MinAcross(Vector64 value);
// public static sbyte MinAcross(Vector64 value);
// public static ushort MinAcross(Vector64 value);
// public static short MinAcross(Vector64 value);
// public static uint MinAcross(Vector64 value);
// public static int MinAcross(Vector64 value);
// public static float MinAcross(Vector64 value);
// public static byte MinAcross(Vector128 value);
// public static sbyte MinAcross(Vector128 value);
// public static ushort MinAcross(Vector128 value);
// public static short MinAcross(Vector128 value);
// public static uint MinAcross(Vector128 value);
// public static int MinAcross(Vector128 value);
// public static float MinAcross(Vector128 value);
// public static double MinAcross(Vector128 value);
//
// ///

// /// Vector fused multiply add
// ///
// /// For each element result[elem] = acc[elem] + left[elem] * right[elem]
// ///
// /// Corresponds to vector forms of ARM64 FMLA
// ///

// public static Vector128 FusedMultiplyAdd(Vector128 acc, Vector128 left, Vector128 right);
//
// ///

// /// Vector fused multiply add by element
// ///
// /// For each element result[elem] = acc[elem] + left[elem] * right
// ///
// /// Corresponds to vector forms of ARM64 FMLA
// ///

// public static Vector64 FusedMultiplyAdd(Vector64 acc, Vector64 left, float right);
// public static Vector128 FusedMultiplyAdd(Vector128 acc, Vector128 left, float right);
// public static Vector128 FusedMultiplyAdd(Vector128 acc, Vector128 left, float right);
//
// ///

// /// Vector fused multiply subtract
// ///
// /// For each element result[elem] = acc[elem] - left[elem] * right[elem]
// ///
// /// Corresponds to vector forms of ARM64 FMLS
// ///

// public static Vector128 FusedMultiplySubtract(Vector128 acc, Vector128 left, Vector128 right);
//
// ///

// /// Vector fused multiply subtract by element
// ///
// /// For each element result[elem] = acc[elem] - left[elem] * right
// ///
// /// Corresponds to vector forms of ARM64 FMLS
// ///

// public static Vector64 FusedMultiplySubtract(Vector64 acc, Vector64 left, float right);
// public static Vector128 FusedMultiplySubtract(Vector128 acc, Vector128 left, float right);
// public static Vector128 FusedMultiplySubtract(Vector128 acc, Vector128 left, float right);
//
// ///

// /// Vector multiply extend
// ///
// /// For each element result[elem] = left[elem] * right[elem]
// /// Handle extend special cases zero and infinite. FMULX
// ///
// /// Corresponds to vector forms of ARM64 FMULX
// ///

// public static Vector64 MultiplyExtend(Vector64 left, Vector64 right);
// public static Vector128 MultiplyExtend(Vector128 left, Vector128 right);
// public static Vector128 MultiplyExtend(Vector128 left, Vector128 right);
//
// ///

// /// Vector multiply extend by element
// ///
// /// For each element result[elem] = left[elem] * right
// /// Handle extend special cases zero and infinite. FMULX
// ///
// /// Corresponds to vector forms of ARM64 FMULX
// ///

// public static Vector64 MultiplyExtend(Vector64 left, float right);
// public static Vector128 MultiplyExtend(Vector128 left, float right);
// public static Vector128 MultiplyExtend(Vector128 left, double right);
//
// /// Vector reciprocal estimate
// ///
// /// See FRECPE docs
// ///
// /// Corresponds to vector forms of ARM64 FRECPE
// ///
// public static Vector128 ReciprocalEstimate(Vector128 value);
//
// ///

// /// Vector reciprocal step
// ///
// /// See FRECPS docs
// ///
// /// Corresponds to vector forms of ARM64 FRECPS
// ///

// public static Vector128 ReciprocalStep(Vector128 left, Vector128 right, byte index);
//
// ///

// /// Vector reciprocal square root estimate
// ///
// /// See FRSQRTE docs
// ///
// /// Corresponds to vector forms of ARM64 FRSQRTE
// ///

// public static Vector128 ReciprocalSquareRootEstimate(Vector128 value);
//
// ///

// /// Vector reciprocal square root step
// ///
// /// See FRSQRTS docs
// ///
// /// Corresponds to vector forms of ARM64 FRSQRTS
// ///

// public static Vector128 ReciprocalSquareRootStep(Vector128 left, Vector128 right, byte index);
//
// ///

// /// Vector reverse byte bits
// /// Corresponds to vector forms of ARM64 RBIT
// ///

// public static Vector64 ReverseElementBits(Vector64 value);
// public static Vector64 ReverseElementBits(Vector64 value);
// public static Vector128 ReverseElementBits(Vector128 value);
// public static Vector128 ReverseElementBits(Vector128 value);
}
}
}

terrajobst on 18 Feb 2020

We finished the review today. This comment repeats all the APIs and comments.
Check whether MaxPairwise/MinPairwise should also include forms for Vector128
Should the arguments for ReciprocalStep and ReciprocalSquareRootStep be something more specific than left and right?
Looks like ReverseElementBytes needs more work, as well as clarity on support
We skipped Arm32 because it's not going to be implemented for .NET 5
ExtractVector shouldn't have overloads for float and double, it could end up silently modifying/normalizing/corrupting the floating point types
MaxNumericPairwiseScalar should be MaxNumberPairwiseScalar. Some folks raised concerns around PairwiseScalar being confusing, but it matches the ISA name and we can't think of a better name
We didn't review all the APIs (I commented out the ones below). Tanner will see whether they are just applying a pattern or are net-new APIs, in which case we'll take another look.

```C#
namespace System.Runtime.Intrinsics.Arm
{
public partial class AdvSimd
{
/// <summary>
/// Vector CompareGreaterThanOrEqual
/// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
/// Corresponds to vector forms of ARM64 FACGE
/// </summary>
public static Vector64<float>   AbsoluteCompareGreaterThanOrEqual(Vector64<float>   left, Vector64<float>   right);
public static Vector128<float>  AbsoluteCompareGreaterThanOrEqual(Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector CompareGreaterThan
    ///
    /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
    ///
    /// Corresponds to vector forms of ARM64 FACGT
    /// </summary>
    public static Vector64<float>   AbsoluteCompareGreaterThan(Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  AbsoluteCompareGreaterThan(Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector absolute difference
    /// Corresponds to vector forms of ARM64 SABD, UABD, and FABD
    /// </summary>
    public static Vector64<byte>    AbsoluteDifference(Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<byte>    AbsoluteDifference(Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector64<ushort>  AbsoluteDifference(Vector64<ushort>  left, Vector64<ushort>  right);
    public static Vector64<ushort>  AbsoluteDifference(Vector64<short>   left, Vector64<short>   right);
    public static Vector64<uint>    AbsoluteDifference(Vector64<uint>    left, Vector64<uint>    right);
    public static Vector64<uint>    AbsoluteDifference(Vector64<int>     left, Vector64<int>     right);
    public static Vector64<float>   AbsoluteDifference(Vector64<float>   left, Vector64<float>   right);
    public static Vector128<byte>   AbsoluteDifference(Vector128<byte>   left, Vector128<byte>   right);
    public static Vector128<byte>   AbsoluteDifference(Vector128<sbyte>  left, Vector128<sbyte>  right);
    public static Vector128<ushort> AbsoluteDifference(Vector128<ushort> left, Vector128<ushort> right);
    public static Vector128<ushort> AbsoluteDifference(Vector128<short>  left, Vector128<short>  right);
    public static Vector128<uint>   AbsoluteDifference(Vector128<uint>   left, Vector128<uint>   right);
    public static Vector128<uint>   AbsoluteDifference(Vector128<int>    left, Vector128<int>    right);
    public static Vector128<float>  AbsoluteDifference(Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector absolute difference add
    ///
    /// For each element result[elem] = addend[elem] + | left[elem] - right[elem] |
    ///
    /// Corresponds to vector forms of ARM64 SABA, UABA
    /// </summary>
    public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<byte>    AbsoluteDifferenceAdd(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right);
    public static Vector64<ushort>  AbsoluteDifferenceAdd(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right);
    public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right);
    public static Vector64<uint>    AbsoluteDifferenceAdd(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right);
    public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right);
    public static Vector128<byte>   AbsoluteDifferenceAdd(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right);
    public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right);
    public static Vector128<ushort> AbsoluteDifferenceAdd(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right);
    public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right);
    public static Vector128<uint>   AbsoluteDifferenceAdd(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right);

    /// <summary>
    /// Vector add pairwise
    /// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
    /// Corresponds to vector forms of ARM64 ADDP and FADDP
    /// </summary>
    public static Vector64<byte>   AddPairwise(Vector64<byte>  left, Vector64<byte>  right) ;
    public static Vector64<sbyte>  AddPairwise(Vector64<sbyte>  left, Vector64<sbyte>  right) ;
    public static Vector64<ushort> AddPairwise(Vector64<ushort>  left, Vector64<ushort>  right) ;
    public static Vector64<short>  AddPairwise(Vector64<short>  left, Vector64<short>  right) ;
    public static Vector64<int>    AddPairwise(Vector64<int>  left, Vector64<int>  right) ;
    public static Vector64<uint>   AddPairwise(Vector64<uint>  left, Vector64<uint>  right) ;
    public static Vector64<float>  AddPairwise(Vector64<float>  left, Vector64<float>  right) ;

    /// <summary>
    /// Vector extract from pair of vectors
    /// For each byte result[byte] = byte + byteIndex < result.Length ? upper[byte + byteIndex] : lower[byte + byteIndex - result.Length]
    ///
    /// Note: byteIndex must be a JIT time const expression which can be used to populate the literal immediate field
    ///
    /// NOTE(review): the formula was originally written in terms of left/right/index; those names are mapped
    /// positionally onto upper/lower/byteIndex here - confirm the operand order against the EXT documentation.
    ///
    /// Corresponds to vector forms of ARM64 EXT
    /// </summary>
    public static Vector64<byte>   ExtractVector64(Vector64<byte>  upper, Vector64<byte>  lower, byte byteIndex);
    public static Vector64<sbyte>  ExtractVector64(Vector64<sbyte>  upper, Vector64<sbyte>  lower, byte byteIndex);
    public static Vector64<short>  ExtractVector64(Vector64<short>  upper, Vector64<short>  lower, byte byteIndex);
    public static Vector64<ushort> ExtractVector64(Vector64<ushort>  upper, Vector64<ushort>  lower, byte byteIndex);
    public static Vector64<int>    ExtractVector64(Vector64<int>  upper, Vector64<int>  lower, byte byteIndex);
    public static Vector64<uint>   ExtractVector64(Vector64<uint>  upper, Vector64<uint>  lower, byte byteIndex);

    public static Vector128<byte>   ExtractVector128(Vector128<byte> upper, Vector128<byte> lower, byte byteIndex);
    public static Vector128<sbyte>  ExtractVector128(Vector128<sbyte> upper, Vector128<sbyte> lower, byte byteIndex);
    public static Vector128<short>  ExtractVector128(Vector128<short> upper, Vector128<short> lower, byte byteIndex);
    public static Vector128<ushort> ExtractVector128(Vector128<ushort> upper, Vector128<ushort> lower, byte byteIndex);
    public static Vector128<int>    ExtractVector128(Vector128<int> upper, Vector128<int> lower, byte byteIndex);
    public static Vector128<uint>   ExtractVector128(Vector128<uint> upper, Vector128<uint> lower, byte byteIndex);
    public static Vector128<long>   ExtractVector128(Vector128<long> upper, Vector128<long> lower, byte byteIndex);
    public static Vector128<ulong>  ExtractVector128(Vector128<ulong> upper, Vector128<ulong> lower, byte byteIndex);
    public static Vector128<float>  ExtractVector128(Vector128<float> upper, Vector128<float> lower, byte byteIndex);
    public static Vector128<double> ExtractVector128(Vector128<double> upper, Vector128<double> lower, byte byteIndex);

    /// <summary>
    /// Vector max numeric
    /// Corresponds to vector forms of ARM64 FMAXNM
    /// </summary>
    public static Vector64<float>   MaxNumber(Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  MaxNumber(Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector max pairwise
    ///
    /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
    ///
    /// Corresponds to vector forms of ARM64 SMAXP, UMAXP, and FMAXP
    /// </summary>
    public static Vector64<byte>    MaxPairwise(Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<sbyte>   MaxPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector64<ushort>  MaxPairwise(Vector64<ushort>  left, Vector64<ushort>  right);
    public static Vector64<short>   MaxPairwise(Vector64<short>   left, Vector64<short>   right);
    public static Vector64<uint>    MaxPairwise(Vector64<uint>    left, Vector64<uint>    right);
    public static Vector64<int>     MaxPairwise(Vector64<int>     left, Vector64<int>     right);
    public static Vector64<float>   MaxPairwise(Vector64<float>   left, Vector64<float>   right);

    /// <summary>
    /// Vector min numeric
    /// Corresponds to vector forms of ARM64 FMINNM
    /// </summary>
    public static Vector64<float>   MinNumber(Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  MinNumber(Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector min pairwise
    ///
    /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
    ///
    /// Corresponds to vector forms of ARM64 SMINP, UMINP, and FMINP
    /// </summary>
    public static Vector64<byte>    MinPairwise(Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<sbyte>   MinPairwise(Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector64<ushort>  MinPairwise(Vector64<ushort>  left, Vector64<ushort>  right);
    public static Vector64<short>   MinPairwise(Vector64<short>   left, Vector64<short>   right);
    public static Vector64<uint>    MinPairwise(Vector64<uint>    left, Vector64<uint>    right);
    public static Vector64<int>     MinPairwise(Vector64<int>     left, Vector64<int>     right);
    public static Vector64<float>   MinPairwise(Vector64<float>   left, Vector64<float>   right);

    /// <summary>
    /// Vector multiply add
    ///
    /// For each element result[elem] = addend[elem] + left[elem] * right[elem]
    ///
    /// Corresponds to vector forms of ARM64 MLA
    /// </summary>
    public static Vector64<byte>    MultiplyAdd(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<sbyte>   MultiplyAdd(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector64<ushort>  MultiplyAdd(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right);
    public static Vector64<short>   MultiplyAdd(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right);
    public static Vector64<uint>    MultiplyAdd(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right);
    public static Vector64<int>     MultiplyAdd(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right);
    public static Vector128<byte>   MultiplyAdd(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right);
    public static Vector128<sbyte>  MultiplyAdd(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right);
    public static Vector128<ushort> MultiplyAdd(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right);
    public static Vector128<short>  MultiplyAdd(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right);
    public static Vector128<uint>   MultiplyAdd(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right);
    public static Vector128<int>    MultiplyAdd(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right);

    /// <summary>
    /// Vector multiply add by element
    ///
    /// For each element result[elem] = addend[elem] + left[elem] * right[rightIndex]
    ///
    /// Corresponds to vector forms of ARM64 MLA
    /// </summary>
    public static Vector64<byte>    MultiplyAddBySelectedScalar(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right, byte rightIndex);
    public static Vector64<sbyte>   MultiplyAddBySelectedScalar(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right, byte rightIndex);
    public static Vector64<ushort>  MultiplyAddBySelectedScalar(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right, byte rightIndex);
    public static Vector64<short>   MultiplyAddBySelectedScalar(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right, byte rightIndex);
    public static Vector64<uint>    MultiplyAddBySelectedScalar(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right, byte rightIndex);
    public static Vector64<int>     MultiplyAddBySelectedScalar(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right, byte rightIndex);
    public static Vector128<byte>   MultiplyAddBySelectedScalar(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right, byte rightIndex);
    public static Vector128<sbyte>  MultiplyAddBySelectedScalar(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right, byte rightIndex);
    public static Vector128<ushort> MultiplyAddBySelectedScalar(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
    public static Vector128<short>  MultiplyAddBySelectedScalar(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right, byte rightIndex);
    public static Vector128<uint>   MultiplyAddBySelectedScalar(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right, byte rightIndex);
    public static Vector128<int>    MultiplyAddBySelectedScalar(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right, byte rightIndex);

    /// <summary>
    /// Vector multiply subtract
    ///
    /// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
    ///
    /// Corresponds to vector forms of ARM64 MLS
    /// </summary>
    public static Vector64<byte>    MultiplySubtract(Vector64<byte>    minuend, Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<sbyte>   MultiplySubtract(Vector64<sbyte>   minuend, Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector64<ushort>  MultiplySubtract(Vector64<ushort>  minuend, Vector64<ushort>  left, Vector64<ushort>  right);
    public static Vector64<short>   MultiplySubtract(Vector64<short>   minuend, Vector64<short>   left, Vector64<short>   right);
    public static Vector64<uint>    MultiplySubtract(Vector64<uint>    minuend, Vector64<uint>    left, Vector64<uint>    right);
    public static Vector64<int>     MultiplySubtract(Vector64<int>     minuend, Vector64<int>     left, Vector64<int>     right);
    public static Vector128<byte>   MultiplySubtract(Vector128<byte>   minuend, Vector128<byte>   left, Vector128<byte>   right);
    public static Vector128<sbyte>  MultiplySubtract(Vector128<sbyte>  minuend, Vector128<sbyte>  left, Vector128<sbyte>  right);
    public static Vector128<ushort> MultiplySubtract(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right);
    public static Vector128<short>  MultiplySubtract(Vector128<short>  minuend, Vector128<short>  left, Vector128<short>  right);
    public static Vector128<uint>   MultiplySubtract(Vector128<uint>   minuend, Vector128<uint>   left, Vector128<uint>   right);
    public static Vector128<int>    MultiplySubtract(Vector128<int>    minuend, Vector128<int>    left, Vector128<int>    right);

    /// <summary>
    /// Vector multiply subtract by element
    ///
    /// For each element result[elem] = minuend[elem] - left[elem] * right[rightIndex]
    ///
    /// Corresponds to vector forms of ARM64 MLS
    /// </summary>
    public static Vector64<byte>    MultiplySubtractBySelectedScalar(Vector64<byte>    minuend, Vector64<byte>    left, Vector64<byte>    right, byte rightIndex);
    public static Vector64<sbyte>   MultiplySubtractBySelectedScalar(Vector64<sbyte>   minuend, Vector64<sbyte>   left, Vector64<sbyte>   right, byte rightIndex);
    public static Vector64<ushort>  MultiplySubtractBySelectedScalar(Vector64<ushort>  minuend, Vector64<ushort>  left, Vector64<ushort>  right, byte rightIndex);
    public static Vector64<short>   MultiplySubtractBySelectedScalar(Vector64<short>   minuend, Vector64<short>   left, Vector64<short>   right, byte rightIndex);
    public static Vector64<uint>    MultiplySubtractBySelectedScalar(Vector64<uint>    minuend, Vector64<uint>    left, Vector64<uint>    right, byte rightIndex);
    public static Vector64<int>     MultiplySubtractBySelectedScalar(Vector64<int>     minuend, Vector64<int>     left, Vector64<int>     right, byte rightIndex);
    public static Vector128<byte>   MultiplySubtractBySelectedScalar(Vector128<byte>   minuend, Vector128<byte>   left, Vector128<byte>   right, byte rightIndex);
    public static Vector128<sbyte>  MultiplySubtractBySelectedScalar(Vector128<sbyte>  minuend, Vector128<sbyte>  left, Vector128<sbyte>  right, byte rightIndex);
    public static Vector128<ushort> MultiplySubtractBySelectedScalar(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
    public static Vector128<short>  MultiplySubtractBySelectedScalar(Vector128<short>  minuend, Vector128<short>  left, Vector128<short>  right, byte rightIndex);
    public static Vector128<uint>   MultiplySubtractBySelectedScalar(Vector128<uint>   minuend, Vector128<uint>   left, Vector128<uint>   right, byte rightIndex);
    public static Vector128<int>    MultiplySubtractBySelectedScalar(Vector128<int>    minuend, Vector128<int>    left, Vector128<int>    right, byte rightIndex);

    /// <summary>
    /// Vector fused multiply add
    ///
    /// For each element result[elem] = addend[elem] + left[elem] * right[elem]
    ///
    /// Corresponds to vector forms of ARM64 FMLA
    /// </summary>
    public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   addend, Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  addend, Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector fused multiply subtract
    ///
    /// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
    ///
    /// Corresponds to vector forms of ARM64 FMLS
    /// </summary>
    public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   minuend, Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  minuend, Vector128<float>  left, Vector128<float>  right);


    /// <summary>
    /// Vector polynomial multiply
    /// Corresponds to vector forms of ARM64 PMUL
    /// </summary>
    public static Vector64<byte>    PolynomialMultiply(Vector64<byte>    left, Vector64<byte>    right);
    public static Vector64<sbyte>   PolynomialMultiply(Vector64<sbyte>   left, Vector64<sbyte>   right);
    public static Vector128<byte>   PolynomialMultiply(Vector128<byte>   left, Vector128<byte>   right);
    public static Vector128<sbyte>  PolynomialMultiply(Vector128<sbyte>  left, Vector128<sbyte>  right);

    /// <summary>
    /// Vector reciprocal estimate
    ///
    /// See FRECPE docs
    ///
    /// Corresponds to vector forms of ARM64 FRECPE
    /// </summary>
    public static Vector64<float>   ReciprocalEstimate(Vector64<float>   value);
    public static Vector128<float>  ReciprocalEstimate(Vector128<float>  value);

    /// <summary>
    /// Vector reciprocal step
    ///
    /// See FRECPS docs
    ///
    /// Corresponds to vector forms of ARM64 FRECPS
    /// </summary>
    public static Vector64<float>   ReciprocalStep(Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  ReciprocalStep(Vector128<float>  left, Vector128<float>  right);

    /// <summary>
    /// Vector reciprocal square root estimate
    ///
    /// See FRSQRTE docs
    ///
    /// Corresponds to vector forms of ARM64 FRSQRTE
    /// </summary>
    public static Vector64<float>   ReciprocalSquareRootEstimate(Vector64<float>   value);
    public static Vector128<float>  ReciprocalSquareRootEstimate(Vector128<float>  value);

    /// <summary>
    /// Vector reciprocal square root step
    ///
    /// See FRSQRTS docs
    ///
    /// Corresponds to vector forms of ARM64 FRSQRTS
    /// </summary>
    public static Vector64<float>   ReciprocalSquareRootStep(Vector64<float>   left, Vector64<float>   right);
    public static Vector128<float>  ReciprocalSquareRootStep(Vector128<float>  left, Vector128<float>  right);

    public partial class Arm64
    {
        /// <summary>
        /// Vector CompareGreaterThanOrEqual
        /// For each element result[elem] = (|left[elem]| >= |right[elem]|) ? ~0 : 0
        /// Corresponds to vector forms of ARM64 FACGE
        /// </summary>
        public static Vector128<double> AbsoluteCompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector CompareGreaterThan
        ///
        /// For each element result[elem] = (|left[elem]| > |right[elem]|) ? ~0 : 0
        ///
        /// Corresponds to vector forms of ARM64 FACGT
        /// </summary>
        public static Vector128<double> AbsoluteCompareGreaterThan(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector absolute difference
        /// Corresponds to vector forms of ARM64 SABD, UABD, and FABD
        /// </summary>
        public static Vector128<double> AbsoluteDifference(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector add pairwise
        /// For each element result[elem] = 2*elem < result.Length ? (left[2*elem] + left[2*elem + 1]) : (right[2*elem - result.Length] + right[2*elem + 1 - result.Length])
        /// Corresponds to vector forms of ARM64 ADDP and FADDP
        /// </summary>
        public static Vector128<byte>   AddPairwise(Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<sbyte>  AddPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right);
        public static Vector128<ushort> AddPairwise(Vector128<ushort> left, Vector128<ushort> right);
        public static Vector128<short>  AddPairwise(Vector128<short>  left, Vector128<short>  right);
        public static Vector128<uint>   AddPairwise(Vector128<uint>   left, Vector128<uint>   right);
        public static Vector128<int>    AddPairwise(Vector128<int>    left, Vector128<int>    right);
        public static Vector128<long>   AddPairwise(Vector128<long>   left, Vector128<long>   right);
        public static Vector128<ulong>  AddPairwise(Vector128<ulong>  left, Vector128<ulong>  right);
        public static Vector128<float>  AddPairwise(Vector128<float>  left, Vector128<float>  right);
        public static Vector128<double> AddPairwise(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector extract from pair of vectors
        /// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
        ///
        /// Note: index must be a JIT time const expression which can be used to populate the literal immediate field
        ///
        /// Corresponds to vector forms of ARM64 EXT
        /// </summary>
        public static Vector128<double> ExtractVector(Vector128<double> left, Vector128<double> right, byte index);

        /// <summary>
        /// Vector add across vector elements
        /// Corresponds to vector forms of ARM64 ADDV
        /// </summary>
        public static Vector64<byte>   AddAcross(Vector64<byte>    value);
        public static Vector64<sbyte>  AddAcross(Vector64<sbyte>   value);
        public static Vector64<ushort> AddAcross(Vector64<ushort>  value);
        public static Vector64<short>  AddAcross(Vector64<short>   value);
        public static Vector64<uint>   AddAcross(Vector64<uint>    value);
        public static Vector64<int>    AddAcross(Vector64<int>     value);
        public static Vector64<byte>   AddAcross(Vector128<byte>   value);
        public static Vector64<sbyte>  AddAcross(Vector128<sbyte>  value);
        public static Vector64<ushort> AddAcross(Vector128<ushort> value);
        public static Vector64<short>  AddAcross(Vector128<short>  value);
        public static Vector64<uint>   AddAcross(Vector128<uint>   value);
        public static Vector64<int>    AddAcross(Vector128<int>    value);

        /// <summary>
        /// Vector max numeric
        /// Corresponds to vector forms of ARM64 FMAXNM
        /// </summary>
        public static Vector128<double> MaxNumber(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector max numeric pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 FMAXNMP
        /// </summary>
        public static Vector64<float>   MaxNumberPairwise(Vector64<float>   left, Vector64<float>   right);
        public static Vector128<float>  MaxNumberPairwise(Vector128<float>  left, Vector128<float>  right);
        public static Vector128<double> MaxNumberPairwise(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector max numeric across
        ///
        /// result = max(value[0], ... , value[length -1])
        ///
        /// Corresponds to vector forms of ARM64 FMAXNMV
        /// </summary>
        public static Vector64<float> MaxNumberAcross(Vector128<float>  value);

        /// <summary>
        /// Vector max pairwise
        ///
        /// For each element result[elem] = 2*elem < result.Length ? max(left[2*elem], left[2*elem + 1]) : max(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
        ///
        /// Corresponds to vector forms of ARM64 SMAXP, UMAXP, and FMAXP
        /// </summary>
        public static Vector128<byte>   MaxPairwise(Vector128<byte>   left, Vector128<byte>   right);
        public static Vector128<sbyte>  MaxPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right);
        public static Vector128<ushort> MaxPairwise(Vector128<ushort> left, Vector128<ushort> right);
        public static Vector128<short>  MaxPairwise(Vector128<short>  left, Vector128<short>  right);
        public static Vector128<uint>   MaxPairwise(Vector128<uint>   left, Vector128<uint>   right);
        public static Vector128<int>    MaxPairwise(Vector128<int>    left, Vector128<int>    right);
        public static Vector128<float>  MaxPairwise(Vector128<float>  left, Vector128<float>  right);
        public static Vector128<double> MaxPairwise(Vector128<double> left, Vector128<double> right);

        /// <summary>
        /// Vector max across
        ///
        /// result = max(value[0], ... , value[length -1])
        ///
        /// Corresponds to vector forms of ARM64 SMAXV, UMAXV, and FMAXV
        /// </summary>
        public static Vector64<byte>   MaxAcross(Vector64<byte>    value);
        public static Vector64<sbyte>  MaxAcross(Vector64<sbyte>   value);
        public static Vector64<ushort> MaxAcross(Vector64<ushort>  value);
        public static Vector64<short>  MaxAcross(Vector64<short>   value);
        public static Vector64<uint>   MaxAcross(Vector64<uint>    value);
        public static Vector64<int>    MaxAcross(Vector64<int>     value);
        public static Vector64<float>  MaxAcross(Vector64<float>   value);
        public static Vector64<byte>   MaxAcross(Vector128<byte>   value);
        public static Vector64<sbyte>  MaxAcross(Vector128<sbyte>  value);
        public static Vector64<ushort> MaxAcross(Vector128<ushort> value);
        public static Vector64<short>  MaxAcross(Vector128<short>  value);
        public static Vector64<uint>   MaxAcross(Vector128<uint>   value);
        public static Vector64<int>    MaxAcross(Vector128<int>    value);
        public static Vector64<ulong>  MaxAcross(Vector128<ulong>  value);
        public static Vector64<long>   MaxAcross(Vector128<long>   value);
        public static Vector64<float>  MaxAcross(Vector128<float>  value);
        public static Vector64<double> MaxAcross(Vector128<double> value);

       // -------------------------------------------------
       // Reviewed today:
       // -------------------------------------------------


       /// <summary>
       /// Vector min numeric
       /// Corresponds to vector forms of ARM64 FMINNM
       /// </summary>
       public static Vector128<double> MinNumber(Vector128<double> left, Vector128<double> right);

       /// <summary>
       /// Vector min numeric pairwise
       ///
       /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
       ///
       /// Corresponds to vector forms of ARM64 FMINNMP
       /// </summary>
       public static Vector64<float>   MinNumberPairwise(Vector64<float>   left, Vector64<float>   right);
       public static Vector128<float>  MinNumberPairwise(Vector128<float>  left, Vector128<float>  right);
       public static Vector128<double> MinNumberPairwise(Vector128<double> left, Vector128<double> right);

       /// <summary>
       /// Vector min numeric across
       ///
       /// result = min(value[0], ... , value[length -1])
       ///
       /// Corresponds to vector forms of ARM64 FMINNMV
       /// </summary>
       // NOTE(review): declared as returning a scalar float, whereas MaxNumberAcross and the other
       // *Across methods in this listing return Vector64<T> - confirm which return type is intended.
       public static float  MinNumberAcross(Vector128<float>  value);

       /// <summary>
       /// Vector min pairwise
       ///
       /// For each element result[elem] = 2*elem < result.Length ? min(left[2*elem], left[2*elem + 1]) : min(right[2*elem - result.Length], right[2*elem + 1 - result.Length])
       ///
       /// Corresponds to vector forms of ARM64 SMINP, UMINP, and FMINP
       /// </summary>
       public static Vector128<byte>   MinPairwise(Vector128<byte>   left, Vector128<byte>   right);
       public static Vector128<sbyte>  MinPairwise(Vector128<sbyte>  left, Vector128<sbyte>  right);
       public static Vector128<ushort> MinPairwise(Vector128<ushort> left, Vector128<ushort> right);
       public static Vector128<short>  MinPairwise(Vector128<short>  left, Vector128<short>  right);
       public static Vector128<uint>   MinPairwise(Vector128<uint>   left, Vector128<uint>   right);
       public static Vector128<int>    MinPairwise(Vector128<int>    left, Vector128<int>    right);
       public static Vector128<float>  MinPairwise(Vector128<float>  left, Vector128<float>  right);
       public static Vector128<double> MinPairwise(Vector128<double> left, Vector128<double> right);

       /// <summary>
       /// Vector min across
       ///
       /// result = min(value[0], ... , value[length -1])
       ///
       /// Corresponds to vector forms of ARM64 SMINV, UMINV, and FMINV
       /// </summary>
       public static Vector64<byte>   MinAcross(Vector64<byte>    value);
       public static Vector64<sbyte>  MinAcross(Vector64<sbyte>   value);
       public static Vector64<ushort> MinAcross(Vector64<ushort>  value);
       public static Vector64<short>  MinAcross(Vector64<short>   value);
       public static Vector64<uint>   MinAcross(Vector64<uint>    value);
       public static Vector64<int>    MinAcross(Vector64<int>     value);
       public static Vector64<float>  MinAcross(Vector64<float>   value);
       public static Vector64<byte>   MinAcross(Vector128<byte>   value);
       public static Vector64<sbyte>  MinAcross(Vector128<sbyte>  value);
       public static Vector64<ushort> MinAcross(Vector128<ushort> value);
       public static Vector64<short>  MinAcross(Vector128<short>  value);
       public static Vector64<uint>   MinAcross(Vector128<uint>   value);
       public static Vector64<int>    MinAcross(Vector128<int>    value);
       public static Vector64<float>  MinAcross(Vector128<float>  value);
       public static Vector64<double> MinAcross(Vector128<double> value);

       /// <summary>
       /// Vector fused multiply add
       ///
       /// For each element result[elem] = addend[elem] + left[elem] * right[elem]
       ///
       /// Corresponds to vector forms of ARM64 FMLA
       /// </summary>
       public static Vector128<double> FusedMultiplyAdd(Vector128<double> addend, Vector128<double> left, Vector128<double> right);

       /// <summary>
       /// Vector fused multiply add by element
       ///
       /// For each element result[elem] = addend[elem] + left[elem] * right
       ///
       /// The scalar <c>right</c> has the same element type as the vectors it is combined with.
       ///
       /// Corresponds to vector forms of ARM64 FMLA
       /// </summary>
       public static Vector64<float>   FusedMultiplyAdd(Vector64<float>   addend, Vector64<float>   left, float  right);
       public static Vector128<float>  FusedMultiplyAdd(Vector128<float>  addend, Vector128<float>  left, float  right);
       public static Vector128<double> FusedMultiplyAdd(Vector128<double> addend, Vector128<double> left, double right);

       /// <summary>
       /// Vector fused multiply subtract
       ///
       /// For each element result[elem] = minuend[elem] - left[elem] * right[elem]
       ///
       /// Corresponds to vector forms of ARM64 FMLS
       /// </summary>
       public static Vector128<double>  FusedMultiplySubtract(Vector128<double> minuend, Vector128<double>  left, Vector128<double> right);

       /// <summary>
       /// Vector fused multiply subtract by element
       ///
       /// For each element result[elem] = minuend[elem] - left[elem] * right
       ///
       /// The scalar <c>right</c> has the same element type as the vectors it is combined with.
       ///
       /// Corresponds to vector forms of ARM64 FMLS
       /// </summary>
       public static Vector64<float>   FusedMultiplySubtract(Vector64<float>   minuend, Vector64<float>   left, float  right);
       public static Vector128<float>  FusedMultiplySubtract(Vector128<float>  minuend, Vector128<float>  left, float  right);
       public static Vector128<double> FusedMultiplySubtract(Vector128<double> minuend, Vector128<double> left, double right);

       /// <summary>
       /// Vector multiply extended
       ///
       /// For each element result[elem] = left[elem] * right[elem]
       /// The zero and infinity special cases are handled as defined by FMULX
       /// (NOTE(review): see the ARM64 FMULX documentation for the exact special-case results)
       ///
       /// Corresponds to vector forms of ARM64 FMULX
       /// </summary>
       public static Vector64<float>   MultiplyExtended(Vector64<float>   left, Vector64<float>   right);
       public static Vector128<float>  MultiplyExtended(Vector128<float>  left, Vector128<float>  right);
       public static Vector128<double> MultiplyExtended(Vector128<double> left, Vector128<double> right);

       /// <summary>
       /// Vector multiply extended by element
       ///
       /// For each element result[elem] = left[elem] * right[rightIndex]
       /// The zero and infinity special cases are handled as defined by FMULX
       /// (NOTE(review): see the ARM64 FMULX documentation for the exact special-case results)
       ///
       /// Corresponds to vector forms of ARM64 FMULX
       /// </summary>
        public static Vector64<float>   MultiplyExtendedBySelectedScalar(Vector64<float>   left, Vector64<float>   right, byte rightIndex);
        public static Vector128<float>  MultiplyExtendedBySelectedScalar(Vector128<float>  left, Vector128<float>  right, byte rightIndex);
        public static Vector128<double> MultiplyExtendedBySelectedScalar(Vector128<double> left, Vector128<double> right, byte rightIndex);

       /// <summary>
       /// Vector reciprocal estimate
       ///
       /// See FRECPE docs
       ///
       /// Corresponds to vector forms of ARM64 FRECPE
       /// </summary>
       public static Vector128<double> ReciprocalEstimate(Vector128<double> value);

       /// <summary>
       /// Vector reciprocal step
       ///
       /// See the ARM64 FRECPS documentation for the exact per-element computation
       ///
       /// Corresponds to vector forms of ARM64 FRECPS
       /// </summary>
       public static Vector128<double> ReciprocalStep(Vector128<double> left, Vector128<double> right);

       /// <summary>
       /// Vector reciprocal square root estimate
       ///
       /// See the ARM64 FRSQRTE documentation for the exact per-element computation
       ///
       /// Corresponds to vector forms of ARM64 FRSQRTE
       /// </summary>
       public static Vector128<double> ReciprocalSquareRootEstimate(Vector128<double> value);

       /// <summary>
       /// Vector reciprocal square root step
       ///
       /// See the ARM64 FRSQRTS documentation for the exact per-element computation
       ///
       /// Corresponds to vector forms of ARM64 FRSQRTS
       /// </summary>
       public static Vector128<double> ReciprocalSquareRootStep(Vector128<double> left, Vector128<double> right);

       /// <summary>
       /// Vector reverse element bits
       /// Reverses the order of the bits within each byte-sized element (only byte and sbyte overloads are listed)
       /// Corresponds to vector forms of ARM64 RBIT
       /// </summary>
       public static Vector64<byte>    ReverseElementBits(Vector64<byte>    value);
       public static Vector64<sbyte>   ReverseElementBits(Vector64<sbyte>   value);
       public static Vector128<byte>   ReverseElementBits(Vector128<byte>   value);
       public static Vector128<sbyte>  ReverseElementBits(Vector128<sbyte>  value);
    }
}

}
```

terrajobst on 3 Mar 2020

I think we should reconsider the API design for ExtractVector64 and ExtractVector128 - instead of
1) forbidding floating point types and
2) specifying a byteIndex

we should follow the approach that the C++ intrinsics take: specify an elementIndex and have the JIT convert this element index to a byte-index immediate. This way we are not going to get a denormalized floating-point value as a result, and we will be at parity with the C++ implementations.

For example, for ExtractVector64(upper, lower, 1) where upper and lower are Vector64<float> JIT will emit EXT Vd.8B, Vn.8B, Vm.8B, 4

echesakovMSFT on 14 Mar 2020

I don't like the approach of taking an elementIndex. That artificially limits the usage of the intrinsics and prevents you from extracting an arbitrary 64-bit sequence.

There are separate intrinsics for extracting individual elements from a vector: https://github.com/dotnet/runtime/issues/24588 and users wanting to work with float can just use the zero cost reinterpret cast APIs (.AsInt32 and .AsSingle), which will force them to rationalize the denormal scenario and take it into consideration.

tannergooding on 14 Mar 2020

Another question concerning FusedMultiplyAddBySelectedScalar and FusedMultiplySubtractBySelectedScalar.

In C++ there exist
float32x2_t vfma_lane_f32 (float32x2_t a, float32x2_t b, float32x2_t v, const int lane)
and
float32x2_t vfma_laneq_f32 (float32x2_t a, float32x2_t b, float32x4_t v, const int lane).

Shouldn't we follow the same approach, i.e. have multiple overloads such as :

public static Vector64<float> FusedMultiplyAddBySelectedScalar(Vector64<float> addend, Vector64<float> left, Vector64<float> right, byte rightIndex);

public static Vector64<float> FusedMultiplyAddBySelectedScalar(Vector64<float> addend, Vector64<float> left, Vector128<float> right, byte rightIndex);

Alternatively, we can have right be Vector128<T> regardless of the size of addend and left, and upcast Vector64<T> to Vector128<T> if needed.

The current design, however, when we match the sizes of left,right and addend doesn't seem to be practical.

echesakovMSFT on 14 Mar 2020

I don't like the approach of taking an elementIndex. That artificially limits the usage of the intrinsics and prevents you from extracting an arbitrary 64-bit sequence.

There are separate intrinsics for extracting individual elements from a vector: #24588 and users wanting to work with float can just use the zero cost reinterpret cast APIs (.AsInt32 and .AsSingle), which will force them to rationalize the denormal scenario and take it into consideration.

#24588 works only on one SIMD register, not a pair of SIMD registers.

If a user wants to extract an arbitrary 8/16-bytes sequence why not convert both operands to Vector64/128 using As<byte>()?

echesakovMSFT on 14 Mar 2020

Shouldn't we follow the same approach, i.e. have multiple overloads such as
The current design, however, when we match the sizes of left,right and addend doesn't seem to be practical.

It doesn't look like the underlying instruction encoding requires they all be the same size and so I would guess this is done because C++ doesn't have an easy way to convert from V128<T> to V64<T>.
Maybe @TamarChristinaArm has a better idea of why the split exists?

If a user wants to extract an arbitrary 8/16-bytes sequence why not convert both operands to Vector64/128 using As<byte>()?

They could also do that, but the underlying instruction actually operates on byteIndex and we have tended away from adding abstractions of the instructions so far.

tannergooding on 14 Mar 2020

They could also do that, but the underlying instruction actually operates on byteIndex and we have tended away from adding abstractions of the instructions so far.

Well, this is true on Arm64 for EXT <Vd>.<T>, <Vn>.<T>, <Vm>.<T>, #<index> where
<index> is indeed a byte index. By the way, <T> can only be 8B or 16B that kind of suggests you are working on byte sequences.

However, on Arm32 VEXT (multibyte elements) VEXT.<size> {<Dd>,} <Dn>, <Dm>, #<imm> is a pseudo-instruction that the assembler translates to VEXT (byte elements) VEXT.8 {<Dd>,} <Dn>, <Dm>, #<imm*(size/8)>, i.e. #<imm> is an element index.

echesakovMSFT on 14 Mar 2020

Both instructions (VEXT on ARM32 and EXT on ARM64) operate identically. They are similar to orr or other logical operations. That is, the instruction encoding only takes 8B/16B but it isn't doing something that is logically byte only and will be frequently used for non byte operations.

We can always create an issue and re-discuss ExtractVector64/ExtractVector128 again on Tuesday, bringing up the C++ difference and whether having it operate on element by default is better (with requesting users to downcast if they want byte sequences instead).

tannergooding on 14 Mar 2020

Shouldn't we follow the same approach, i.e. have multiple overloads such as
The current design, however, when we match the sizes of left,right and addend doesn't seem to be practical.

It doesn't look like the underlying instruction encoding requires they all be the same size and so I would guess this is done because C++ doesn't have an easy way to convert from V128<T> to V64<T>.
Maybe @TamarChristinaArm has a better idea of why the split exists?

For such intrinsics the split is always four ways; the location of the q in the name denotes which components are 128 bits.

So for e.g. the float case we have

vfmaq_lane_f32
vfma_laneq_f32
vfmaq_laneq_f32

precisely because as you said the instruction doesn't require them to all be the same size.

This convention (partially) holds for newer ISAs such as MVE[1] and SVE[2] as well, though in those cases we also have completely overload-driven instances, e.g. svmla_lane for SVE will do the normal overloading you would expect in C++ (and in C using C11's _Generic extension).

We do have a way to convert from V128<T> to V64<T>, i.e. vget_low, but such conversions are not zero-cost abstractions, so we prefer to provide the overloads.

[1] https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
[2] https://static.docs.arm.com/100987/0000/acle_sve_100987_0000_00_en.pdf

TamarChristinaArm on 16 Mar 2020

👍1

The following APIs are still to be implemented:

namespace System.Runtime.Intrinsics.Arm
{
    public static class AdvSimd
    {
        /// <summary>
        /// Vector extract from pair of vectors
        /// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
        ///
        /// NOTE(review): the formula above uses left/right/index, while the overloads below name their
        /// parameters upper/lower/byteIndex; presumably these map positionally - confirm.
        ///
        /// Note: byteIndex must be a JIT-time constant expression which can be used to populate the literal immediate field
        ///
        /// Corresponds to vector forms of ARM64 EXT
        /// </summary>
        public static Vector64<byte>   ExtractVector64(Vector64<byte>  upper, Vector64<byte>  lower, byte byteIndex);
        public static Vector64<sbyte>  ExtractVector64(Vector64<sbyte>  upper, Vector64<sbyte>  lower, byte byteIndex);
        public static Vector64<short>  ExtractVector64(Vector64<short>  upper, Vector64<short>  lower, byte byteIndex);
        public static Vector64<ushort> ExtractVector64(Vector64<ushort>  upper, Vector64<ushort>  lower, byte byteIndex);
        public static Vector64<int>    ExtractVector64(Vector64<int>  upper, Vector64<int>  lower, byte byteIndex);
        public static Vector64<uint>   ExtractVector64(Vector64<uint>  upper, Vector64<uint>  lower, byte byteIndex);

        public static Vector128<byte>   ExtractVector128(Vector128<byte> upper, Vector128<byte> lower, byte byteIndex);
        public static Vector128<sbyte>  ExtractVector128(Vector128<sbyte> upper, Vector128<sbyte> lower, byte byteIndex);
        public static Vector128<short>  ExtractVector128(Vector128<short> upper, Vector128<short> lower, byte byteIndex);
        public static Vector128<ushort> ExtractVector128(Vector128<ushort> upper, Vector128<ushort> lower, byte byteIndex);
        public static Vector128<int>    ExtractVector128(Vector128<int> upper, Vector128<int> lower, byte byteIndex);
        public static Vector128<uint>   ExtractVector128(Vector128<uint> upper, Vector128<uint> lower, byte byteIndex);
        public static Vector128<long>   ExtractVector128(Vector128<long> upper, Vector128<long> lower, byte byteIndex);
        public static Vector128<ulong>  ExtractVector128(Vector128<ulong> upper, Vector128<ulong> lower, byte byteIndex);
        public static Vector128<float>  ExtractVector128(Vector128<float> upper, Vector128<float> lower, byte byteIndex);
        public static Vector128<double> ExtractVector128(Vector128<double> upper, Vector128<double> lower, byte byteIndex);

        /// <summary>
        /// Vector multiply add by element
        ///
        /// For each element result[elem] = addend[elem] + left[elem] * right[rightIndex]
        ///
        /// NOTE(review): confirm that MLA (by element) has an encoding for 8-bit elements before
        /// keeping the byte/sbyte overloads.
        ///
        /// Corresponds to vector forms of ARM64 MLA
        /// </summary>
        public static Vector64<byte>    MultiplyAddBySelectedScalar(Vector64<byte>    addend, Vector64<byte>    left, Vector64<byte>    right, byte rightIndex);
        public static Vector64<sbyte>   MultiplyAddBySelectedScalar(Vector64<sbyte>   addend, Vector64<sbyte>   left, Vector64<sbyte>   right, byte rightIndex);
        public static Vector64<ushort>  MultiplyAddBySelectedScalar(Vector64<ushort>  addend, Vector64<ushort>  left, Vector64<ushort>  right, byte rightIndex);
        public static Vector64<short>   MultiplyAddBySelectedScalar(Vector64<short>   addend, Vector64<short>   left, Vector64<short>   right, byte rightIndex);
        public static Vector64<uint>    MultiplyAddBySelectedScalar(Vector64<uint>    addend, Vector64<uint>    left, Vector64<uint>    right, byte rightIndex);
        public static Vector64<int>     MultiplyAddBySelectedScalar(Vector64<int>     addend, Vector64<int>     left, Vector64<int>     right, byte rightIndex);
        public static Vector128<byte>   MultiplyAddBySelectedScalar(Vector128<byte>   addend, Vector128<byte>   left, Vector128<byte>   right, byte rightIndex);
        public static Vector128<sbyte>  MultiplyAddBySelectedScalar(Vector128<sbyte>  addend, Vector128<sbyte>  left, Vector128<sbyte>  right, byte rightIndex);
        public static Vector128<ushort> MultiplyAddBySelectedScalar(Vector128<ushort> addend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
        public static Vector128<short>  MultiplyAddBySelectedScalar(Vector128<short>  addend, Vector128<short>  left, Vector128<short>  right, byte rightIndex);
        public static Vector128<uint>   MultiplyAddBySelectedScalar(Vector128<uint>   addend, Vector128<uint>   left, Vector128<uint>   right, byte rightIndex);
        public static Vector128<int>    MultiplyAddBySelectedScalar(Vector128<int>    addend, Vector128<int>    left, Vector128<int>    right, byte rightIndex);

        /// <summary>
        /// Vector multiply subtract by element
        ///
        /// For each element result[elem] = minuend[elem] - left[elem] * right[rightIndex]
        ///
        /// NOTE(review): confirm that MLS (by element) has an encoding for 8-bit elements before
        /// keeping the byte/sbyte overloads.
        ///
        /// Corresponds to vector forms of ARM64 MLS
        /// </summary>
        public static Vector64<byte>    MultiplySubtractBySelectedScalar(Vector64<byte>    minuend, Vector64<byte>    left, Vector64<byte>    right, byte rightIndex);
        public static Vector64<sbyte>   MultiplySubtractBySelectedScalar(Vector64<sbyte>   minuend, Vector64<sbyte>   left, Vector64<sbyte>   right, byte rightIndex);
        public static Vector64<ushort>  MultiplySubtractBySelectedScalar(Vector64<ushort>  minuend, Vector64<ushort>  left, Vector64<ushort>  right, byte rightIndex);
        public static Vector64<short>   MultiplySubtractBySelectedScalar(Vector64<short>   minuend, Vector64<short>   left, Vector64<short>   right, byte rightIndex);
        public static Vector64<uint>    MultiplySubtractBySelectedScalar(Vector64<uint>    minuend, Vector64<uint>    left, Vector64<uint>    right, byte rightIndex);
        public static Vector64<int>     MultiplySubtractBySelectedScalar(Vector64<int>     minuend, Vector64<int>     left, Vector64<int>     right, byte rightIndex);
        public static Vector128<byte>   MultiplySubtractBySelectedScalar(Vector128<byte>   minuend, Vector128<byte>   left, Vector128<byte>   right, byte rightIndex);
        public static Vector128<sbyte>  MultiplySubtractBySelectedScalar(Vector128<sbyte>  minuend, Vector128<sbyte>  left, Vector128<sbyte>  right, byte rightIndex);
        public static Vector128<ushort> MultiplySubtractBySelectedScalar(Vector128<ushort> minuend, Vector128<ushort> left, Vector128<ushort> right, byte rightIndex);
        public static Vector128<short>  MultiplySubtractBySelectedScalar(Vector128<short>  minuend, Vector128<short>  left, Vector128<short>  right, byte rightIndex);
        public static Vector128<uint>   MultiplySubtractBySelectedScalar(Vector128<uint>   minuend, Vector128<uint>   left, Vector128<uint>   right, byte rightIndex);
        public static Vector128<int>    MultiplySubtractBySelectedScalar(Vector128<int>    minuend, Vector128<int>    left, Vector128<int>    right, byte rightIndex);

        public static class Arm64
        {
            /// <summary>
            /// Vector extract from pair of vectors
            /// For each byte result[byte] = byte + index < result.Length ? left[byte + index] : right[byte + index - result.Length]
            ///
            /// Note: index must be a JIT-time constant expression which can be used to populate the literal immediate field
            ///
            /// NOTE(review): AdvSimd.ExtractVector128 already lists a Vector128<double> overload
            /// (named upper/lower/byteIndex); confirm whether both declarations are intended.
            ///
            /// Corresponds to vector forms of ARM64 EXT
            /// </summary>
            public static Vector128<double> ExtractVector128(Vector128<double> left, Vector128<double> right, byte index);

            /// <summary>
            /// Vector multiply extended by element
            ///
            /// For each element result[elem] = left[elem] * right[rightIndex]
            /// The zero and infinity special cases are handled as defined by FMULX
            /// (NOTE(review): see the ARM64 FMULX documentation for the exact special-case results)
            ///
            /// Corresponds to vector forms of ARM64 FMULX
            /// </summary>
            public static Vector64<float>   MultiplyExtendedBySelectedScalar(Vector64<float>   left, Vector64<float>   right, byte rightIndex);
            public static Vector128<float>  MultiplyExtendedBySelectedScalar(Vector128<float>  left, Vector128<float>  right, byte rightIndex);
            public static Vector128<double> MultiplyExtendedBySelectedScalar(Vector128<double> left, Vector128<double> right, byte rightIndex);
        }
    }
}

The following APIs still need to be investigated and brought back for review (I will open a new issue for them):

namespace System.Runtime.Intrinsics.Arm
{
    public static class AdvSimd
    {
        /// <summary>
        /// Reverses the order of the bytes within every element of <paramref name="value"/>.
        /// Corresponds to vector forms of ARM64 REV16, REV32, REV64.
        /// Reference-style stub: each overload simply throws.
        /// </summary>
        public static Vector64<ushort>  ReverseElementBytes(Vector64<ushort>  value) => throw null;
        public static Vector64<short>   ReverseElementBytes(Vector64<short>   value) => throw null;
        public static Vector64<uint>    ReverseElementBytes(Vector64<uint>    value) => throw null;
        public static Vector64<int>     ReverseElementBytes(Vector64<int>     value) => throw null;
        public static Vector64<float>   ReverseElementBytes(Vector64<float>   value) => throw null;
        public static Vector128<ushort> ReverseElementBytes(Vector128<ushort> value) => throw null;
        public static Vector128<short>  ReverseElementBytes(Vector128<short>  value) => throw null;
        public static Vector128<uint>   ReverseElementBytes(Vector128<uint>   value) => throw null;
        public static Vector128<int>    ReverseElementBytes(Vector128<int>    value) => throw null;
        public static Vector128<ulong>  ReverseElementBytes(Vector128<ulong>  value) => throw null;
        public static Vector128<long>   ReverseElementBytes(Vector128<long>   value) => throw null;
        public static Vector128<float>  ReverseElementBytes(Vector128<float>  value) => throw null;
    }
}

tannergooding on 16 Apr 2020

The following APIs are still to be implemented:

@tannergooding Also Fused_MLA/MLS_BySelectedScalar

And we need to add/propose MultiplyBySelectedScalar - MUL also has a by-element form.

echesakovMSFT on 16 Apr 2020

Also Fused_MLA/MLS_BySelectedScalar
And we need to add/propose MultiplyBySelectedScalar - mul also has by element form.

The former haven't been proposed yet either (I don't see them listed anywhere above). I'm adding them to https://github.com/dotnet/runtime/issues/33683

tannergooding on 16 Apr 2020

Was this page helpful?

0 / 5 - 0 ratings