SAP
diff --git a/‎src/hotspot/cpu/aarch64/aarch64_vector.ad‎
Lines changed: 82 additions & 8 deletions b/‎src/hotspot/cpu/aarch64/aarch64_vector.ad‎
Lines changed: 82 additions & 8 deletions
diff --git a/‎src/hotspot/cpu/aarch64/aarch64_vector_ad.m4‎
Lines changed: 106 additions & 35 deletions b/‎src/hotspot/cpu/aarch64/aarch64_vector_ad.m4‎
Lines changed: 106 additions & 35 deletions
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
-// Copyright (c) 2020, 2025, Arm Limited. All rights reserved.
+// Copyright (c) 2020, 2026, Arm Limited. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -247,10 +247,39 @@ source %{
       case Op_MinVHF:
       case Op_MaxVHF:
       case Op_SqrtVHF:
+        if (UseSVE == 0 && !is_feat_fp16_supported()) {
+          return false;
+        }
+        break;
+      // At the time of writing this, the Vector API has no half-float (FP16) species.
+      // Consequently, AddReductionVHF and MulReductionVHF are only produced by the
+      // auto-vectorizer, which requires strictly ordered semantics for FP reductions.
+      //
+      // There is no direct Neon instruction that performs strictly ordered floating
+      // point add reduction. Hence, on Neon only machines, the add reduction operation
+      // is implemented as a scalarized sequence using half-precision scalar instruction
+      // FADD which requires FEAT_FP16 and ASIMDHP to be available on the target.
+      // On SVE machines (UseSVE > 0) however, there is a direct instruction (FADDA) which
+      // implements strictly ordered floating point add reduction which does not require
+      // the FEAT_FP16 and ASIMDHP checks as SVE supports half-precision floats by default.
+      case Op_AddReductionVHF:
         // FEAT_FP16 is enabled if both "fphp" and "asimdhp" features are supported.
         // Only the Neon instructions need this check. SVE supports half-precision floats
         // by default.
-        if (UseSVE == 0 && !is_feat_fp16_supported()) {
+        if (length_in_bytes < 8 || (UseSVE == 0 && !is_feat_fp16_supported())) {
+          return false;
+        }
+        break;
+      case Op_MulReductionVHF:
+        // There are no direct Neon/SVE instructions that perform strictly ordered
+        // floating point multiply reduction.
+        // For vector length <= 16 bytes, the reduction is implemented as a scalarized
+        // sequence using half-precision scalar instruction FMUL. This path requires
+        // FEAT_FP16 and ASIMDHP to be available on the target.
+        // For vector length > 16 bytes, this operation is disabled because there is no
+        // direct SVE instruction that performs a strictly ordered FP16 multiply
+        // reduction.
+        if (length_in_bytes < 8 || length_in_bytes > 16 || !is_feat_fp16_supported()) {
           return false;
         }
         break;
@@ -300,6 +329,7 @@ source %{
       case Op_VectorRearrange:
       case Op_MulReductionVD:
       case Op_MulReductionVF:
+      case Op_MulReductionVHF:
       case Op_MulReductionVI:
       case Op_MulReductionVL:
       case Op_CompressBitsV:
@@ -364,6 +394,7 @@ source %{
       case Op_VectorMaskCmp:
       case Op_LoadVectorGather:
       case Op_StoreVectorScatter:
+      case Op_AddReductionVHF:
       case Op_AddReductionVF:
       case Op_AddReductionVD:
       case Op_AndReductionV:
@@ -3402,6 +3433,44 @@ instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vR
   ins_pipe(pipe_slow);
 %}
 
+// Add reduction for half-precision floats (FP16) on Neon-only targets (UseSVE == 0).
+// Neon does not provide direct instructions for strictly ordered floating-point add reductions,
+// so this rule delegates to neon_reduce_add_fp16, which implements the operation as a sequence
+// of scalar additions: each lane of the source vector vsrc (4HF or 8HF) is extracted and its
+// value is accumulated into the running sum, producing a final scalar result in dst.
+instruct reduce_addHF_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
+  predicate(UseSVE == 0);
+  match(Set dst (AddReductionVHF fsrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp);
+  format %{ "reduce_addHF $dst, $fsrc, $vsrc\t# 4HF/8HF. KILL $tmp" %}
+  ins_encode %{
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    __ neon_reduce_add_fp16($dst$$FloatRegister, $fsrc$$FloatRegister,
+                            $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// This rule calculates the reduction result in strict order with the SVE FADDA
+// instruction; dst_src1 is both the incoming scalar value and the result.
+// Its predicate is only UseSVE > 0, so both kinds of AddReductionVHF reach here:
+// 1. Strictly-ordered AddReductionVHF. For example - AddReductionVHF generated by
+//    auto-vectorization on SVE machine.
+// 2. Non strictly-ordered AddReductionVHF, if one is ever produced. The direct SVE
+//    instruction is still used for it even though it is strictly ordered.
+// The vector is expected to span the full MaxVectorSize (see the assert below).
+instruct reduce_addHF_sve(vRegF dst_src1, vReg src2) %{
+  predicate(UseSVE > 0);
+  match(Set dst_src1 (AddReductionVHF dst_src1 src2));
+  format %{ "reduce_addHF_sve $dst_src1, $dst_src1, $src2" %}
+  ins_encode %{
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
+    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
+    __ sve_fadda($dst_src1$$FloatRegister, __ H, ptrue, $src2$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // This rule calculates the reduction result in strict order. Two cases will
 // reach here:
 // 1. Non strictly-ordered AddReductionVF when vector size > 128-bits. For example -
@@ -3492,12 +3561,14 @@ instruct reduce_addL_masked(iRegLNoSp dst, iRegL isrc, vReg vsrc, pRegGov pg, vR
   ins_pipe(pipe_slow);
 %}
 
-instruct reduce_addF_masked(vRegF dst_src1, vReg src2, pRegGov pg) %{
+instruct reduce_addFHF_masked(vRegF dst_src1, vReg src2, pRegGov pg) %{
   predicate(UseSVE > 0);
+  match(Set dst_src1 (AddReductionVHF (Binary dst_src1 src2) pg));
   match(Set dst_src1 (AddReductionVF (Binary dst_src1 src2) pg));
-  format %{ "reduce_addF_masked $dst_src1, $pg, $dst_src1, $src2" %}
+  format %{ "reduce_addFHF_masked $dst_src1, $pg, $dst_src1, $src2" %}
   ins_encode %{
-    __ sve_fadda($dst_src1$$FloatRegister, __ S,
+    BasicType bt = Matcher::vector_element_basic_type(this, $src2);
+    __ sve_fadda($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                  $pg$$PRegister, $src2$$FloatRegister);
   %}
   ins_pipe(pipe_slow);
@@ -3545,14 +3616,17 @@ instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{
   ins_pipe(pipe_slow);
 %}
 
-instruct reduce_mulF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
+
+instruct reduce_mulFHF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
   predicate(Matcher::vector_length_in_bytes(n->in(2)) <= 16);
+  match(Set dst (MulReductionVHF fsrc vsrc));
   match(Set dst (MulReductionVF fsrc vsrc));
   effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "reduce_mulF $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %}
+  format %{ "reduce_mulFHF $dst, $fsrc, $vsrc\t# 2F/4F/4HF/8HF. KILL $tmp" %}
   ins_encode %{
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
-    __ neon_reduce_mul_fp($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister,
+    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
+    __ neon_reduce_mul_fp($dst$$FloatRegister, bt, $fsrc$$FloatRegister,
                           $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
   %}
   ins_pipe(pipe_slow);
 
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
-// Copyright (c) 2020, 2025, Arm Limited. All rights reserved.
+// Copyright (c) 2020, 2026, Arm Limited. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -237,10 +237,39 @@ source %{
       case Op_MinVHF:
       case Op_MaxVHF:
       case Op_SqrtVHF:
+        if (UseSVE == 0 && !is_feat_fp16_supported()) {
+          return false;
+        }
+        break;
+      // At the time of writing this, the Vector API has no half-float (FP16) species.
+      // Consequently, AddReductionVHF and MulReductionVHF are only produced by the
+      // auto-vectorizer, which requires strictly ordered semantics for FP reductions.
+      //
+      // There is no direct Neon instruction that performs strictly ordered floating
+      // point add reduction. Hence, on Neon only machines, the add reduction operation
+      // is implemented as a scalarized sequence using half-precision scalar instruction
+      // FADD which requires FEAT_FP16 and ASIMDHP to be available on the target.
+      // On SVE machines (UseSVE > 0) however, there is a direct instruction (FADDA) which
+      // implements strictly ordered floating point add reduction which does not require
+      // the FEAT_FP16 and ASIMDHP checks as SVE supports half-precision floats by default.
+      case Op_AddReductionVHF:
         // FEAT_FP16 is enabled if both "fphp" and "asimdhp" features are supported.
         // Only the Neon instructions need this check. SVE supports half-precision floats
         // by default.
-        if (UseSVE == 0 && !is_feat_fp16_supported()) {
+        if (length_in_bytes < 8 || (UseSVE == 0 && !is_feat_fp16_supported())) {
+          return false;
+        }
+        break;
+      case Op_MulReductionVHF:
+        // There are no direct Neon/SVE instructions that perform strictly ordered
+        // floating point multiply reduction.
+        // For vector length <= 16 bytes, the reduction is implemented as a scalarized
+        // sequence using half-precision scalar instruction FMUL. This path requires
+        // FEAT_FP16 and ASIMDHP to be available on the target.
+        // For vector length > 16 bytes, this operation is disabled because there is no
+        // direct SVE instruction that performs a strictly ordered FP16 multiply
+        // reduction.
+        if (length_in_bytes < 8 || length_in_bytes > 16 || !is_feat_fp16_supported()) {
           return false;
         }
         break;
@@ -290,6 +319,7 @@ source %{
       case Op_VectorRearrange:
       case Op_MulReductionVD:
       case Op_MulReductionVF:
+      case Op_MulReductionVHF:
       case Op_MulReductionVI:
       case Op_MulReductionVL:
       case Op_CompressBitsV:
@@ -354,6 +384,7 @@ source %{
       case Op_VectorMaskCmp:
       case Op_LoadVectorGather:
       case Op_StoreVectorScatter:
+      case Op_AddReductionVHF:
       case Op_AddReductionVF:
       case Op_AddReductionVD:
       case Op_AndReductionV:
@@ -2063,6 +2094,25 @@ instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vR
   ins_pipe(pipe_slow);
 %}
 dnl
+
+// Add reduction for half-precision floats (FP16) on Neon-only targets (UseSVE == 0).
+// Neon does not provide direct instructions for strictly ordered floating-point add reductions,
+// so this rule delegates to neon_reduce_add_fp16, which implements the operation as a sequence
+// of scalar additions: each lane of the source vector vsrc (4HF or 8HF) is extracted and its
+// value is accumulated into the running sum, producing a final scalar result in dst.
+instruct reduce_addHF_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
+  predicate(UseSVE == 0);
+  match(Set dst (AddReductionVHF fsrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp);
+  format %{ "reduce_addHF $dst, $fsrc, $vsrc\t# 4HF/8HF. KILL $tmp" %}
+  ins_encode %{
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    __ neon_reduce_add_fp16($dst$$FloatRegister, $fsrc$$FloatRegister,
+                            $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+dnl
 dnl REDUCE_ADD_FP_SVE($1,   $2  )
 dnl REDUCE_ADD_FP_SVE(type, size)
 define(`REDUCE_ADD_FP_SVE', `
@@ -2074,21 +2124,26 @@ define(`REDUCE_ADD_FP_SVE', `
 //    strictly ordered.
 // 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by
 //    auto-vectorization on SVE machine.
-instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{
-  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
-            n->as_Reduction()->requires_strict_order());
+instruct reduce_add$1_sve(vReg`'ifelse($1, HF, F, $1) dst_src1, vReg src2) %{
+  ifelse($1, HF,
+       `predicate(UseSVE > 0);',
+       `predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
+            n->as_Reduction()->requires_strict_order());')
   match(Set dst_src1 (AddReductionV$1 dst_src1 src2));
   format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %}
   ins_encode %{
-    assert(UseSVE > 0, "must be sve");
-    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
+    ifelse($1, HF, `',
+       `assert(UseSVE > 0, "must be sve");
+    ')dnl
+uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
     assert(length_in_bytes == MaxVectorSize, "invalid vector length");
     __ sve_fadda($dst_src1$$FloatRegister, __ $2, ptrue, $src2$$FloatRegister);
   %}
   ins_pipe(pipe_slow);
 %}')dnl
 dnl
-REDUCE_ADD_FP_SVE(F, S)
+REDUCE_ADD_FP_SVE(HF, H)
+REDUCE_ADD_FP_SVE(F,  S)
 
 // reduction addD
 
@@ -2129,21 +2184,30 @@ dnl
 dnl REDUCE_ADD_FP_PREDICATE($1,        $2     )
 dnl REDUCE_ADD_FP_PREDICATE(insn_name, op_name)
 define(`REDUCE_ADD_FP_PREDICATE', `
-instruct reduce_add$1_masked(vReg$1 dst_src1, vReg src2, pRegGov pg) %{
+instruct reduce_add$1_masked(vReg$2 dst_src1, vReg src2, pRegGov pg) %{
   predicate(UseSVE > 0);
-  match(Set dst_src1 (AddReductionV$1 (Binary dst_src1 src2) pg));
+  ifelse($2, F,
+       `match(Set dst_src1 (AddReductionVHF (Binary dst_src1 src2) pg));
+  match(Set dst_src1 (AddReductionV$2 (Binary dst_src1 src2) pg));',
+       `match(Set dst_src1 (AddReductionV$2 (Binary dst_src1 src2) pg));')
   format %{ "reduce_add$1_masked $dst_src1, $pg, $dst_src1, $src2" %}
   ins_encode %{
-    __ sve_fadda($dst_src1$$FloatRegister, __ $2,
-                 $pg$$PRegister, $src2$$FloatRegister);
+    ifelse($2, F,
+       `BasicType bt = Matcher::vector_element_basic_type(this, $src2);
+    ',)dnl
+ifelse($2, F,
+       `__ sve_fadda($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
+                 $pg$$PRegister, $src2$$FloatRegister);',
+       `__ sve_fadda($dst_src1$$FloatRegister, __ $2,
+                 $pg$$PRegister, $src2$$FloatRegister);')
   %}
   ins_pipe(pipe_slow);
 %}')dnl
 dnl
 REDUCE_ADD_INT_PREDICATE(I, iRegIorL2I)
 REDUCE_ADD_INT_PREDICATE(L, iRegL)
-REDUCE_ADD_FP_PREDICATE(F, S)
-REDUCE_ADD_FP_PREDICATE(D, D)
+REDUCE_ADD_FP_PREDICATE(FHF, F)
+REDUCE_ADD_FP_PREDICATE(D,   D)
 
 // ------------------------------ Vector reduction mul -------------------------
 
@@ -2176,30 +2240,37 @@ instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{
   ins_pipe(pipe_slow);
 %}
 
-instruct reduce_mulF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
-  predicate(Matcher::vector_length_in_bytes(n->in(2)) <= 16);
-  match(Set dst (MulReductionVF fsrc vsrc));
+dnl REDUCE_MUL_FP($1,        $2     )
+dnl REDUCE_MUL_FP(insn_name, op_name)
+define(`REDUCE_MUL_FP', `
+instruct reduce_mul$1(vReg$2 dst, vReg$2 ifelse($2, F, fsrc, dsrc), vReg vsrc, vReg tmp) %{
+  predicate(Matcher::vector_length_in_bytes(n->in(2)) ifelse($2, F, <=, ==) 16);
+  ifelse($2, F,
+       `match(Set dst (MulReductionVHF fsrc vsrc));
+  match(Set dst (MulReductionV$2 fsrc vsrc));',
+       `match(Set dst (MulReductionV$2 dsrc vsrc));')
   effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "reduce_mulF $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %}
-  ins_encode %{
-    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
-    __ neon_reduce_mul_fp($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister,
-                          $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
+  ifelse($2, F,
+         `format %{ "reduce_mul$1 $dst, $fsrc, $vsrc\t# 2F/4F/4HF/8HF. KILL $tmp" %}',
+         `format %{ "reduce_mul$1 $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}')
+  ins_encode %{
+    ifelse($2, F,
+       `uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    ',)dnl
+ifelse($2, F,
+       `BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
+    ',)dnl
+ifelse($2, F,
+       `__ neon_reduce_mul_fp($dst$$FloatRegister, bt, $fsrc$$FloatRegister,
+                          $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);',
+       `__ neon_reduce_mul_fp($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister,
+                          $vsrc$$FloatRegister, 16, $tmp$$FloatRegister);')
   %}
   ins_pipe(pipe_slow);
-%}
-
-instruct reduce_mulD(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
-  predicate(Matcher::vector_length_in_bytes(n->in(2)) == 16);
-  match(Set dst (MulReductionVD dsrc vsrc));
-  effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "reduce_mulD $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}
-  ins_encode %{
-    __ neon_reduce_mul_fp($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister,
-                          $vsrc$$FloatRegister, 16, $tmp$$FloatRegister);
-  %}
-  ins_pipe(pipe_slow);
-%}
+%}')dnl
+dnl
+REDUCE_MUL_FP(FHF, F)
+REDUCE_MUL_FP(D,   D)
 
 dnl
 dnl REDUCE_BITWISE_OP_NEON($1,        $2       $3    $4     )