11//
22// Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3- // Copyright (c) 2020, 2025 , Arm Limited. All rights reserved.
3+ // Copyright (c) 2020, 2026 , Arm Limited. All rights reserved.
44// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
55//
66// This code is free software; you can redistribute it and/or modify it
@@ -237,10 +237,39 @@ source %{
237237 case Op_MinVHF:
238238 case Op_MaxVHF:
239239 case Op_SqrtVHF:
240+ if (UseSVE == 0 && !is_feat_fp16_supported()) {
241+ return false;
242+ }
243+ break;
244+ // At the time of writing this, the Vector API has no half-float (FP16) species.
245+ // Consequently, AddReductionVHF and MulReductionVHF are only produced by the
246+ // auto-vectorizer, which requires strictly ordered semantics for FP reductions.
247+ //
248+ // There is no direct Neon instruction that performs a strictly ordered floating
249+ // point add reduction. Hence, on Neon-only machines, the add reduction operation
250+ // is implemented as a scalarized sequence using the half-precision scalar instruction
251+ // FADD, which requires FEAT_FP16 and ASIMDHP to be available on the target.
252+ // On SVE machines (UseSVE > 0), however, there is a direct instruction (FADDA) that
253+ // implements strictly ordered floating point add reduction; it does not require
254+ // the FEAT_FP16 and ASIMDHP checks as SVE supports half-precision floats by default.
255+ case Op_AddReductionVHF:
240256 // FEAT_FP16 is enabled if both "fphp" and "asimdhp" features are supported.
241257 // Only the Neon instructions need this check. SVE supports half-precision floats
242258 // by default.
243- if (UseSVE == 0 && !is_feat_fp16_supported()) {
259+ if (length_in_bytes < 8 || (UseSVE == 0 && !is_feat_fp16_supported())) {
260+ return false;
261+ }
262+ break;
263+ case Op_MulReductionVHF:
264+ // There are no direct Neon/SVE instructions that perform strictly ordered
265+ // floating point multiply reduction.
266+ // For vector length <= 16 bytes, the reduction is implemented as a scalarized
267+ // sequence using the half-precision scalar instruction FMUL. This path requires
268+ // FEAT_FP16 and ASIMDHP to be available on the target.
269+ // For vector length > 16 bytes, this operation is disabled because there is no
270+ // direct SVE instruction that performs a strictly ordered FP16 multiply
271+ // reduction.
272+ if (length_in_bytes < 8 || length_in_bytes > 16 || !is_feat_fp16_supported()) {
244273 return false;
245274 }
246275 break;
@@ -290,6 +319,7 @@ source %{
290319 case Op_VectorRearrange:
291320 case Op_MulReductionVD:
292321 case Op_MulReductionVF:
322+ case Op_MulReductionVHF:
293323 case Op_MulReductionVI:
294324 case Op_MulReductionVL:
295325 case Op_CompressBitsV:
@@ -354,6 +384,7 @@ source %{
354384 case Op_VectorMaskCmp:
355385 case Op_LoadVectorGather:
356386 case Op_StoreVectorScatter:
387+ case Op_AddReductionVHF:
357388 case Op_AddReductionVF:
358389 case Op_AddReductionVD:
359390 case Op_AndReductionV:
@@ -2063,6 +2094,25 @@ instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vR
20632094 ins_pipe(pipe_slow);
20642095%}
20652096dnl
2097+
2098+ // Strictly ordered add reduction for half-precision floats (FP16) on Neon-only targets.
2099+ // Neon does not provide a direct instruction for strictly ordered floating-point add reduction.
2100+ // This rule is therefore selected only when UseSVE == 0 and emits a sequence of scalar additions
2101+ // via neon_reduce_add_fp16: each lane of $vsrc is extracted and accumulated into the running sum
2102+ // (scalar input $fsrc), producing the final scalar result in $dst. $tmp is a scratch register.
2103+ instruct reduce_addHF_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2104+ predicate(UseSVE == 0);
2105+ match(Set dst (AddReductionVHF fsrc vsrc));
2106+ effect(TEMP_DEF dst, TEMP tmp);
2107+ format %{ "reduce_addHF $dst, $fsrc, $vsrc\t# 4HF/8HF. KILL $tmp" %}
2108+ ins_encode %{
2109+ uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
2110+ __ neon_reduce_add_fp16($dst$$FloatRegister, $fsrc$$FloatRegister,
2111+ $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
2112+ %}
2113+ ins_pipe(pipe_slow);
2114+ %}
2115+ dnl
20662116dnl REDUCE_ADD_FP_SVE($1, $2 )
20672117dnl REDUCE_ADD_FP_SVE(type, size)
20682118define ( `REDUCE_ADD_FP_SVE' , `
@@ -2074,21 +2124,26 @@ define(`REDUCE_ADD_FP_SVE', `
20742124// strictly ordered.
20752125// 2. Strictly-ordered AddReductionV$1 . For example - AddReductionV$1 generated by
20762126// auto-vectorization on SVE machine.
2077- instruct reduce_add$1 _sve ( vReg$1 dst_src1 , vReg src2 ) %{
2078- predicate ( !VM_Version::use_neon_for_vector ( Matcher::vector_length_in_bytes ( n->in ( 2 ))) ||
2079- n->as_Reduction ( ) - >requires_strict_order ( )) ;
2127+ instruct reduce_add$1 _sve ( vReg`'ifelse ( $1 , HF , F , $1 ) dst_src1 , vReg src2 ) %{
2128+ ifelse ( $1 , HF ,
2129+ `predicate ( UseSVE > 0 ) ;' ,
2130+ `predicate ( !VM_Version::use_neon_for_vector ( Matcher::vector_length_in_bytes ( n->in ( 2 ))) ||
2131+ n->as_Reduction ( ) - >requires_strict_order ( )) ;' )
20802132 match ( Set dst_src1 ( AddReductionV$1 dst_src1 src2 )) ;
20812133 format %{ "reduce_add$1 _sve $dst_src1 , $dst_src1 , $src2" %}
20822134 ins_encode %{
2083- assert ( UseSVE > 0 , "must be sve" ) ;
2084- uint length_in_bytes = Matcher::vector_length_in_bytes ( this , $src2 ) ;
2135+ ifelse ( $1 , HF , `' ,
2136+ `assert ( UseSVE > 0 , "must be sve" ) ;
2137+ ' ) dnl
2138+ uint length_in_bytes = Matcher::vector_length_in_bytes ( this , $src2 ) ;
20852139 assert ( length_in_bytes == MaxVectorSize , "invalid vector length" ) ;
20862140 __ sve_fadda ( $dst_src1$$FloatRegister , __ $2 , ptrue , $src2$$FloatRegister ) ;
20872141 %}
20882142 ins_pipe ( pipe_slow ) ;
20892143%}' ) dnl
20902144dnl
2091- REDUCE_ADD_FP_SVE(F, S)
2145+ REDUCE_ADD_FP_SVE(HF, H)
2146+ REDUCE_ADD_FP_SVE(F, S)
20922147
20932148// reduction addD
20942149
@@ -2129,21 +2184,30 @@ dnl
21292184dnl REDUCE_ADD_FP_PREDICATE($1, $2 )
21302185dnl REDUCE_ADD_FP_PREDICATE(insn_name, op_name)
21312186define ( `REDUCE_ADD_FP_PREDICATE' , `
2132- instruct reduce_add$1 _masked ( vReg$1 dst_src1 , vReg src2 , pRegGov pg ) %{
2187+ instruct reduce_add$1 _masked ( vReg$2 dst_src1 , vReg src2 , pRegGov pg ) %{
21332188 predicate ( UseSVE > 0 ) ;
2134- match ( Set dst_src1 ( AddReductionV$1 ( Binary dst_src1 src2 ) pg )) ;
2189+ ifelse ( $2 , F ,
2190+ `match ( Set dst_src1 ( AddReductionVHF ( Binary dst_src1 src2 ) pg )) ;
2191+ match ( Set dst_src1 ( AddReductionV$2 ( Binary dst_src1 src2 ) pg )) ;' ,
2192+ `match ( Set dst_src1 ( AddReductionV$2 ( Binary dst_src1 src2 ) pg )) ;' )
21352193 format %{ "reduce_add$1 _masked $dst_src1 , $pg , $dst_src1 , $src2" %}
21362194 ins_encode %{
2137- __ sve_fadda ( $dst_src1$$FloatRegister , __ $2 ,
2138- $pg$$PRegister , $src2$$FloatRegister ) ;
2195+ ifelse ( $2 , F ,
2196+ `BasicType bt = Matcher::vector_element_basic_type ( this , $src2 ) ;
2197+ ' ,) dnl
2198+ ifelse ( $2 , F ,
2199+ `__ sve_fadda ( $dst_src1$$FloatRegister , __ elemType_to_regVariant ( bt ) ,
2200+ $pg$$PRegister , $src2$$FloatRegister ) ;' ,
2201+ `__ sve_fadda ( $dst_src1$$FloatRegister , __ $2 ,
2202+ $pg$$PRegister , $src2$$FloatRegister ) ;' )
21392203 %}
21402204 ins_pipe ( pipe_slow ) ;
21412205%}' ) dnl
21422206dnl
21432207REDUCE_ADD_INT_PREDICATE(I, iRegIorL2I)
21442208REDUCE_ADD_INT_PREDICATE(L, iRegL)
2145- REDUCE_ADD_FP_PREDICATE(F, S )
2146- REDUCE_ADD_FP_PREDICATE(D, D)
2209+ REDUCE_ADD_FP_PREDICATE(FHF, F )
2210+ REDUCE_ADD_FP_PREDICATE(D, D)
21472211
21482212// ------------------------------ Vector reduction mul -------------------------
21492213
@@ -2176,30 +2240,37 @@ instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{
21762240 ins_pipe(pipe_slow);
21772241%}
21782242
2179- instruct reduce_mulF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2180- predicate(Matcher::vector_length_in_bytes(n->in(2)) <= 16);
2181- match(Set dst (MulReductionVF fsrc vsrc));
2243+ dnl REDUCE_MUL_FP($1, $2 )
2244+ dnl REDUCE_MUL_FP(insn_name, op_name)
2245+ define ( `REDUCE_MUL_FP' , `
2246+ instruct reduce_mul$1 ( vReg$2 dst , vReg$2 ifelse ( $2 , F , fsrc , dsrc ) , vReg vsrc , vReg tmp ) %{
2247+ predicate ( Matcher::vector_length_in_bytes ( n->in ( 2 )) ifelse ( $2 , F , <= , == ) 16 ) ;
2248+ ifelse ( $2 , F ,
2249+ `match ( Set dst ( MulReductionVHF fsrc vsrc )) ;
2250+ match ( Set dst ( MulReductionV$2 fsrc vsrc )) ;' ,
2251+ `match ( Set dst ( MulReductionV$2 dsrc vsrc )) ;' )
21822252 effect ( TEMP_DEF dst , TEMP tmp ) ;
2183- format %{ "reduce_mulF $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %}
2184- ins_encode %{
2185- uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
2186- __ neon_reduce_mul_fp($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister,
2187- $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
2253+ ifelse ( $2 , F ,
2254+ `format %{ "reduce_mul$1 $dst , $fsrc , $vsrc\t# 2F/4F/4HF/8HF. KILL $tmp" %}' ,
2255+ `format %{ "reduce_mul$1 $dst , $dsrc , $vsrc\t# 2D. KILL $tmp" %}' )
2256+ ins_encode %{
2257+ ifelse ( $2 , F ,
2258+ `uint length_in_bytes = Matcher::vector_length_in_bytes ( this , $vsrc ) ;
2259+ ' ,) dnl
2260+ ifelse ( $2 , F ,
2261+ `BasicType bt = Matcher::vector_element_basic_type ( this , $vsrc ) ;
2262+ ' ,) dnl
2263+ ifelse ( $2 , F ,
2264+ `__ neon_reduce_mul_fp ( $dst$$FloatRegister , bt , $fsrc$$FloatRegister ,
2265+ $vsrc$$FloatRegister , length_in_bytes , $tmp$$FloatRegister ) ;' ,
2266+ `__ neon_reduce_mul_fp ( $dst$$FloatRegister , T_DOUBLE , $dsrc$$FloatRegister ,
2267+ $vsrc$$FloatRegister , 16 , $tmp$$FloatRegister ) ;' )
21882268 %}
21892269 ins_pipe ( pipe_slow ) ;
2190- %}
2191-
2192- instruct reduce_mulD(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
2193- predicate(Matcher::vector_length_in_bytes(n->in(2)) == 16);
2194- match(Set dst (MulReductionVD dsrc vsrc));
2195- effect(TEMP_DEF dst, TEMP tmp);
2196- format %{ "reduce_mulD $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}
2197- ins_encode %{
2198- __ neon_reduce_mul_fp($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister,
2199- $vsrc$$FloatRegister, 16, $tmp$$FloatRegister);
2200- %}
2201- ins_pipe(pipe_slow);
2202- %}
2270+ %}' ) dnl
2271+ dnl
2272+ REDUCE_MUL_FP(FHF, F)
2273+ REDUCE_MUL_FP(D, D)
22032274
22042275dnl
22052276dnl REDUCE_BITWISE_OP_NEON($1, $2 $3 $4 )
0 commit comments