Skip to content

Commit 3384c67

Browse files
Bhavana Kilambi authored and Jatin Bhateja committed
8366444: Add support for add/mul reduction operations for Float16
Reviewed-by: jbhateja, mchevalier, xgong, epeter
1 parent aece6f4 commit 3384c67

14 files changed

Lines changed: 819 additions & 68 deletions

File tree

src/hotspot/cpu/aarch64/aarch64_vector.ad

Lines changed: 82 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//
22
// Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3-
// Copyright (c) 2020, 2025, Arm Limited. All rights reserved.
3+
// Copyright (c) 2020, 2026, Arm Limited. All rights reserved.
44
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
55
//
66
// This code is free software; you can redistribute it and/or modify it
@@ -247,10 +247,39 @@ source %{
247247
case Op_MinVHF:
248248
case Op_MaxVHF:
249249
case Op_SqrtVHF:
250+
if (UseSVE == 0 && !is_feat_fp16_supported()) {
251+
return false;
252+
}
253+
break;
254+
// At the time of writing this, the Vector API has no half-float (FP16) species.
255+
// Consequently, AddReductionVHF and MulReductionVHF are only produced by the
256+
// auto-vectorizer, which requires strictly ordered semantics for FP reductions.
257+
//
258+
// There is no direct Neon instruction that performs strictly ordered floating
259+
// point add reduction. Hence, on Neon only machines, the add reduction operation
260+
// is implemented as a scalarized sequence using half-precision scalar instruction
261+
// FADD which requires FEAT_FP16 and ASIMDHP to be available on the target.
262+
// On SVE machines (UseSVE > 0) however, there is a direct instruction (FADDA) which
263+
// implements strictly ordered floating point add reduction which does not require
264+
// the FEAT_FP16 and ASIMDHP checks as SVE supports half-precision floats by default.
265+
case Op_AddReductionVHF:
250266
// FEAT_FP16 is enabled if both "fphp" and "asimdhp" features are supported.
251267
// Only the Neon instructions need this check. SVE supports half-precision floats
252268
// by default.
253-
if (UseSVE == 0 && !is_feat_fp16_supported()) {
269+
if (length_in_bytes < 8 || (UseSVE == 0 && !is_feat_fp16_supported())) {
270+
return false;
271+
}
272+
break;
273+
case Op_MulReductionVHF:
274+
// There are no direct Neon/SVE instructions that perform strictly ordered
275+
// floating point multiply reduction.
276+
// For vector length ≤ 16 bytes, the reduction is implemented as a scalarized
277+
// sequence using half-precision scalar instruction FMUL. This path requires
278+
// FEAT_FP16 and ASIMDHP to be available on the target.
279+
// For vector length > 16 bytes, this operation is disabled because there is no
280+
// direct SVE instruction that performs a strictly ordered FP16 multiply
281+
// reduction.
282+
if (length_in_bytes < 8 || length_in_bytes > 16 || !is_feat_fp16_supported()) {
254283
return false;
255284
}
256285
break;
@@ -300,6 +329,7 @@ source %{
300329
case Op_VectorRearrange:
301330
case Op_MulReductionVD:
302331
case Op_MulReductionVF:
332+
case Op_MulReductionVHF:
303333
case Op_MulReductionVI:
304334
case Op_MulReductionVL:
305335
case Op_CompressBitsV:
@@ -364,6 +394,7 @@ source %{
364394
case Op_VectorMaskCmp:
365395
case Op_LoadVectorGather:
366396
case Op_StoreVectorScatter:
397+
case Op_AddReductionVHF:
367398
case Op_AddReductionVF:
368399
case Op_AddReductionVD:
369400
case Op_AndReductionV:
@@ -3402,6 +3433,44 @@ instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vR
34023433
ins_pipe(pipe_slow);
34033434
%}
34043435

3436+
// Add Reduction for Half floats (FP16).
3437+
// Neon does not provide direct instructions for strictly ordered floating-point add reductions.
3438+
// On Neon-only targets (UseSVE = 0), this operation is implemented as a sequence of scalar additions:
3439+
// as many values as the vector holds are loaded into a vector register, each lane is extracted,
3440+
// and its value is accumulated into the running sum, producing a final scalar result.
3441+
instruct reduce_addHF_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
3442+
predicate(UseSVE == 0);
3443+
match(Set dst (AddReductionVHF fsrc vsrc));
3444+
effect(TEMP_DEF dst, TEMP tmp);
3445+
format %{ "reduce_addHF $dst, $fsrc, $vsrc\t# 4HF/8HF. KILL $tmp" %}
3446+
ins_encode %{
3447+
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
3448+
__ neon_reduce_add_fp16($dst$$FloatRegister, $fsrc$$FloatRegister,
3449+
$vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
3450+
%}
3451+
ins_pipe(pipe_slow);
3452+
%}
3453+
3454+
// This rule calculates the reduction result in strict order. Two cases will
3455+
// reach here:
3456+
// 1. Non strictly-ordered AddReductionVHF when vector size > 128-bits. For example -
3457+
// AddReductionVHF generated by Vector API (which has no FP16 species at the time of writing). For vector size > 128-bits, it is more
3458+
// beneficial performance-wise to generate direct SVE instruction even if it is
3459+
// strictly ordered.
3460+
// 2. Strictly-ordered AddReductionVHF. For example - AddReductionVHF generated by
3461+
// auto-vectorization on SVE machine.
3462+
instruct reduce_addHF_sve(vRegF dst_src1, vReg src2) %{
3463+
predicate(UseSVE > 0);
3464+
match(Set dst_src1 (AddReductionVHF dst_src1 src2));
3465+
format %{ "reduce_addHF_sve $dst_src1, $dst_src1, $src2" %}
3466+
ins_encode %{
3467+
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
3468+
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
3469+
__ sve_fadda($dst_src1$$FloatRegister, __ H, ptrue, $src2$$FloatRegister);
3470+
%}
3471+
ins_pipe(pipe_slow);
3472+
%}
3473+
34053474
// This rule calculates the reduction result in strict order. Two cases will
34063475
// reach here:
34073476
// 1. Non strictly-ordered AddReductionVF when vector size > 128-bits. For example -
@@ -3492,12 +3561,14 @@ instruct reduce_addL_masked(iRegLNoSp dst, iRegL isrc, vReg vsrc, pRegGov pg, vR
34923561
ins_pipe(pipe_slow);
34933562
%}
34943563

3495-
instruct reduce_addF_masked(vRegF dst_src1, vReg src2, pRegGov pg) %{
3564+
instruct reduce_addFHF_masked(vRegF dst_src1, vReg src2, pRegGov pg) %{
34963565
predicate(UseSVE > 0);
3566+
match(Set dst_src1 (AddReductionVHF (Binary dst_src1 src2) pg));
34973567
match(Set dst_src1 (AddReductionVF (Binary dst_src1 src2) pg));
3498-
format %{ "reduce_addF_masked $dst_src1, $pg, $dst_src1, $src2" %}
3568+
format %{ "reduce_addFHF_masked $dst_src1, $pg, $dst_src1, $src2" %}
34993569
ins_encode %{
3500-
__ sve_fadda($dst_src1$$FloatRegister, __ S,
3570+
BasicType bt = Matcher::vector_element_basic_type(this, $src2);
3571+
__ sve_fadda($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
35013572
$pg$$PRegister, $src2$$FloatRegister);
35023573
%}
35033574
ins_pipe(pipe_slow);
@@ -3545,14 +3616,17 @@ instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{
35453616
ins_pipe(pipe_slow);
35463617
%}
35473618

3548-
instruct reduce_mulF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
3619+
3620+
instruct reduce_mulFHF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
35493621
predicate(Matcher::vector_length_in_bytes(n->in(2)) <= 16);
3622+
match(Set dst (MulReductionVHF fsrc vsrc));
35503623
match(Set dst (MulReductionVF fsrc vsrc));
35513624
effect(TEMP_DEF dst, TEMP tmp);
3552-
format %{ "reduce_mulF $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %}
3625+
format %{ "reduce_mulFHF $dst, $fsrc, $vsrc\t# 2F/4F/4HF/8HF. KILL $tmp" %}
35533626
ins_encode %{
35543627
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
3555-
__ neon_reduce_mul_fp($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister,
3628+
BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
3629+
__ neon_reduce_mul_fp($dst$$FloatRegister, bt, $fsrc$$FloatRegister,
35563630
$vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
35573631
%}
35583632
ins_pipe(pipe_slow);

src/hotspot/cpu/aarch64/aarch64_vector_ad.m4

Lines changed: 106 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//
22
// Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3-
// Copyright (c) 2020, 2025, Arm Limited. All rights reserved.
3+
// Copyright (c) 2020, 2026, Arm Limited. All rights reserved.
44
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
55
//
66
// This code is free software; you can redistribute it and/or modify it
@@ -237,10 +237,39 @@ source %{
237237
case Op_MinVHF:
238238
case Op_MaxVHF:
239239
case Op_SqrtVHF:
240+
if (UseSVE == 0 && !is_feat_fp16_supported()) {
241+
return false;
242+
}
243+
break;
244+
// At the time of writing this, the Vector API has no half-float (FP16) species.
245+
// Consequently, AddReductionVHF and MulReductionVHF are only produced by the
246+
// auto-vectorizer, which requires strictly ordered semantics for FP reductions.
247+
//
248+
// There is no direct Neon instruction that performs strictly ordered floating
249+
// point add reduction. Hence, on Neon only machines, the add reduction operation
250+
// is implemented as a scalarized sequence using half-precision scalar instruction
251+
// FADD which requires FEAT_FP16 and ASIMDHP to be available on the target.
252+
// On SVE machines (UseSVE > 0) however, there is a direct instruction (FADDA) which
253+
// implements strictly ordered floating point add reduction which does not require
254+
// the FEAT_FP16 and ASIMDHP checks as SVE supports half-precision floats by default.
255+
case Op_AddReductionVHF:
240256
// FEAT_FP16 is enabled if both "fphp" and "asimdhp" features are supported.
241257
// Only the Neon instructions need this check. SVE supports half-precision floats
242258
// by default.
243-
if (UseSVE == 0 && !is_feat_fp16_supported()) {
259+
if (length_in_bytes < 8 || (UseSVE == 0 && !is_feat_fp16_supported())) {
260+
return false;
261+
}
262+
break;
263+
case Op_MulReductionVHF:
264+
// There are no direct Neon/SVE instructions that perform strictly ordered
265+
// floating point multiply reduction.
266+
// For vector length ≤ 16 bytes, the reduction is implemented as a scalarized
267+
// sequence using half-precision scalar instruction FMUL. This path requires
268+
// FEAT_FP16 and ASIMDHP to be available on the target.
269+
// For vector length > 16 bytes, this operation is disabled because there is no
270+
// direct SVE instruction that performs a strictly ordered FP16 multiply
271+
// reduction.
272+
if (length_in_bytes < 8 || length_in_bytes > 16 || !is_feat_fp16_supported()) {
244273
return false;
245274
}
246275
break;
@@ -290,6 +319,7 @@ source %{
290319
case Op_VectorRearrange:
291320
case Op_MulReductionVD:
292321
case Op_MulReductionVF:
322+
case Op_MulReductionVHF:
293323
case Op_MulReductionVI:
294324
case Op_MulReductionVL:
295325
case Op_CompressBitsV:
@@ -354,6 +384,7 @@ source %{
354384
case Op_VectorMaskCmp:
355385
case Op_LoadVectorGather:
356386
case Op_StoreVectorScatter:
387+
case Op_AddReductionVHF:
357388
case Op_AddReductionVF:
358389
case Op_AddReductionVD:
359390
case Op_AndReductionV:
@@ -2063,6 +2094,25 @@ instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vR
20632094
ins_pipe(pipe_slow);
20642095
%}
20652096
dnl
2097+
2098+
// Add Reduction for Half floats (FP16).
2099+
// Neon does not provide direct instructions for strictly ordered floating-point add reductions.
2100+
// On Neon-only targets (UseSVE = 0), this operation is implemented as a sequence of scalar additions:
2101+
// as many values as the vector holds are loaded into a vector register, each lane is extracted,
2102+
// and its value is accumulated into the running sum, producing a final scalar result.
2103+
instruct reduce_addHF_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2104+
predicate(UseSVE == 0);
2105+
match(Set dst (AddReductionVHF fsrc vsrc));
2106+
effect(TEMP_DEF dst, TEMP tmp);
2107+
format %{ "reduce_addHF $dst, $fsrc, $vsrc\t# 4HF/8HF. KILL $tmp" %}
2108+
ins_encode %{
2109+
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
2110+
__ neon_reduce_add_fp16($dst$$FloatRegister, $fsrc$$FloatRegister,
2111+
$vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
2112+
%}
2113+
ins_pipe(pipe_slow);
2114+
%}
2115+
dnl
20662116
dnl REDUCE_ADD_FP_SVE($1, $2 )
20672117
dnl REDUCE_ADD_FP_SVE(type, size)
20682118
define(`REDUCE_ADD_FP_SVE', `
@@ -2074,21 +2124,26 @@ define(`REDUCE_ADD_FP_SVE', `
20742124
// strictly ordered.
20752125
// 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by
20762126
// auto-vectorization on SVE machine.
2077-
instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{
2078-
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
2079-
n->as_Reduction()->requires_strict_order());
2127+
instruct reduce_add$1_sve(vReg`'ifelse($1, HF, F, $1) dst_src1, vReg src2) %{
2128+
ifelse($1, HF,
2129+
`predicate(UseSVE > 0);',
2130+
`predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
2131+
n->as_Reduction()->requires_strict_order());')
20802132
match(Set dst_src1 (AddReductionV$1 dst_src1 src2));
20812133
format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %}
20822134
ins_encode %{
2083-
assert(UseSVE > 0, "must be sve");
2084-
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
2135+
ifelse($1, HF, `',
2136+
`assert(UseSVE > 0, "must be sve");
2137+
')dnl
2138+
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
20852139
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
20862140
__ sve_fadda($dst_src1$$FloatRegister, __ $2, ptrue, $src2$$FloatRegister);
20872141
%}
20882142
ins_pipe(pipe_slow);
20892143
%}')dnl
20902144
dnl
2091-
REDUCE_ADD_FP_SVE(F, S)
2145+
REDUCE_ADD_FP_SVE(HF, H)
2146+
REDUCE_ADD_FP_SVE(F, S)
20922147

20932148
// reduction addD
20942149

@@ -2129,21 +2184,30 @@ dnl
21292184
dnl REDUCE_ADD_FP_PREDICATE($1, $2 )
21302185
dnl REDUCE_ADD_FP_PREDICATE(insn_name, op_name)
21312186
define(`REDUCE_ADD_FP_PREDICATE', `
2132-
instruct reduce_add$1_masked(vReg$1 dst_src1, vReg src2, pRegGov pg) %{
2187+
instruct reduce_add$1_masked(vReg$2 dst_src1, vReg src2, pRegGov pg) %{
21332188
predicate(UseSVE > 0);
2134-
match(Set dst_src1 (AddReductionV$1 (Binary dst_src1 src2) pg));
2189+
ifelse($2, F,
2190+
`match(Set dst_src1 (AddReductionVHF (Binary dst_src1 src2) pg));
2191+
match(Set dst_src1 (AddReductionV$2 (Binary dst_src1 src2) pg));',
2192+
`match(Set dst_src1 (AddReductionV$2 (Binary dst_src1 src2) pg));')
21352193
format %{ "reduce_add$1_masked $dst_src1, $pg, $dst_src1, $src2" %}
21362194
ins_encode %{
2137-
__ sve_fadda($dst_src1$$FloatRegister, __ $2,
2138-
$pg$$PRegister, $src2$$FloatRegister);
2195+
ifelse($2, F,
2196+
`BasicType bt = Matcher::vector_element_basic_type(this, $src2);
2197+
',)dnl
2198+
ifelse($2, F,
2199+
`__ sve_fadda($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
2200+
$pg$$PRegister, $src2$$FloatRegister);',
2201+
`__ sve_fadda($dst_src1$$FloatRegister, __ $2,
2202+
$pg$$PRegister, $src2$$FloatRegister);')
21392203
%}
21402204
ins_pipe(pipe_slow);
21412205
%}')dnl
21422206
dnl
21432207
REDUCE_ADD_INT_PREDICATE(I, iRegIorL2I)
21442208
REDUCE_ADD_INT_PREDICATE(L, iRegL)
2145-
REDUCE_ADD_FP_PREDICATE(F, S)
2146-
REDUCE_ADD_FP_PREDICATE(D, D)
2209+
REDUCE_ADD_FP_PREDICATE(FHF, F)
2210+
REDUCE_ADD_FP_PREDICATE(D, D)
21472211

21482212
// ------------------------------ Vector reduction mul -------------------------
21492213

@@ -2176,30 +2240,37 @@ instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{
21762240
ins_pipe(pipe_slow);
21772241
%}
21782242

2179-
instruct reduce_mulF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2180-
predicate(Matcher::vector_length_in_bytes(n->in(2)) <= 16);
2181-
match(Set dst (MulReductionVF fsrc vsrc));
2243+
dnl REDUCE_MUL_FP($1, $2 )
2244+
dnl REDUCE_MUL_FP(insn_name, op_name)
2245+
define(`REDUCE_MUL_FP', `
2246+
instruct reduce_mul$1(vReg$2 dst, vReg$2 ifelse($2, F, fsrc, dsrc), vReg vsrc, vReg tmp) %{
2247+
predicate(Matcher::vector_length_in_bytes(n->in(2)) ifelse($2, F, <=, ==) 16);
2248+
ifelse($2, F,
2249+
`match(Set dst (MulReductionVHF fsrc vsrc));
2250+
match(Set dst (MulReductionV$2 fsrc vsrc));',
2251+
`match(Set dst (MulReductionV$2 dsrc vsrc));')
21822252
effect(TEMP_DEF dst, TEMP tmp);
2183-
format %{ "reduce_mulF $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %}
2184-
ins_encode %{
2185-
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
2186-
__ neon_reduce_mul_fp($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister,
2187-
$vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
2253+
ifelse($2, F,
2254+
`format %{ "reduce_mul$1 $dst, $fsrc, $vsrc\t# 2F/4F/4HF/8HF. KILL $tmp" %}',
2255+
`format %{ "reduce_mul$1 $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}')
2256+
ins_encode %{
2257+
ifelse($2, F,
2258+
`uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
2259+
',)dnl
2260+
ifelse($2, F,
2261+
`BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
2262+
',)dnl
2263+
ifelse($2, F,
2264+
`__ neon_reduce_mul_fp($dst$$FloatRegister, bt, $fsrc$$FloatRegister,
2265+
$vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);',
2266+
`__ neon_reduce_mul_fp($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister,
2267+
$vsrc$$FloatRegister, 16, $tmp$$FloatRegister);')
21882268
%}
21892269
ins_pipe(pipe_slow);
2190-
%}
2191-
2192-
instruct reduce_mulD(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
2193-
predicate(Matcher::vector_length_in_bytes(n->in(2)) == 16);
2194-
match(Set dst (MulReductionVD dsrc vsrc));
2195-
effect(TEMP_DEF dst, TEMP tmp);
2196-
format %{ "reduce_mulD $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}
2197-
ins_encode %{
2198-
__ neon_reduce_mul_fp($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister,
2199-
$vsrc$$FloatRegister, 16, $tmp$$FloatRegister);
2200-
%}
2201-
ins_pipe(pipe_slow);
2202-
%}
2270+
%}')dnl
2271+
dnl
2272+
REDUCE_MUL_FP(FHF, F)
2273+
REDUCE_MUL_FP(D, D)
22032274

22042275
dnl
22052276
dnl REDUCE_BITWISE_OP_NEON($1, $2 $3 $4 )

0 commit comments

Comments
 (0)