|
1 | 1 | // |
2 | 2 | // Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved. |
3 | | -// Copyright (c) 2020, 2025, Arm Limited. All rights reserved. |
| 3 | +// Copyright (c) 2020, 2026, Arm Limited. All rights reserved. |
4 | 4 | // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
5 | 5 | // |
6 | 6 | // This code is free software; you can redistribute it and/or modify it |
@@ -247,10 +247,39 @@ source %{ |
247 | 247 | case Op_MinVHF: |
248 | 248 | case Op_MaxVHF: |
249 | 249 | case Op_SqrtVHF: |
| 250 | + if (UseSVE == 0 && !is_feat_fp16_supported()) { |
| 251 | + return false; |
| 252 | + } |
| 253 | + break; |
| 254 | + // At the time of writing this, the Vector API has no half-float (FP16) species. |
| 255 | + // Consequently, AddReductionVHF and MulReductionVHF are only produced by the |
| 256 | + // auto-vectorizer, which requires strictly ordered semantics for FP reductions. |
| 257 | + // |
| 258 | + // There is no direct Neon instruction that performs strictly ordered floating |
| 259 | + // point add reduction. Hence, on Neon-only machines, the add reduction operation |
| 260 | + // is implemented as a scalarized sequence using half-precision scalar instruction |
| 261 | + // FADD which requires FEAT_FP16 and ASIMDHP to be available on the target. |
| 262 | + // On SVE machines (UseSVE > 0) however, there is a direct instruction (FADDA) which |
| 263 | + // implements strictly ordered floating point add reduction which does not require |
| 264 | + // the FEAT_FP16 and ASIMDHP checks as SVE supports half-precision floats by default. |
| 265 | + case Op_AddReductionVHF: |
250 | 266 | // FEAT_FP16 is enabled if both "fphp" and "asimdhp" features are supported. |
251 | 267 | // Only the Neon instructions need this check. SVE supports half-precision floats |
252 | 268 | // by default. |
253 | | - if (UseSVE == 0 && !is_feat_fp16_supported()) { |
| 269 | + if (length_in_bytes < 8 || (UseSVE == 0 && !is_feat_fp16_supported())) { |
| 270 | + return false; |
| 271 | + } |
| 272 | + break; |
| 273 | + case Op_MulReductionVHF: |
| 274 | + // There are no direct Neon/SVE instructions that perform strictly ordered |
| 275 | + // floating point multiply reduction. |
| 276 | + // For vector length <= 16 bytes, the reduction is implemented as a scalarized |
| 277 | + // sequence using half-precision scalar instruction FMUL. This path requires |
| 278 | + // FEAT_FP16 and ASIMDHP to be available on the target. |
| 279 | + // For vector length > 16 bytes, this operation is disabled because there is no |
| 280 | + // direct SVE instruction that performs a strictly ordered FP16 multiply |
| 281 | + // reduction. |
| 282 | + if (length_in_bytes < 8 || length_in_bytes > 16 || !is_feat_fp16_supported()) { |
254 | 283 | return false; |
255 | 284 | } |
256 | 285 | break; |
@@ -300,6 +329,7 @@ source %{ |
300 | 329 | case Op_VectorRearrange: |
301 | 330 | case Op_MulReductionVD: |
302 | 331 | case Op_MulReductionVF: |
| 332 | + case Op_MulReductionVHF: |
303 | 333 | case Op_MulReductionVI: |
304 | 334 | case Op_MulReductionVL: |
305 | 335 | case Op_CompressBitsV: |
@@ -364,6 +394,7 @@ source %{ |
364 | 394 | case Op_VectorMaskCmp: |
365 | 395 | case Op_LoadVectorGather: |
366 | 396 | case Op_StoreVectorScatter: |
| 397 | + case Op_AddReductionVHF: |
367 | 398 | case Op_AddReductionVF: |
368 | 399 | case Op_AddReductionVD: |
369 | 400 | case Op_AndReductionV: |
@@ -597,13 +628,9 @@ instruct vloadcon(vReg dst, immI0 src) %{ |
597 | 628 | BasicType bt = Matcher::vector_element_basic_type(this); |
598 | 629 | if (UseSVE == 0) { |
599 | 630 | uint length_in_bytes = Matcher::vector_length_in_bytes(this); |
| 631 | + int entry_idx = __ vector_iota_entry_index(bt); |
600 | 632 | assert(length_in_bytes <= 16, "must be"); |
601 | | - // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 16. |
602 | | - int offset = exact_log2(type2aelembytes(bt)) << 4; |
603 | | - if (is_floating_point_type(bt)) { |
604 | | - offset += 32; |
605 | | - } |
606 | | - __ lea(rscratch1, ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + offset)); |
| 633 | + __ lea(rscratch1, ExternalAddress(StubRoutines::aarch64::vector_iota_indices(entry_idx))); |
607 | 634 | if (length_in_bytes == 16) { |
608 | 635 | __ ldrq($dst$$FloatRegister, rscratch1); |
609 | 636 | } else { |
@@ -3406,6 +3433,44 @@ instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vR |
3406 | 3433 | ins_pipe(pipe_slow); |
3407 | 3434 | %} |
3408 | 3435 |
|
| 3436 | +// Add Reduction for Half floats (FP16). |
| 3437 | +// Neon does not provide direct instructions for strictly ordered floating-point add reductions. |
| 3438 | +// On Neon-only targets (UseSVE == 0), this operation is implemented as a sequence of scalar additions: |
| 3439 | +// a full vector width of values is loaded into a vector register, each lane is extracted, |
| 3440 | +// and its value is accumulated into the running sum, producing a final scalar result. |
| 3441 | +instruct reduce_addHF_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ |
| 3442 | + predicate(UseSVE == 0); |
| 3443 | + match(Set dst (AddReductionVHF fsrc vsrc)); |
| 3444 | + effect(TEMP_DEF dst, TEMP tmp); |
| 3445 | + format %{ "reduce_addHF $dst, $fsrc, $vsrc\t# 4HF/8HF. KILL $tmp" %} |
| 3446 | + ins_encode %{ |
| 3447 | + uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc); |
| 3448 | + __ neon_reduce_add_fp16($dst$$FloatRegister, $fsrc$$FloatRegister, |
| 3449 | + $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister); |
| 3450 | + %} |
| 3451 | + ins_pipe(pipe_slow); |
| 3452 | +%} |
| 3453 | + |
| 3454 | +// This rule calculates the reduction result in strict order. Two cases will |
| 3455 | +// reach here: |
| 3456 | +// 1. Non strictly-ordered AddReductionVHF when vector size > 128-bits. For example - |
| 3457 | +// AddReductionVHF generated by Vector API. For vector size > 128-bits, it is more |
| 3458 | +// beneficial performance-wise to generate direct SVE instruction even if it is |
| 3459 | +// strictly ordered. |
| 3460 | +// 2. Strictly-ordered AddReductionVHF. For example - AddReductionVHF generated by |
| 3461 | +// auto-vectorization on SVE machine. |
| 3462 | +instruct reduce_addHF_sve(vRegF dst_src1, vReg src2) %{ |
| 3463 | + predicate(UseSVE > 0); |
| 3464 | + match(Set dst_src1 (AddReductionVHF dst_src1 src2)); |
| 3465 | + format %{ "reduce_addHF_sve $dst_src1, $dst_src1, $src2" %} |
| 3466 | + ins_encode %{ |
| 3467 | + uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2); |
| 3468 | + assert(length_in_bytes == MaxVectorSize, "invalid vector length"); |
| 3469 | + __ sve_fadda($dst_src1$$FloatRegister, __ H, ptrue, $src2$$FloatRegister); |
| 3470 | + %} |
| 3471 | + ins_pipe(pipe_slow); |
| 3472 | +%} |
| 3473 | + |
3409 | 3474 | // This rule calculates the reduction result in strict order. Two cases will |
3410 | 3475 | // reach here: |
3411 | 3476 | // 1. Non strictly-ordered AddReductionVF when vector size > 128-bits. For example - |
@@ -3496,12 +3561,14 @@ instruct reduce_addL_masked(iRegLNoSp dst, iRegL isrc, vReg vsrc, pRegGov pg, vR |
3496 | 3561 | ins_pipe(pipe_slow); |
3497 | 3562 | %} |
3498 | 3563 |
|
3499 | | -instruct reduce_addF_masked(vRegF dst_src1, vReg src2, pRegGov pg) %{ |
| 3564 | +instruct reduce_addFHF_masked(vRegF dst_src1, vReg src2, pRegGov pg) %{ |
3500 | 3565 | predicate(UseSVE > 0); |
| 3566 | + match(Set dst_src1 (AddReductionVHF (Binary dst_src1 src2) pg)); |
3501 | 3567 | match(Set dst_src1 (AddReductionVF (Binary dst_src1 src2) pg)); |
3502 | | - format %{ "reduce_addF_masked $dst_src1, $pg, $dst_src1, $src2" %} |
| 3568 | + format %{ "reduce_addFHF_masked $dst_src1, $pg, $dst_src1, $src2" %} |
3503 | 3569 | ins_encode %{ |
3504 | | - __ sve_fadda($dst_src1$$FloatRegister, __ S, |
| 3570 | + BasicType bt = Matcher::vector_element_basic_type(this, $src2); |
| 3571 | + __ sve_fadda($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt), |
3505 | 3572 | $pg$$PRegister, $src2$$FloatRegister); |
3506 | 3573 | %} |
3507 | 3574 | ins_pipe(pipe_slow); |
@@ -3549,14 +3616,17 @@ instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{ |
3549 | 3616 | ins_pipe(pipe_slow); |
3550 | 3617 | %} |
3551 | 3618 |
|
3552 | | -instruct reduce_mulF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ |
| 3619 | + |
| 3620 | +instruct reduce_mulFHF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ |
3553 | 3621 | predicate(Matcher::vector_length_in_bytes(n->in(2)) <= 16); |
| 3622 | + match(Set dst (MulReductionVHF fsrc vsrc)); |
3554 | 3623 | match(Set dst (MulReductionVF fsrc vsrc)); |
3555 | 3624 | effect(TEMP_DEF dst, TEMP tmp); |
3556 | | - format %{ "reduce_mulF $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %} |
| 3625 | + format %{ "reduce_mulFHF $dst, $fsrc, $vsrc\t# 2F/4F/4HF/8HF. KILL $tmp" %} |
3557 | 3626 | ins_encode %{ |
3558 | 3627 | uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc); |
3559 | | - __ neon_reduce_mul_fp($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister, |
| 3628 | + BasicType bt = Matcher::vector_element_basic_type(this, $vsrc); |
| 3629 | + __ neon_reduce_mul_fp($dst$$FloatRegister, bt, $fsrc$$FloatRegister, |
3560 | 3630 | $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister); |
3561 | 3631 | %} |
3562 | 3632 | ins_pipe(pipe_slow); |
|
0 commit comments