@@ -85,16 +85,81 @@ typedef struct rarch_sinc_resampler
8585 float kaiser_beta ;
8686} rarch_sinc_resampler_t ;
8787
88- #if (defined(__ARM_NEON__ ) && !defined(DONT_WANT_ARM_OPTIMIZATIONS )) || defined(HAVE_NEON )
89- #if TARGET_OS_IPHONE
90- #else
91- #ifndef WANT_NEON
92- #define WANT_NEON
93- #endif
94- #endif
95- #endif
88+ #if (defined(__ARM_NEON__ ) || defined(HAVE_NEON ))
89+ #ifdef DONT_WANT_ARM_ASM_OPTIMIZATIONS
90+ #include <arm_neon.h>
9691
97- #ifdef WANT_NEON
92+ /* Assumes that taps >= 8, and that taps is a multiple of 8. */
93+ static void resampler_sinc_process_neon_intrin (void * re_ , struct resampler_data * data )
94+ {
95+ rarch_sinc_resampler_t * resamp = (rarch_sinc_resampler_t * )re_ ;
96+ unsigned phases = 1 << (resamp -> phase_bits + resamp -> subphase_bits );
97+
98+ uint32_t ratio = phases / data -> ratio ;
99+ const float * input = data -> data_in ;
100+ float * output = data -> data_out ;
101+ size_t frames = data -> input_frames ;
102+ size_t out_frames = 0 ;
103+
104+ while (frames )
105+ {
106+ while (resamp -> time >= phases )
107+ {
108+ /* Push in reverse to make filter more obvious. */
109+ if (!resamp -> ptr )
110+ resamp -> ptr = resamp -> taps ;
111+ resamp -> ptr -- ;
112+
113+ resamp -> buffer_l [resamp -> ptr + resamp -> taps ] =
114+ resamp -> buffer_l [resamp -> ptr ] = * input ++ ;
115+
116+ resamp -> buffer_r [resamp -> ptr + resamp -> taps ] =
117+ resamp -> buffer_r [resamp -> ptr ] = * input ++ ;
118+
119+ resamp -> time -= phases ;
120+ frames -- ;
121+ }
122+
123+ {
124+ const float * buffer_l = resamp -> buffer_l + resamp -> ptr ;
125+ const float * buffer_r = resamp -> buffer_r + resamp -> ptr ;
126+ unsigned taps = resamp -> taps ;
127+ while (resamp -> time < phases )
128+ {
129+ int i ;
130+ unsigned phase = resamp -> time >> resamp -> subphase_bits ;
131+ const float * phase_table = resamp -> phase_table + phase * taps ;
132+
133+ float32x4_t p1 = {0 , 0 , 0 , 0 }, p2 = {0 , 0 , 0 , 0 };
134+ float32x2_t p3 , p4 ;
135+
136+ for (i = 0 ; i < taps ; i += 8 )
137+ {
138+ float32x4x2_t coeff8 = vld2q_f32 (& phase_table [i ]);
139+ float32x4x2_t left8 = vld2q_f32 (& buffer_l [i ]);
140+ float32x4x2_t right8 = vld2q_f32 (& buffer_r [i ]);
141+
142+ p1 = vmlaq_f32 (p1 , left8 .val [0 ], coeff8 .val [0 ]);
143+ p2 = vmlaq_f32 (p2 , right8 .val [0 ], coeff8 .val [0 ]);
144+ p1 = vmlaq_f32 (p1 , left8 .val [1 ], coeff8 .val [1 ]);
145+ p2 = vmlaq_f32 (p2 , right8 .val [1 ], coeff8 .val [1 ]);
146+ }
147+
148+ p3 = vadd_f32 (vget_low_f32 (p1 ), vget_high_f32 (p1 ));
149+ p4 = vadd_f32 (vget_low_f32 (p2 ), vget_high_f32 (p2 ));
150+ vst1_f32 (output , vpadd_f32 (p3 , p4 ));
151+
152+
153+ output += 2 ;
154+ out_frames ++ ;
155+ resamp -> time += ratio ;
156+ }
157+ }
158+ }
159+
160+ data -> output_frames = out_frames ;
161+ }
162+ #else
98163/* Assumes that taps >= 8, and that taps is a multiple of 8. */
99164void process_sinc_neon_asm (float * out , const float * left ,
100165 const float * right , const float * coeff , unsigned taps );
@@ -150,6 +215,7 @@ static void resampler_sinc_process_neon(void *re_, struct resampler_data *data)
150215 data -> output_frames = out_frames ;
151216}
152217#endif
218+ #endif
153219
154220#if defined(__AVX__ )
155221static void resampler_sinc_process_avx_kaiser (void * re_ , struct resampler_data * data )
@@ -844,7 +910,7 @@ static void *resampler_sinc_new(const struct resampler_config *config,
844910 else
845911#endif
846912 {
847- #if defined(WANT_NEON )
913+ #if ( defined(__ARM_NEON__ ) || defined( HAVE_NEON ) )
848914 re -> taps = (re -> taps + 7 ) & ~7 ;
849915#else
850916 re -> taps = (re -> taps + 3 ) & ~3 ;
@@ -902,8 +968,12 @@ static void *resampler_sinc_new(const struct resampler_config *config,
902968 }
903969 else if (mask & RESAMPLER_SIMD_NEON && window_type != SINC_WINDOW_KAISER )
904970 {
905- #if defined(WANT_NEON )
971+ #if (defined(__ARM_NEON__ ) || defined(HAVE_NEON ))
972+ #ifdef DONT_WANT_ARM_ASM_OPTIMIZATIONS
973+ sinc_resampler .process = resampler_sinc_process_neon_intrin ;
974+ #else
906975 sinc_resampler .process = resampler_sinc_process_neon ;
976+ #endif
907977#endif
908978 }
909979
@@ -922,5 +992,3 @@ retro_resampler_t sinc_resampler = {
922992 "sinc" ,
923993 "sinc"
924994};
925-
926- #undef WANT_NEON
0 commit comments