@@ -346,27 +346,65 @@ class SimulatorNEON final : public SimulatorBase {
346346 is[k] = vld1q_f32 (p0 + xss[k] + 4 );
347347 }
348348
349- uint64_t j = 0 ;
349+ unsigned k = 0 ;
350+
351+ for (; k + 1 < hsize; k += 2 ) {
352+ const fp_type* v0 = v + 2 * k * hsize;
353+ const fp_type* v1 = v0 + 2 * hsize;
354+
355+ float32x4_t ru0 = vdupq_n_f32 (v0[0 ]);
356+ float32x4_t iu0 = vdupq_n_f32 (v0[1 ]);
357+ float32x4_t rn0 = vmulq_f32 (rs[0 ], ru0);
358+ float32x4_t in0 = vmulq_f32 (rs[0 ], iu0);
359+ rn0 = vfmsq_f32 (rn0, is[0 ], iu0);
360+ in0 = vfmaq_f32 (in0, is[0 ], ru0);
361+
362+ float32x4_t ru1 = vdupq_n_f32 (v1[0 ]);
363+ float32x4_t iu1 = vdupq_n_f32 (v1[1 ]);
364+ float32x4_t rn1 = vmulq_f32 (rs[0 ], ru1);
365+ float32x4_t in1 = vmulq_f32 (rs[0 ], iu1);
366+ rn1 = vfmsq_f32 (rn1, is[0 ], iu1);
367+ in1 = vfmaq_f32 (in1, is[0 ], ru1);
350368
351- for (unsigned k = 0 ; k < hsize; ++k) {
352- float32x4_t ru = vdupq_n_f32 (v[j]);
353- float32x4_t iu = vdupq_n_f32 (v[j + 1 ]);
369+ for (unsigned l = 1 ; l < hsize; ++l) {
370+ ru0 = vdupq_n_f32 (v0[2 * l]);
371+ iu0 = vdupq_n_f32 (v0[2 * l + 1 ]);
372+ rn0 = vfmaq_f32 (rn0, rs[l], ru0);
373+ in0 = vfmaq_f32 (in0, rs[l], iu0);
374+ rn0 = vfmsq_f32 (rn0, is[l], iu0);
375+ in0 = vfmaq_f32 (in0, is[l], ru0);
376+
377+ ru1 = vdupq_n_f32 (v1[2 * l]);
378+ iu1 = vdupq_n_f32 (v1[2 * l + 1 ]);
379+ rn1 = vfmaq_f32 (rn1, rs[l], ru1);
380+ in1 = vfmaq_f32 (in1, rs[l], iu1);
381+ rn1 = vfmsq_f32 (rn1, is[l], iu1);
382+ in1 = vfmaq_f32 (in1, is[l], ru1);
383+ }
384+
385+ vst1q_f32 (p0 + xss[k], rn0);
386+ vst1q_f32 (p0 + xss[k] + 4 , in0);
387+ vst1q_f32 (p0 + xss[k + 1 ], rn1);
388+ vst1q_f32 (p0 + xss[k + 1 ] + 4 , in1);
389+ }
390+
391+ for (; k < hsize; ++k) {
392+ const fp_type* vk = v + 2 * k * hsize;
393+
394+ float32x4_t ru = vdupq_n_f32 (vk[0 ]);
395+ float32x4_t iu = vdupq_n_f32 (vk[1 ]);
354396 float32x4_t rn = vmulq_f32 (rs[0 ], ru);
355397 float32x4_t in = vmulq_f32 (rs[0 ], iu);
356398 rn = vfmsq_f32 (rn, is[0 ], iu);
357399 in = vfmaq_f32 (in, is[0 ], ru);
358400
359- j += 2 ;
360-
361401 for (unsigned l = 1 ; l < hsize; ++l) {
362- ru = vdupq_n_f32 (v[j ]);
363- iu = vdupq_n_f32 (v[j + 1 ]);
402+ ru = vdupq_n_f32 (vk[ 2 * l ]);
403+ iu = vdupq_n_f32 (vk[ 2 * l + 1 ]);
364404 rn = vfmaq_f32 (rn, rs[l], ru);
365405 in = vfmaq_f32 (in, rs[l], iu);
366406 rn = vfmsq_f32 (rn, is[l], iu);
367407 in = vfmaq_f32 (in, is[l], ru);
368-
369- j += 2 ;
370408 }
371409
372410 vst1q_f32 (p0 + xss[k], rn);
0 commit comments