Skip to content

Commit d005491

Browse files
author
XIN XIE
committed
Unroll NEON ApplyGateH output rows
1 parent e2211df commit d005491

1 file changed

Lines changed: 48 additions & 10 deletions

File tree

lib/simulator_neon.h

Lines changed: 48 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -346,27 +346,65 @@ class SimulatorNEON final : public SimulatorBase {
346346
is[k] = vld1q_f32(p0 + xss[k] + 4);
347347
}
348348

349-
uint64_t j = 0;
349+
unsigned k = 0;
350+
351+
for (; k + 1 < hsize; k += 2) {
352+
const fp_type* v0 = v + 2 * k * hsize;
353+
const fp_type* v1 = v0 + 2 * hsize;
354+
355+
float32x4_t ru0 = vdupq_n_f32(v0[0]);
356+
float32x4_t iu0 = vdupq_n_f32(v0[1]);
357+
float32x4_t rn0 = vmulq_f32(rs[0], ru0);
358+
float32x4_t in0 = vmulq_f32(rs[0], iu0);
359+
rn0 = vfmsq_f32(rn0, is[0], iu0);
360+
in0 = vfmaq_f32(in0, is[0], ru0);
361+
362+
float32x4_t ru1 = vdupq_n_f32(v1[0]);
363+
float32x4_t iu1 = vdupq_n_f32(v1[1]);
364+
float32x4_t rn1 = vmulq_f32(rs[0], ru1);
365+
float32x4_t in1 = vmulq_f32(rs[0], iu1);
366+
rn1 = vfmsq_f32(rn1, is[0], iu1);
367+
in1 = vfmaq_f32(in1, is[0], ru1);
350368

351-
for (unsigned k = 0; k < hsize; ++k) {
352-
float32x4_t ru = vdupq_n_f32(v[j]);
353-
float32x4_t iu = vdupq_n_f32(v[j + 1]);
369+
for (unsigned l = 1; l < hsize; ++l) {
370+
ru0 = vdupq_n_f32(v0[2 * l]);
371+
iu0 = vdupq_n_f32(v0[2 * l + 1]);
372+
rn0 = vfmaq_f32(rn0, rs[l], ru0);
373+
in0 = vfmaq_f32(in0, rs[l], iu0);
374+
rn0 = vfmsq_f32(rn0, is[l], iu0);
375+
in0 = vfmaq_f32(in0, is[l], ru0);
376+
377+
ru1 = vdupq_n_f32(v1[2 * l]);
378+
iu1 = vdupq_n_f32(v1[2 * l + 1]);
379+
rn1 = vfmaq_f32(rn1, rs[l], ru1);
380+
in1 = vfmaq_f32(in1, rs[l], iu1);
381+
rn1 = vfmsq_f32(rn1, is[l], iu1);
382+
in1 = vfmaq_f32(in1, is[l], ru1);
383+
}
384+
385+
vst1q_f32(p0 + xss[k], rn0);
386+
vst1q_f32(p0 + xss[k] + 4, in0);
387+
vst1q_f32(p0 + xss[k + 1], rn1);
388+
vst1q_f32(p0 + xss[k + 1] + 4, in1);
389+
}
390+
391+
for (; k < hsize; ++k) {
392+
const fp_type* vk = v + 2 * k * hsize;
393+
394+
float32x4_t ru = vdupq_n_f32(vk[0]);
395+
float32x4_t iu = vdupq_n_f32(vk[1]);
354396
float32x4_t rn = vmulq_f32(rs[0], ru);
355397
float32x4_t in = vmulq_f32(rs[0], iu);
356398
rn = vfmsq_f32(rn, is[0], iu);
357399
in = vfmaq_f32(in, is[0], ru);
358400

359-
j += 2;
360-
361401
for (unsigned l = 1; l < hsize; ++l) {
362-
ru = vdupq_n_f32(v[j]);
363-
iu = vdupq_n_f32(v[j + 1]);
402+
ru = vdupq_n_f32(vk[2 * l]);
403+
iu = vdupq_n_f32(vk[2 * l + 1]);
364404
rn = vfmaq_f32(rn, rs[l], ru);
365405
in = vfmaq_f32(in, rs[l], iu);
366406
rn = vfmsq_f32(rn, is[l], iu);
367407
in = vfmaq_f32(in, is[l], ru);
368-
369-
j += 2;
370408
}
371409

372410
vst1q_f32(p0 + xss[k], rn);

0 commit comments

Comments
 (0)