// ---------------------------------------------------------------------------
// AArch64 assembly: SVE / SME feature probes and peak-throughput kernels.
//
// Syntax: one instruction per line, `//` comments.
// Exported symbols carry a leading underscore (Mach-O style C name mangling;
// presumably built for Apple platforms -- confirm against the build setup).
//
// ABI notes (AAPCS64):
//   * d8-d15 (low 64 bits of v8-v15) are callee-saved.
//   * Changing PSTATE.SM (SMSTART / SMSTOP) zeroes Z0-Z31 and P0-P15, and
//     therefore also V0-V31.  Every routine that enters streaming mode must
//     save d8-d15 before SMSTART and restore them after SMSTOP.
//   * All pushes are 16 bytes, so sp stays 16-byte aligned.
// ---------------------------------------------------------------------------

    .text

// ---------------------------------------------------------------------------
// _sve_support
// Executes one SVE predicate setup and one predicated SVE FMLA without
// entering streaming mode.
// NOTE(review): presumably used as a probe that traps when non-streaming SVE
// is unavailable -- confirm how the caller handles the fault.
// In:    none
// Out:   none
// Clobb: z0, p0
// ---------------------------------------------------------------------------
    .global _sve_support
    .align 4
_sve_support:
    ptrue   p0.b                            // all-true predicate
    fmla    z0.s, p0/m, z30.s, z31.s
    ret

// ---------------------------------------------------------------------------
// _sve_streaming_support
// Executes the same SVE FMLA as _sve_support, but between SMSTART and SMSTOP.
// In:    none
// Out:   none
// Clobb: all Z/V and P registers (zeroed by the mode switches), except that
//        d8-d15 are preserved; ZA is enabled and disabled again.
// Stack: 64 bytes
// ---------------------------------------------------------------------------
    .global _sve_streaming_support
    .align 4
_sve_streaming_support:
    // SMSTART/SMSTOP zero the vector registers: keep callee-saved d8-d15.
    stp     d8, d9, [sp, #-16]!
    stp     d10, d11, [sp, #-16]!
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!

    smstart
    ptrue   p0.b
    fmla    z0.s, p0/m, z30.s, z31.s
    smstop

    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d10, d11, [sp], #16
    ldp     d8, d9, [sp], #16
    ret

// ---------------------------------------------------------------------------
// _sme_support
// Executes one SME outer-product instruction (FMOPA into tile za0.s) between
// SMSTART and SMSTOP.
// In:    none
// Out:   none
// Clobb: all Z/V and P registers (zeroed by the mode switches), except that
//        d8-d15 are preserved; ZA is enabled and disabled again.
// Stack: 64 bytes
// ---------------------------------------------------------------------------
    .global _sme_support
    .align 4
_sme_support:
    // SMSTART/SMSTOP zero the vector registers: keep callee-saved d8-d15.
    stp     d8, d9, [sp, #-16]!
    stp     d10, d11, [sp, #-16]!
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!

    smstart
    ptrue   p0.b
    ptrue   p1.b
    fmopa   za0.s, p0/m, p1/m, z0.s, z1.s
    smstop

    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d10, d11, [sp], #16
    ldp     d8, d9, [sp], #16
    ret

// ---------------------------------------------------------------------------
// _sve_streaming_vlength
// In streaming mode, loads one full Z register from [x0] and stores it to
// [x1], i.e. copies exactly one streaming-vector-length worth of bytes.
// NOTE(review): presumably the caller inspects how many bytes of the
// destination changed to derive the streaming vector length -- confirm.
// In:    x0 = source address, x1 = destination address
// Out:   memory at [x1]
// Clobb: all Z/V and P registers (zeroed by the mode switches), except that
//        d8-d15 are preserved; ZA is enabled and disabled again.
// Stack: 64 bytes
// ---------------------------------------------------------------------------
    .global _sve_streaming_vlength
    .align 4
_sve_streaming_vlength:
    // SMSTART/SMSTOP zero the vector registers: keep callee-saved d8-d15.
    stp     d8, d9, [sp, #-16]!
    stp     d10, d11, [sp, #-16]!
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!

    smstart
    ldr     z0, [x0]
    str     z0, [x1]
    smstop

    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d10, d11, [sp], #16
    ldp     d8, d9, [sp], #16
    ret

// ---------------------------------------------------------------------------
// _peak_neon_fmla
// Runs x0 iterations of 30 independent 4-lane single-precision NEON FMLAs
// (accumulators v0-v29, operands v30/v31, all zeroed first).
// In:    x0 = iteration count (0 skips the loop)
// Out:   x0 = 30*8 = 240: 30 FMLAs x 4 lanes x 2 (multiply + add), i.e. the
//        floating-point operations issued per loop iteration
// Clobb: v0-v7, v16-v31, x0 (d8-d15 are saved and restored)
// Stack: 64 bytes
// ---------------------------------------------------------------------------
    .global _peak_neon_fmla
    .align 4
_peak_neon_fmla:
    // v8-v15 are written below; their low halves are callee-saved.
    stp     d8, d9, [sp, #-16]!
    stp     d10, d11, [sp, #-16]!
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!

    // Zero every vector register so the FMLAs operate on well-defined values.
    eor     v0.16b, v0.16b, v0.16b
    eor     v1.16b, v1.16b, v1.16b
    eor     v2.16b, v2.16b, v2.16b
    eor     v3.16b, v3.16b, v3.16b
    eor     v4.16b, v4.16b, v4.16b
    eor     v5.16b, v5.16b, v5.16b
    eor     v6.16b, v6.16b, v6.16b
    eor     v7.16b, v7.16b, v7.16b
    eor     v8.16b, v8.16b, v8.16b
    eor     v9.16b, v9.16b, v9.16b
    eor     v10.16b, v10.16b, v10.16b
    eor     v11.16b, v11.16b, v11.16b
    eor     v12.16b, v12.16b, v12.16b
    eor     v13.16b, v13.16b, v13.16b
    eor     v14.16b, v14.16b, v14.16b
    eor     v15.16b, v15.16b, v15.16b
    eor     v16.16b, v16.16b, v16.16b
    eor     v17.16b, v17.16b, v17.16b
    eor     v18.16b, v18.16b, v18.16b
    eor     v19.16b, v19.16b, v19.16b
    eor     v20.16b, v20.16b, v20.16b
    eor     v21.16b, v21.16b, v21.16b
    eor     v22.16b, v22.16b, v22.16b
    eor     v23.16b, v23.16b, v23.16b
    eor     v24.16b, v24.16b, v24.16b
    eor     v25.16b, v25.16b, v25.16b
    eor     v26.16b, v26.16b, v26.16b
    eor     v27.16b, v27.16b, v27.16b
    eor     v28.16b, v28.16b, v28.16b
    eor     v29.16b, v29.16b, v29.16b
    eor     v30.16b, v30.16b, v30.16b
    eor     v31.16b, v31.16b, v31.16b

    // The loop decrements before testing, so a count of 0 would wrap around
    // to 2^64-1 iterations; skip the loop entirely in that case.
    cbz     x0, done_peak_neon_fmla

loop_peak_neon_fmla:
    sub     x0, x0, #1
    fmla    v0.4s, v30.4s, v31.4s
    fmla    v1.4s, v30.4s, v31.4s
    fmla    v2.4s, v30.4s, v31.4s
    fmla    v3.4s, v30.4s, v31.4s
    fmla    v4.4s, v30.4s, v31.4s
    fmla    v5.4s, v30.4s, v31.4s
    fmla    v6.4s, v30.4s, v31.4s
    fmla    v7.4s, v30.4s, v31.4s
    fmla    v8.4s, v30.4s, v31.4s
    fmla    v9.4s, v30.4s, v31.4s
    fmla    v10.4s, v30.4s, v31.4s
    fmla    v11.4s, v30.4s, v31.4s
    fmla    v12.4s, v30.4s, v31.4s
    fmla    v13.4s, v30.4s, v31.4s
    fmla    v14.4s, v30.4s, v31.4s
    fmla    v15.4s, v30.4s, v31.4s
    fmla    v16.4s, v30.4s, v31.4s
    fmla    v17.4s, v30.4s, v31.4s
    fmla    v18.4s, v30.4s, v31.4s
    fmla    v19.4s, v30.4s, v31.4s
    fmla    v20.4s, v30.4s, v31.4s
    fmla    v21.4s, v30.4s, v31.4s
    fmla    v22.4s, v30.4s, v31.4s
    fmla    v23.4s, v30.4s, v31.4s
    fmla    v24.4s, v30.4s, v31.4s
    fmla    v25.4s, v30.4s, v31.4s
    fmla    v26.4s, v30.4s, v31.4s
    fmla    v27.4s, v30.4s, v31.4s
    fmla    v28.4s, v30.4s, v31.4s
    fmla    v29.4s, v30.4s, v31.4s
    cbnz    x0, loop_peak_neon_fmla

done_peak_neon_fmla:
    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d10, d11, [sp], #16
    ldp     d8, d9, [sp], #16

    mov     x0, 30*8                        // operations per loop iteration
    ret

// ---------------------------------------------------------------------------
// _peak_sve_fmla_streaming
// Runs x0 iterations of 30 independent predicated single-precision SVE FMLAs
// (accumulators z0-z29, operands z30/z31) in streaming mode.
// In:    x0 = iteration count (0 skips the loop)
// Out:   x0 = 30*32 = 960
//        NOTE(review): 32 operations per FMLA corresponds to 16 float lanes
//        x 2, i.e. this presumably assumes a 512-bit streaming vector
//        length -- confirm for the target hardware.
// Clobb: all Z/V and P registers (zeroed by the mode switches), x0;
//        d8-d15 are saved and restored; ZA is enabled and disabled again.
// Stack: 64 bytes
// ---------------------------------------------------------------------------
    .global _peak_sve_fmla_streaming
    .align 4
_peak_sve_fmla_streaming:
    stp     d8, d9, [sp, #-16]!
    stp     d10, d11, [sp, #-16]!
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!

    smstart
    ptrue   p0.b

    // Count 0 would otherwise wrap to 2^64-1 iterations; still fall through
    // SMSTOP so the mode switch stays paired.
    cbz     x0, done_peak_sve_fmla_streaming

loop_peak_sve_fmla_streaming:
    sub     x0, x0, #1
    fmla    z0.s, p0/m, z30.s, z31.s
    fmla    z1.s, p0/m, z30.s, z31.s
    fmla    z2.s, p0/m, z30.s, z31.s
    fmla    z3.s, p0/m, z30.s, z31.s
    fmla    z4.s, p0/m, z30.s, z31.s
    fmla    z5.s, p0/m, z30.s, z31.s
    fmla    z6.s, p0/m, z30.s, z31.s
    fmla    z7.s, p0/m, z30.s, z31.s
    fmla    z8.s, p0/m, z30.s, z31.s
    fmla    z9.s, p0/m, z30.s, z31.s
    fmla    z10.s, p0/m, z30.s, z31.s
    fmla    z11.s, p0/m, z30.s, z31.s
    fmla    z12.s, p0/m, z30.s, z31.s
    fmla    z13.s, p0/m, z30.s, z31.s
    fmla    z14.s, p0/m, z30.s, z31.s
    fmla    z15.s, p0/m, z30.s, z31.s
    fmla    z16.s, p0/m, z30.s, z31.s
    fmla    z17.s, p0/m, z30.s, z31.s
    fmla    z18.s, p0/m, z30.s, z31.s
    fmla    z19.s, p0/m, z30.s, z31.s
    fmla    z20.s, p0/m, z30.s, z31.s
    fmla    z21.s, p0/m, z30.s, z31.s
    fmla    z22.s, p0/m, z30.s, z31.s
    fmla    z23.s, p0/m, z30.s, z31.s
    fmla    z24.s, p0/m, z30.s, z31.s
    fmla    z25.s, p0/m, z30.s, z31.s
    fmla    z26.s, p0/m, z30.s, z31.s
    fmla    z27.s, p0/m, z30.s, z31.s
    fmla    z28.s, p0/m, z30.s, z31.s
    fmla    z29.s, p0/m, z30.s, z31.s
    cbnz    x0, loop_peak_sve_fmla_streaming

done_peak_sve_fmla_streaming:
    smstop

    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d10, d11, [sp], #16
    ldp     d8, d9, [sp], #16

    mov     x0, 30*32                       // operations per loop iteration
    ret

// ---------------------------------------------------------------------------
// _peak_sme_fmopa
// Runs x0 iterations of 32 single-precision FMOPA outer products, cycling
// through tiles za0.s-za3.s with source pairs (z0,z1) ... (z30,z31).
// In:    x0 = iteration count (0 skips the loop)
// Out:   x0 = 32*512 = 16384
//        NOTE(review): 512 operations per FMOPA corresponds to a 16x16 tile
//        x 2, i.e. this presumably assumes a 512-bit streaming vector
//        length -- confirm for the target hardware.
// Clobb: all Z/V and P registers (zeroed by the mode switches), ZA, x0;
//        d8-d15 are saved and restored.
// Stack: 64 bytes
// ---------------------------------------------------------------------------
    .global _peak_sme_fmopa
    .align 4
_peak_sme_fmopa:
    stp     d8, d9, [sp, #-16]!
    stp     d10, d11, [sp, #-16]!
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!

    smstart
    ptrue   p0.b
    ptrue   p1.b

    // Count 0 would otherwise wrap to 2^64-1 iterations; still fall through
    // SMSTOP so the mode switch stays paired.
    cbz     x0, done_peak_sme_fmopa

loop_peak_sme_fmopa:
    sub     x0, x0, #1

    // First pass over the four tiles and sixteen source pairs.
    fmopa   za0.s, p0/m, p1/m, z0.s, z1.s
    fmopa   za1.s, p0/m, p1/m, z2.s, z3.s
    fmopa   za2.s, p0/m, p1/m, z4.s, z5.s
    fmopa   za3.s, p0/m, p1/m, z6.s, z7.s
    fmopa   za0.s, p0/m, p1/m, z8.s, z9.s
    fmopa   za1.s, p0/m, p1/m, z10.s, z11.s
    fmopa   za2.s, p0/m, p1/m, z12.s, z13.s
    fmopa   za3.s, p0/m, p1/m, z14.s, z15.s
    fmopa   za0.s, p0/m, p1/m, z16.s, z17.s
    fmopa   za1.s, p0/m, p1/m, z18.s, z19.s
    fmopa   za2.s, p0/m, p1/m, z20.s, z21.s
    fmopa   za3.s, p0/m, p1/m, z22.s, z23.s
    fmopa   za0.s, p0/m, p1/m, z24.s, z25.s
    fmopa   za1.s, p0/m, p1/m, z26.s, z27.s
    fmopa   za2.s, p0/m, p1/m, z28.s, z29.s
    fmopa   za3.s, p0/m, p1/m, z30.s, z31.s

    // Second, identical pass (32 FMOPAs per loop iteration in total).
    fmopa   za0.s, p0/m, p1/m, z0.s, z1.s
    fmopa   za1.s, p0/m, p1/m, z2.s, z3.s
    fmopa   za2.s, p0/m, p1/m, z4.s, z5.s
    fmopa   za3.s, p0/m, p1/m, z6.s, z7.s
    fmopa   za0.s, p0/m, p1/m, z8.s, z9.s
    fmopa   za1.s, p0/m, p1/m, z10.s, z11.s
    fmopa   za2.s, p0/m, p1/m, z12.s, z13.s
    fmopa   za3.s, p0/m, p1/m, z14.s, z15.s
    fmopa   za0.s, p0/m, p1/m, z16.s, z17.s
    fmopa   za1.s, p0/m, p1/m, z18.s, z19.s
    fmopa   za2.s, p0/m, p1/m, z20.s, z21.s
    fmopa   za3.s, p0/m, p1/m, z22.s, z23.s
    fmopa   za0.s, p0/m, p1/m, z24.s, z25.s
    fmopa   za1.s, p0/m, p1/m, z26.s, z27.s
    fmopa   za2.s, p0/m, p1/m, z28.s, z29.s
    fmopa   za3.s, p0/m, p1/m, z30.s, z31.s

    cbnz    x0, loop_peak_sme_fmopa

done_peak_sme_fmopa:
    smstop

    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d10, d11, [sp], #16
    ldp     d8, d9, [sp], #16

    mov     x0, 32*512                      // operations per loop iteration
    ret

// ---------------------------------------------------------------------------
// _example_sme_fmopa
// Loads one Z vector from [x0] and one from [x1], accumulates their
// single-precision outer product into tile za0.s (ZA is zeroed when SMSTART
// enables it), then stores 16 ZA array vectors to [x2].
// In:    x0 = address of first input vector
//        x1 = address of second input vector
//        x2 = output address; 16 slices are written, 16*4 = 64 bytes apart
// Out:   memory at [x2]
// NOTE(review): the fixed count of 16 slices and the 64-byte stride
// presumably assume a 512-bit streaming vector length (16 floats per slice);
// each `str za[...]` writes one full vector length regardless -- confirm.
// Clobb: x2, x3, w12, all Z/V and P registers (zeroed by the mode switches),
//        ZA; d8-d15 are preserved.
// Stack: 64 bytes
// ---------------------------------------------------------------------------
    .global _example_sme_fmopa
    .align 4
_example_sme_fmopa:
    // SMSTART/SMSTOP zero the vector registers: keep callee-saved d8-d15.
    stp     d8, d9, [sp, #-16]!
    stp     d10, d11, [sp, #-16]!
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!

    smstart
    ptrue   p0.b

    ldr     z0, [x0]
    ldr     z1, [x1]
    fmopa   za0.s, p0/m, p0/m, z0.s, z1.s

    mov     w12, #0                         // ZA array vector index
    mov     x3, #16                         // slices left to store

loop_example_sme_fmopa:
    str     za[w12, #0], [x2]
    add     w12, w12, #4                    // za0.s rows are every 4th ZA array vector
    add     x2, x2, #16*4                   // next output row (64 bytes)
    sub     x3, x3, #1
    cbnz    x3, loop_example_sme_fmopa

    smstop

    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d10, d11, [sp], #16
    ldp     d8, d9, [sp], #16
    ret