苏军 · 2020年08月18日

请教:关于 ARMv8 指令集在 Cortex-A72 上的运行效率问题

本人在 Cortex-A72 上做一个数组累加:
s += a[i]。测试时数组 a 的大小为 10KB,循环 100000 次,以保证整个过程中的数据都在 L1 cache 中,排除 cache miss 的可能。
结果发现效率始终达不到最优。按理说,数组中每 4 个值需要一条 LD1 指令和一条 ADD 指令,按照 F0/F1 和 L/S 四个执行单元并行的原则,理论上 1 个 cycle 就可以完成,但实际测得需要 1.7 个 cycle。
具体实现如下。第一种:

            "2:\n"
            "subs        x1    , x1,    #1\n"
            "ld1 {v8.4s},[x0],#16\n"
            "ld1 {v9.4s},[x0],#16\n"
            "ld1 {v10.4s},[x0],#16\n"
            "ld1 {v11.4s},[x0],#16\n"
            "ld1 {v12.4s},[x0],#16\n"
            "ld1 {v13.4s},[x0],#16\n"
            "add  v0.4s,v0.4s,v8.4s\n"
            "ld1 {v14.4s},[x0],#16\n"
            "add  v1.4s,v1.4s,v9.4s\n"
            "ld1 {v15.4s},[x0],#16\n"
            "add  v2.4s,v2.4s,v10.4s\n"                
            "ld1 {v16.4s},[x0],#16\n"
            "add  v3.4s,v3.4s,v11.4s\n"
            "ld1 {v17.4s},[x0],#16\n"
            "add  v4.4s,v4.4s,v12.4s\n"
            "ld1 {v18.4s},[x0],#16\n"
            "add  v5.4s,v5.4s,v13.4s\n"
            "ld1 {v19.4s},[x0],#16\n"
            "add  v6.4s,v6.4s,v14.4s\n"
            "ld1 {v20.4s},[x0],#16\n"
            "add  v7.4s,v7.4s,v15.4s\n"
            "ld1 {v21.4s},[x0],#16\n"
            "add  v24.4s,v24.4s,v16.4s\n"
            "ld1 {v22.4s},[x0],#16\n"
            "add  v25.4s,v25.4s,v17.4s\n"
            "ld1 {v23.4s},[x0],#16\n"
            "add  v26.4s,v26.4s,v18.4s\n"
            "add  v27.4s,v27.4s,v19.4s\n"
            "add  v28.4s,v28.4s,v20.4s\n"
            "add  v29.4s,v29.4s,v21.4s\n"
            "add  v30.4s,v30.4s,v22.4s\n"
            "add  v31.4s,v31.4s,v23.4s\n"
            "bne        2b\n"
            
            

第二种:

                "2:\n"
                "subs        x1    , x1,    #1\n"
                "ld1 { v8.4s},[x0],#16\n"
                "ld1 { v9.4s},[x0],#16\n"
                "ld1 {v10.4s},[x0],#16\n"
                "ld1 {v11.4s},[x0],#16\n"
                "ld1 {v12.4s},[x0],#16\n"
                "ld1 {v13.4s},[x0],#16\n"    
                "ld1 {v14.4s},[x0],#16\n"
                "ld1 {v15.4s},[x0],#16\n"
                 "add  v0.4s,v0.4s, v8.4s\n"
                "add  v1.4s,v1.4s, v9.4s\n"
                "add  v2.4s,v2.4s,v10.4s\n"
                "add  v3.4s,v3.4s,v11.4s\n"
                "add  v4.4s,v4.4s,v12.4s\n"
                "add  v5.4s,v5.4s,v13.4s\n"
                "add  v6.4s,v6.4s,v14.4s\n"
                "add  v7.4s,v7.4s,v15.4s\n"
                "bne        2b\n"

这两种实现的实测效率都是 1.7 cycle。
请问无法达到理论值的原因是什么?

你的回答