资源描述
,单击此处编辑母版文本样式,第二级,第三级,第四级,第五级,*,单击此处编辑母版标题样式,论程序底层优化的一些方法与技巧,成都七中 骆可强,T(n,)=,O(,f(n,),),效率,时间效率,=+,空间效率,算法!,T(n,),f(n,),C,C1C2,算法,底层优化,?,/,最初版程序,int,get_max(,int,*a,int,l),int,mx=,0,i;,for,(i=,0,;imx)mx=ai;,return,mx;,/,第一次优化,int,get_max(,int,*a,int,l),int,mx=,0,*ed=a+l;,while,(a!=ed),if,(*amx)mx=*a;,a+;,return,mx;,/,第二次优化,int,get_max(,int,*a,int,l),assert(l%,8,=,0,);,#define D(x)mx#x=0,int,D(,0,),D(,1,),D(,2,),D(,3,),D(,4,),D(,5,),D(,6,),D(,7,),*ed=a+l;,#define CMP(x)if(*(a+x)mx#x)mx#x=*(a+x);,while,(a!=ed),CMP(,0,);CMP(,1,);,CMP(,2,);CMP(,3,);,CMP(,4,);CMP(,5,);,CMP(,6,);CMP(,7,);,a+=,8,;,#define CC(x1,x2)if(mx#x1mx#x2)mx#x2=mx#x1;,CC(,1,0,);CC(,3,2,);,CC(,5,4,);CC(,7,6,);,CC(,2,0,);CC(,6,4,);,CC(,4,0,);,return,mx0;,/,第三次优化,int,get_max(,int,*a,int,l),int,ret;,_asm_ _volatile_(,movl$0,%eaxnt,.p2align 4,15n,LP1:nt,cmpl -4(%1,%2,4),%eaxnt,jge EDnt,movl -4(%1,%2,4),%eaxn,ED:nt,/loop LP1nt,decl%2nt,jnz LP1nt,movl%eax,%0nt,:,=m,(ret),:,r,(a),c,(l),:,%eax,);,return,ret;,/,第四个优化,int,get_max(,int,*a,int,l),assert(l%,2,=,0,);,int,ret;,_asm_ _volatile_(,movl$0,%eaxnt,movl$0,%edxnt,.p2align 4,15n,LP2:nt,cmpl (%1),%eaxnt,jge ED2nt,movl (%1),%eaxn,ED2:nt,cmpl 4(%1),%edxnt,jge ED3nt,movl 4(%1),%edxn,ED3:nt,addl$8,%1nt,subl$2,%2nt,jnz LP2nt,cmpl%edx,%eaxnt,cmovll%edx,%eaxnt,movl%eax,%0nt,:,=m,(ret),:,r,(a),r,(l),:,%eax,%edx,);,return,ret;,/,第五次优化,int,get_max(,int,*a,int,l),assert(l%,4,=,0,);,assert(sse2);,int,ret,tmp,4,;,_asm_ _volatile_(,txorps%xmm0,%xmm0n,LP3:n,tmovdqa%xmm0,%xmm1n,tpcmpgtd (%1),%xmm1n,tandps%xmm1,%xmm0n,tandnps (%1),%xmm1n,torps%xmm1,%xmm0n,taddl$16,%1n,tsubl$4,%2n,tjnz LP3n,tmovdqu%xmm0,(%3)n,tmovl (%3),%eaxn,tcmpl 4(%3),%eaxn,tcmovll 4(%3),%eaxn,tcmpl 8(%3),%eaxn,tcmovll 8(%3),%eaxn,tcmpl 12(%3),%eaxn,tcmovll 12(%3),%eaxn,tmovl%eax,%0n,:,=m,(ret),:,r,(a),r,(l),r,(tmp),:,%eax,);,return,ret;,/,第六次优化,int,get_max(,int,*a,int,l),assert(l%,4,=,0,);,assert(sse4);,int,ret,tmp,4,;,_asm_ _volatile_(,txorps%xmm0,%xmm0n,LP4:n,tpmaxsd (%1),%xmm0n,taddl$16,%1n,tsubl$4,%2n,tjnz LP4n,tmovdqu%xmm0,(%3)n,tmovl (%3),%eaxn,tcmpl 4(%3),%eaxn,tcmovll 4(%3),%eaxn,tcmpl 8(%3),%eaxn,tcmovll 8(%3),%eaxn,tcmpl 12(%3),%eaxn,tcmovll 12(%3),%eaxn,tmovl%eax,%0n,:,=m,(ret),:,r,(a),r,(l),r,(tmp),:,%eax,);,return,ret;,一个简单的例子,编号,平均,时钟周期,优化,(%),优化方法,7,.,53,6,.,60,12%,3,.,48,54%,2,.,13,72%,1,.,74,77%,1,.,5,9,80%,?,?,优化寻址,多路求值,内嵌汇编,内嵌汇编,+,多路求值,SIMD,SSE4,:,求最大值,论文中所覆盖的主题,CPU,指令运行的效率表现,CPU,优化特性,数值运算的优化,高维数组的使用,位运算技巧,除法,高精度,乘法,缓存机制,乱序执行,分支预测,位压缩,消除分支,打包统计,寻址,底层表现,浮点,除常数,高维数组访问的底层表现,A,1,1,A,1,2,A,1,2048,A,2,1,int,a,2048,2048,;,int,main(),int,i,x,y;,for,(i=,1,;i=,100,;i+),for,(,x,=,0,;,x,2048,;,x,+),for,(,y,=,0,;,y,2048,;,y,+),axy=x+y;,return,0,;,int,a,2048,2048,;,int,main(),int,i,x,y;,for,(i=,1,;i=,100,;i+),for,(y=0;y2048;y+),for,(x=0;x2048;x+),axy=x+y;,return,0,;,int,a,204,9,204,9,;,int,main(),int,i,x,y;,for,(i=,1,;i=,100,;i+),for,(y=,0,;y,204,9,;y+),for,(x=,0,;x,35,a/=7,除以常数比除以变量更快,在某些场合,我们需要多次除以同一个变量,也可,以,事先计算出这些常数起到优化效果,ai,=i%2,ai,=rand()%2,CPU,的分支预测机制,x=(x=a?b:a),int,abs(,int,x),return,x,0,?x:-x;,int,cmp(,int,a),if,(!a),return,0,;,return,a,0,?,-1,:,1,;,int,t=,0,;,for(int,i=,0,;i31)+(-a31,int,abs(,int,x),int,y=x31;,return,(x+y)y;,x=ab,0,1,0,1,0,1,0,测量结果,1,:,3.210,7,个时钟周期,测量结果,2,:,1,.0210,8,个时钟周期,1,1,1,0,0,1,0,if,ai,else,消除条件分支,在信息学奥赛中的实践,题目:麦森数,(mason),来源:,NOIP2003,算法:朴素的高精度计算,测试情况:,优化:消除除法与条件分支,优化情况:,100,分,题目:瑰丽的华尔兹,(adv1900),来源:,NOI2005,算法:朴素的动态规划,,O(N,4,),测试情况:,优化:优化高维数组寻址,优化情况:,题目:,翻译玛雅著作,(writing),来源:,IOI2006,算法:,O(,字符集,串长,),的枚举,测试情况:,优化:汇编优化循环与数组访问,优化情况:,70,分,100,分(均在,0.5s,内出解),60,分,70,分,100,分,总结,过早的优化是,效率低下的根源,K,eep,I,t,S,imple and,S,tupid,程序的优化是无止境的,谢谢大家欢迎提问,
展开阅读全文