主题:[原创]WinXP下CVF和IVF编译出的Lapack库的执行速度比较(附测试程序)
测试环境:
硬件:
Core Duo2400(1.8GHZ)CPU,1G 533内存
软件:
Windows XP SP2, Compaq Visual Fortran6.6A(CVF), Intel Visual Fortran9.1(IVF),Lapack3.1
共编译了三个版本的Lapack库:
1、用CVF编译,主要优化参数为OPTS = /optimize:5 以下简称CVF库
2、用IVF编译,主要优化参数为OPTS = /O3 /QaxN /QxN /Qparallel 以下简称IVF_N库
3、用IVF编译,主要优化参数为OPTS = /O3 /QaxP /QxP /Qparallel 以下简称IVF_P库
上面第一个版本的库没有针对CPU进行优化;第二个版本的库可用于一般的Intel P4处理器;第三个版本的库可用于Intel双核CPU或支持SSE3指令的Intel P4 CPU。
共测试了两个程序,第一个程序是通过SGETRF对方程组分解后,再用SGETRS求解,并检查误差;第二个程序是通过SPBTRF对具有带状分布的稀疏方程组进行分解,然后再用SPBTRS对其求解。程序代码如下:
program sgetrf_test
  ! Benchmark driver for the dense LAPACK solver pair SGETRF / SGETRS.
  !
  ! For every order n read from standard input (a non-positive n or end of
  ! input terminates the program) it:
  !   1. builds a random symmetric n-by-n matrix A and right-hand side b,
  !   2. saves both to the unformatted file 'abc.tem',
  !   3. times the LU factorization done by SGETRF,
  !   4. solves A*x = b with SGETRS,
  !   5. reloads the original A and b and appends the squared residual norm
  !      |A*x - b|**2 to 'abc.chk'.
  !
  ! Single-precision default REAL is used on purpose: the S-prefixed LAPACK
  ! routines take default REAL arguments.
!DEC$ OBJCOMMENT LIB:'blas_WIN32.lib'
!DEC$ OBJCOMMENT LIB:'lapack_WIN32.lib'
!DEC$ OBJCOMMENT LIB:'tmglib_WIN32.lib'
  implicit none
  external sgetrf, sgetrs

  ! Unit numbers are explicit integers; the original left temunit implicitly
  ! typed, which made it a REAL used as an I/O unit.
  integer, parameter :: chkunit = 1, temunit = 2

  character(len=1) :: trans
  integer :: i, m, n, nrhs, lda, ldb, info, ios
  integer :: tick0, tick1, tick_rate, tick_max
  integer, allocatable :: ipiv(:)
  real, allocatable :: a(:,:), b(:), x(:)
  real :: time1, time2, error, r

  open(chkunit, file='abc.chk')

  do
    write(*,*)'input n='
    read(*,*, iostat=ios) n
    if (ios /= 0) exit          ! end of input or unreadable value: stop cleanly
    if (n <= 0) exit

    m = n
    lda = max(1, m)
    open(temunit, file='abc.tem', form='UNFORMATTED')
    allocate(a(n,n), b(n), x(n), ipiv(n))

    ! Random symmetric matrix: fill the strictly lower part of each column,
    ! mirror it into the matching row, then set the diagonal entry.
    do i = 1, n
      call random_number(a(i+1:n, i))
      a(i, i+1:n) = a(i+1:n, i)
      call random_number(r)
      a(i, i) = r*5
    end do
    call random_number(b)
    b = b*5

    ! SGETRF overwrites A with its LU factors and SGETRS overwrites b with
    ! the solution, so keep the originals on disk for the residual check.
    write(temunit) a
    write(temunit) b

    ! cpu_time may accumulate processor time over all threads, so it can
    ! overstate elapsed time on a multithreaded build; system_clock supplies
    ! the wall-clock figure alongside it.
    call system_clock(tick0, tick_rate, tick_max)
    call cpu_time(time1)
    call sgetrf(m, n, a, lda, ipiv, info)
    call cpu_time(time2)
    call system_clock(tick1)
    write(chkunit,*)'n and factorizing time=',n,time2-time1
    write(chkunit,*)'wall-clock factorizing time=', &
                    wall_seconds(tick0, tick1, tick_rate, tick_max)

    if (info /= 0) then
      ! info > 0: U is exactly singular, the solve would divide by zero.
      ! info < 0: an argument was rejected.
      write(chkunit,*)'SGETRF failed, info=',info
    else
      trans = 'N'
      nrhs = 1
      ldb = lda
      call sgetrs(trans, n, nrhs, a, lda, ipiv, b, ldb, info)
      if (info /= 0) then
        write(chkunit,*)'SGETRS failed, info=',info
      else
        x = b                       ! SGETRS returned the solution in b
        rewind(temunit)
        read(temunit) a             ! restore the original matrix ...
        read(temunit) b             ! ... and right-hand side
        b = matmul(a, x) - b        ! residual vector
        error = dot_product(b, b)   ! squared 2-norm of the residual
        write(chkunit,*)'error=',error
      end if
    end if

    deallocate(a, b, x, ipiv)
    close(temunit)
  end do

  close(chkunit)

contains

  ! Convert two system_clock readings into elapsed seconds, allowing for one
  ! wrap-around of the counter. Returns -1.0 when no clock is available.
  real function wall_seconds(t0, t1, rate, tmax)
    integer, intent(in) :: t0, t1, rate, tmax
    integer :: ticks
    ticks = t1 - t0
    if (ticks < 0) ticks = ticks + tmax + 1
    if (rate > 0) then
      wall_seconds = real(ticks)/real(rate)
    else
      wall_seconds = -1.0
    end if
  end function wall_seconds

end program sgetrf_test
---------------------------------------------------
program SPBTRF_test
  ! Benchmark driver for the banded symmetric LAPACK solver pair
  ! SPBTRF / SPBTRS.
  !
  ! For every order n read from standard input (a non-positive n or end of
  ! input terminates the program) it:
  !   1. builds a symmetric band matrix in lower band storage (uplo = 'L')
  !      with kd = n/10 + 2 sub-diagonals and a random right-hand side,
  !   2. times the matrix generation and the factorization done by SPBTRF,
  !   3. solves the system with SPBTRS,
  ! and appends the timings to 'abc.chk'.
  !
  ! Single-precision default REAL is used on purpose: the S-prefixed LAPACK
  ! routines take default REAL arguments.
!DEC$ OBJCOMMENT LIB:'blas_WIN32.lib'
!DEC$ OBJCOMMENT LIB:'lapack_WIN32.lib'
!DEC$ OBJCOMMENT LIB:'tmglib_WIN32.lib'
  implicit none
  external spbtrf, spbtrs

  integer, parameter :: chkunit = 1

  character(len=1) :: uplo
  integer :: i, j, n, kd, ldab, nrhs, ldb, info, ios
  integer :: tick0, tick1, tick_rate, tick_max
  real, allocatable :: a(:,:), b(:)
  real :: time1, time2, r

  open(chkunit, file='abc.chk')

  do
    write(*,*)'input n='
    read(*,*, iostat=ios) n
    if (ios /= 0) exit          ! end of input or unreadable value: stop cleanly
    if (n <= 0) exit

    kd = n/10 + 2               ! number of sub-diagonals (half bandwidth)
    ldab = kd + 1               ! rows of the band-storage array
    allocate(a(ldab,n), b(n))
    uplo = 'L'

    ! Lower band storage: row 1 holds the diagonal, rows 2..ldab hold the
    ! sub-diagonals of each column. The diagonal entry is set to the sum of
    ! the stored sub-diagonal entries of the same column.
    call cpu_time(time1)
    do j = 1, n
      do i = 2, ldab
        a(i,j) = i + j
      end do
      a(1,j) = sum(a(2:ldab,j))
      call random_number(r)
      b(j) = r*5
    end do
    call cpu_time(time2)
    write(chkunit,*)'generation matrix time=',time2-time1

    ! cpu_time may accumulate processor time over all threads, so it can
    ! overstate elapsed time on a multithreaded build; system_clock supplies
    ! the wall-clock figure alongside it.
    call system_clock(tick0, tick_rate, tick_max)
    call cpu_time(time1)
    call spbtrf(uplo, n, kd, a, ldab, info)
    call cpu_time(time2)
    call system_clock(tick1)
    write(chkunit,*)'n,maxband and factorizing time=',n,ldab,time2-time1
    write(chkunit,*)'wall-clock factorizing time=', &
                    wall_seconds(tick0, tick1, tick_rate, tick_max)

    if (info /= 0) then
      ! info > 0: a leading minor is not positive definite, so the
      ! factorization stopped early and the time above is not meaningful.
      ! info < 0: an argument was rejected.
      write(chkunit,*)'SPBTRF failed, info=',info
    else
      nrhs = 1
      ldb = n
      call spbtrs(uplo, n, kd, nrhs, a, ldab, b, ldb, info)
      if (info /= 0) write(chkunit,*)'SPBTRS failed, info=',info
    end if

    deallocate(a, b)
  end do

  close(chkunit)

contains

  ! Convert two system_clock readings into elapsed seconds, allowing for one
  ! wrap-around of the counter. Returns -1.0 when no clock is available.
  real function wall_seconds(t0, t1, rate, tmax)
    integer, intent(in) :: t0, t1, rate, tmax
    integer :: ticks
    ticks = t1 - t0
    if (ticks < 0) ticks = ticks + tmax + 1
    if (rate > 0) then
      wall_seconds = real(ticks)/real(rate)
    else
      wall_seconds = -1.0
    end if
  end function wall_seconds

end program SPBTRF_test
测试结果如下
SGETRF:n为方程阶数,其后为计算时间,error后为计算误差
CVF
n and factorizing time= 1000 0.5625000
error= 1.5267158E-04
n and factorizing time= 2000 4.375000
error= 0.1836250
n and factorizing time= 3000 14.56250
error= 1.9454738E-02
n and factorizing time= 4000 34.32812
error= 5.8565985E-02
n and factorizing time= 5000 67.81250
error= 6.5416552E-02
IVF_N
n and factorizing time= 1000 0.2812500
error= 2.3405529E-04
n and factorizing time= 2000 2.234375
error= 4.2834733E-02
n and factorizing time= 3000 7.484375
error= 4.1005041E-02
n and factorizing time= 4000 17.43750
error= 3.5797328E-02
n and factorizing time= 5000 36.84375
error= 0.8729380
IVF_P
n and factorizing time= 1000 0.3125000
error= 2.3405529E-04
n and factorizing time= 2000 2.500000
error= 4.2834733E-02
n and factorizing time= 3000 8.109375
error= 4.1005041E-02
n and factorizing time= 4000 19.00000
error= 3.5797328E-02
n and factorizing time= 5000 40.95312
error= 0.8729380
SPBTRF n为方程阶数,maxband为方程半带宽,即第一个非0元素到主对角线之间的元素个数,其后为计算时间。
CVF
generation matrix time= 6.2500000E-02
n,maxband and factorizing time= 10000 1003 7.687500
generation matrix time= 0.1718750
n,maxband and factorizing time= 20000 2003 60.01562
generation matrix time= 0.4218750
n,maxband and factorizing time= 30000 3003 200.8750
IVF_N
generation matrix time= 4.6875000E-02
n,maxband and factorizing time= 10000 1003 4.375000
generation matrix time= 0.1875000
n,maxband and factorizing time= 20000 2003 33.40625
generation matrix time= 0.4375000
n,maxband and factorizing time= 30000 3003 109.9375
IVF_P
generation matrix time= 4.6875000E-02
n,maxband and factorizing time= 10000 1003 6.390625
generation matrix time= 0.1875000
n,maxband and factorizing time= 20000 2003 49.01562
generation matrix time= 0.4375000
n,maxband and factorizing time= 30000 3003 164.2344
结论:
1、从两个程序在调用三个不同编译环境得到的Lapack的情况来看,IVF所得的库效率最高,CVF相对差点儿;
2、从IVF的两个库的运行情况来看,IVF_N所得的效率较IVF_P所得的效率要高。所不同的是,我发现在运行IVF_P编译的Lapack库时CPU是以100%的速度运行,即两个CPU都在运行;而当运行IVF_N编译的Lapack库时CPU是以50%的速度运行,即只有一个CPU在运行。
讨论:
1、为什么两个CPU运行的效率还不如一个CPU运行的效率?
2、一般我在用IVF编译程序时,即使用了/QaxP /QxP等优化参数后,运行程序时CPU都还是以50%即一个核心在运行。而用这个优化参数编译Lapack库后运行的程序是两个核心都在运行。难道Lapack里运用了什么技术?
需要说明的是以上测试时没有使用虚拟内存,因此,不存在读写硬盘所用的时间。