主题:Fortran在自动并行时出错了,请高手支招,急急...
cxcctv [专家分:240] 发布于 2008-05-01 09:41:00
我想在主函数里二个循环实现自动并行,我用的是IVF9.1,Openmp2.5,我的CPU是Intel (R) Core (TM)2 Duo ,我更改了二处属性:
1. Fortran--optimization--parallelizaion==yes,threshold for auto-parallelization==0
2. language--process openmp direcives==generate parallel code
可是一直出现如下的错误:
1.当在配置管理器的Debug改成win32 时,出现了ifort: Command line warning: ignoring option '/Qopenmp'; no argument required
2 当我把把配置管理器的Debug改成X64时,它又出现了:Intel(R) Fortran Compiler for 'x64 not installed' 可我我用的XP64位系统,这是为何?请高手支招!!
我的主函数为:
program p70
use hanshu !这是一个函数模块
use omp_lib
implicit none
integer::nels,nxe,neq,nn,nr,nip=4,nodof=1,nod=4,ndof=4,loaded_nodes,i,k, &
iel,ndim=2,fixed_nodes
real::det,aa,bb ; character (len=15) :: element = 'quadrilateral'
!----------------------------- dynamic arrays----------------------------------
real,allocatable::kv(:),kvh(:),loads(:),points(:,:),coord(:,:),jac(:,:), &
der(:,:),deriv(:,:),weights(:),kp(:,:),g_coord(:,:), &
value(:),kay(:,:),disps(:),perms(:)
integer,allocatable::nf(:,:),g(:),num(:),g_num(:,:),g_g(:,:),kdiag(:), &
node(:),no(:)
!-------------------------input and initialisation----------------------------
open(10,file='p70.dat',status = 'old', action = 'read')
open(11,file='water result.txt',status='replace', action = 'write')
read (10,*)nels,nxe,aa,bb; nn=(nxe+1)*(nels/nxe+1)
allocate(nf(nodof,nn),points(nip,ndim),g(ndof),g_coord(ndim,nn), &
coord(nod,ndim),jac(ndim,ndim),weights(nip),der(ndim,nod), &
deriv(ndim,nod),kp(ndof,ndof),num(nod),g_num(nod,nels), &
g_g(ndof,nels),kay(ndim,ndim),perms(ndim))
call timestamp
read(10,*)perms
kay=0.0; do i=1,ndim; kay(i,i)=perms(i); end do
read(10,*)nr
nf=1; if(nr>0)read(10,*)(k,nf(:,k),i=1,nr); call formnf(nf); neq=maxval(nf)
allocate (kdiag(neq))
! ----------loop the elements to set up global geometry and kdiag -------------
kdiag=0
elements_1: do iel=1,nels
call geometry_4qx(iel,nxe,aa,bb,coord,num)
g_num(:,iel)=num
g_coord(:,num)=transpose(coord)
call num_to_g (num, nf , g ); g_g(:,iel)=g; call fkdiag(kdiag,g)
end do elements_1
write(11,'(a)')"Global coordinates"
do k = 1 , nn
write(11,'(a,i5,a,3e12.4)')"Node ",k," ",g_coord(:,k); end do
write(11,'(a)')"Global node numbers"
do k=1,nels
write(11,'(a,i5,a,27i3)')"Element ",k," ",g_num(:,k); end do
kdiag(1)=1; do i=2,neq; kdiag(i)=kdiag(i)+kdiag(i-1); end do
write(11,'(2(a,i5),/)') &
"There are ",neq," equations and the skyline storage is ",kdiag(neq)
allocate(kv(kdiag(neq)),kvh(kdiag(neq)),loads(0:neq),disps(0:neq))
kv=0.0; loads=0.0
call sample(element,points,weights)
!------- element conductivity integration and assembly--------------------
elements_2: do iel=1,nels
kp= 0.0
num=g_num(:,iel); coord=transpose(g_coord(:,num)); g=g_g(:,iel)
integrating_pts_1: do i=1,nip
call shape_der(der,points,i); jac=matmul(der,coord)
det=determinant(jac); call invert(jac); deriv=matmul(jac,der)
kp=kp+matmul(matmul(transpose(deriv),kay),deriv)*det*weights(i)
end do integrating_pts_1
call fsparv (kv,kp,g,kdiag)
end do elements_2
kvh=kv
read (10,*)loaded_nodes
if(loaded_nodes/=0)read(10,*)(k,loads(nf(:,k)),i=1,loaded_nodes)
read (10,*)fixed_nodes
if(fixed_nodes/=0)then
allocate(node(fixed_nodes),no(fixed_nodes),value(fixed_nodes))
read(10,*)(node(i),value(i),i=1,fixed_nodes)
do i=1,fixed_nodes; no(i)=nf(1,node(i)); end do
kv(kdiag(no))=kv(kdiag(no))+1.e20; loads(no)=kv(kdiag(no))*value
end if
!--------------------------equation solution-----------------------------------
call sparin(kv,kdiag); call spabac(kv,loads,kdiag)
!------------------------ retrieve flow rate ---------------------------------
call linmul_sky(kvh,loads,disps,kdiag)
write(11,'(a)')"The nodal values are:"
write(11,'(a)')" Potentials Flow rates"
do k=1,nn
write(11,'(i5,a,2f12.2)')k," ",loads(nf(1,k)),disps(nf(1,k)); end do
write(11,'(a)')" Inflow Outflow"
write(11,'(2f12.2)')sum(disps(1:),mask=disps<0.),sum(disps(1:),mask=disps>0.)
call timestamp
end program p70
最后更新于:2008-05-01 09:43:00
回复列表 (共9个回复)
沙发
cxcctv [专家分:240] 发布于 2008-05-01 09:48:00
我的模块函数是:
module hanshu
contains
subroutine formnf(nf)
! reform nf
implicit none
integer,intent(in out)::nf(:,:)
integer:: i,j,m
m=0
do j=1,ubound(nf,2)
do i=1,ubound(nf,1)
if(nf(i,j)/=0) then
m=m+1; nf(i,j)=m
end if
end do
end do
return
end subroutine formnf
subroutine geometry_4qx(iel,nxe,aa,bb,coord,num)
! coordinates and nodal vectors for equal four node quad
! elements, numbering in x
implicit none
integer,intent(in)::iel,nxe; real,intent(in)::aa,bb
real,intent(out)::coord(:,:); integer,intent(out)::num(:)
integer :: ip,iq ; iq=(iel-1)/nxe+1; ip=iel-(iq-1)*nxe
num=(/iq*(nxe+1)+ip,(iq-1)*(nxe+1)+ip, &
(iq-1)*(nxe+1)+ip+1, iq*(nxe+1)+ip+1/)
coord(1:2,1)=(ip-1)*aa; coord(3:4,1)=ip*aa
coord(1:4:3,2)=-iq*bb; coord(2:3,2)=-(iq-1)*bb
return
end subroutine geometry_4qx
subroutine num_to_g(num,nf,g)
!finds the g vector from num and nf
implicit none
integer,intent(in)::num(:),nf(:,:) ; integer,intent(out):: g(:)
integer::i,k,nod,nodof ; nod=ubound(num,1) ; nodof=ubound(nf,1)
do i = 1 , nod
k = i*nodof ; g(k-nodof+1:k) = nf( : , num(i) )
end do
return
end subroutine num_to_g
subroutine fkdiag(kdiag,g)
! finds the maximum bandwidth for each freedom
implicit none
integer,intent(in)::g(:); integer,intent(out)::kdiag(:)
integer::idof,i,iwp1,j,im,k,x
idof=size(g)
do i = 1,idof
iwp1=1
if(g(i)/=0) then
do j=1,idof
if(g(j)/=0) then
im=g(i)-g(j)+1
if(im>iwp1) iwp1=im
end if
end do
k=g(i); if(iwp1>kdiag(k))kdiag(k)=iwp1
end if
end do
return
end subroutine fkdiag
subroutine sample(element,ps,ws)
implicit none
character(*),intent(in) ::element
real,intent(out) :: ps(:,:) ,ws(:)
real xi
xi=1./sqrt(3.)
if(element =='quadrilateral') then
ps(1,1)=-xi;ps(1,2)=xi
ps(2,1)=xi ;ps(2,2)=-xi
ps(3,1)=-xi;ps(3,2)=xi
ps(4,1)=xi ;ps(4,2)=-xi
ws=1.0
end if
return
end subroutine sample
subroutine shape_der(der,ps,k)
implicit none
real,intent(out):: der(:,:)
real,intent(in) :: ps(:,:)
integer,intent(in) ::k
real :: x1,x2,x3,x4
x1=0.25*(1.-ps(k,2))
x2=0.25*(1.+ps(k,2))
x3=0.25*(1.-ps(k,1))
x4=0.25*(1.+ps(k,1))
der(1,1)=-x1; der(1,2)=-x2; der(1,3)=x2; der(1,4)=x1
der(2,1)=-x3; der(2,2)=x3; der(2,3)=x4; der(2,4)=-x4
return
end subroutine shape_der
function determinant (jac) result(det)
! returns the determinant of a 1x1 2x2 3x3 jacobian matrix
implicit none ; real :: det
real,intent(in)::jac(:,:); integer:: it ; it = ubound(jac,1)
select case (it)
case (1)
det=1.0
case (2)
det=jac(1,1)*jac(2,2) - jac(1,2) * jac(2,1)
case (3)
det= jac(1,1)*(jac(2,2) * jac(3,3) -jac(3,2) * jac(2,3))
det= det-jac(1,2)*(jac(2,1)*jac(3,3)-jac(3,1)*jac(2,3))
det= det+jac(1,3)*(jac(2,1)*jac(3,2)-jac(3,1)*jac(2,2))
case default
print*,' wrong dimension for jacobian matrix'
end select
return
end function determinant
板凳
cxcctv [专家分:240] 发布于 2008-05-01 09:49:00
接上面的:subroutine invert(matrix)
! invert a small square matrix onto itself
implicit none
real,intent(in out)::matrix(:,:)
integer::i,k,n; real::con ; n= ubound(matrix,1)
do k=1,n
con=matrix(k,k); matrix(k,k)=1.
matrix(k,:)=matrix(k,:)/con
do i=1,n
if(i/=k) then
con=matrix(i,k); matrix(i,k)=0.0
matrix(i,:)=matrix(i,:) - matrix(k,:)*con
end if
end do
end do
return
end subroutine invert
subroutine fsparv(bk,km,g,kdiag)
! assembly 装配of element 单元matrices into 整体刚度矩阵skyline global matrix
implicit none
real,intent(in)::km(:,:); integer,intent(in)::g(:),kdiag(:)
real,intent(out)::bk(:) ; integer::i,idof,k,j,iw,ival
idof=ubound(g,1)
do i=1,idof
k=g(i)
if(k/=0) then
do j=1,idof
if(g(j)/=0) then
iw=k-g(j)
if(iw>=0) then
ival=kdiag(k)-iw
bk(ival)=bk(ival)+km(i,j)
end if
end if
end do
end if
end do
return
end subroutine fsparv
subroutine sparin(a,kdiag)
! Choleski factorisation of variable bandwidth matrix a
! stored as a vector and overwritten
implicit none
real,intent(in out)::a(:);integer,intent(in)::kdiag(:)
integer::n,i,ki,l,kj,j,ll,m,k; real::x
n=ubound(kdiag,1) ; a(1)=sqrt(a(1))
do i=2,n
ki=kdiag(i)-i; l=kdiag(i-1)-ki+1
do j=l,i
x=a(ki+j); kj=kdiag(j)-j
if(j/=1) then
ll=kdiag(j-1)-kj+1; ll=max0(l,ll)
if(ll/=j) then
m=j-1
do k=ll,m ; x=x-a(ki+k)*a(kj+k) ; end do
end if
end if
a(ki+j)=x/a(kj+j)
end do
a(ki+i)=sqrt(x)
end do
return
end subroutine sparin
subroutine spabac(a,b,kdiag)
! Choleski forward and backward substitution combined
! variable bandwidth factorised matrix a stored as a vector
implicit none
real,intent(in)::a(:);real,intent(in out)::b(0:);integer,intent(in)::kdiag(:)
integer::n,i,ki,l,m,j,it,k; real::x
n=ubound(kdiag,1)
b(1)=b(1)/a(1)
do i=2,n
ki=kdiag(i)-i; l=kdiag(i-1)-ki+1 ; x=b(i)
if(l/=i) then
m=i-1
do j=l,m ; x=x-a(ki+j)*b(j); end do
end if
b(i)=x/a(ki+i)
end do
do it=2,n
i=n+2-it; ki=kdiag(i)-i; x=b(i)/a(ki+i); b(i)=x; l=kdiag(i-1)-ki+1
if(l/=i) then
m=i-1
do k=l,m; b(k)=b(k)-x*a(ki+k); end do
end if
end do
b(1)=b(1)/a(1)
return
end subroutine spabac
3 楼
cxcctv [专家分:240] 发布于 2008-05-01 09:49:00
接上:
subroutine linmul_sky(bp,disps,loads,kdiag)
! skyline product of symmetric matrix and a vector
implicit none
real,intent(in)::bp(:),disps(0:);real,intent(out)::loads(0:)
integer,intent(in)::kdiag(:); integer::n,i,j,low,lup,k; real::x
n=ubound(disps,1)
do i = 1 , n
x = .0 ; lup=kdiag(i)
if(i==1)low=lup; if(i/=1)low=kdiag(i-1)+1
do j = low , lup
x = x + bp(j) * disps(i + j - lup)
end do
loads(i) = x
if(i == 1) cycle ; lup = lup - 1
do j = low , lup
k = i + j -lup - 1
loads(k) = loads(k) + bp(j)*disps(i)
end do
end do
return
end subroutine linmul_sky
subroutine timestamp ( )
implicit none
character ( len = 8 ) ampm
integer d
character ( len = 8 ) date
integer h
integer m
integer mm
character ( len = 9 ), parameter, dimension(12) :: month = (/ &
'January ', 'February ', 'March ', 'April ', &
'May ', 'June ', 'July ', 'August ', &
'September', 'October ', 'November ', 'December ' /)
integer n
integer s
character ( len = 10 ) time
integer values(8)
integer y
character ( len = 5 ) zone
call date_and_time ( date, time, zone, values )
y = values(1)
m = values(2)
d = values(3)
h = values(5)
n = values(6)
s = values(7)
mm = values(8)
if ( h < 12 ) then
ampm = 'AM'
else if ( h == 12 ) then
if ( n == 0 .and. s == 0 ) then
ampm = 'Noon'
else
ampm = 'PM'
end if
else
h = h - 12
if ( h < 12 ) then
ampm = 'PM'
else if ( h == 12 ) then
if ( n == 0 .and. s == 0 ) then
ampm = 'Midnight'
else
ampm = 'AM'
end if
end if
end if
write ( *, '(a,1x,i2,1x,i4,2x,i2,a1,i2.2,a1,i2.2,a1,i3.3,1x,a)' ) &
trim ( month(m) ), d, y, h, ':', n, ':', s, '.', mm, trim ( ampm )
return
end subroutine timestamp
4 楼
f2003 [专家分:7960] 发布于 2008-05-01 12:26:00
1,你的程序里没有使用openmp,除了use了一下,其他什么都没有。所以ivf说ignore。
大概是这样吧。
可以做个实验,里面加个简单omp do,看看还报不报。
2,你没有安装ivf的64位版本。
5 楼
cxcctv [专家分:240] 发布于 2008-05-01 15:00:00
好的,我等会把IVF卸掉再装一下,当初我也觉得装得有点问题!
6 楼
cxcctv [专家分:240] 发布于 2008-05-01 17:03:00
f2003,你好,我重装了IVF9.1,能够运行Debug里的X64和win32,而且在BuildLog.htm时的提示是:
------ Build started: Project: water, Configuration: Debug|Win32 ------
Compiling with Intel Fortran 9.1 C:\Program Files (x86)\Intel\Compiler\Fortran\9.1\IA32\...
ifort /nologo /Zi /O3 /QaxT /QxT /Qparallel /Qpar_threshold:0 /assume:buffered_io /Qopenmp /module:"Debug/" /object:"Debug/" /traceback /check:bounds /libs:static /threads /dbglibs /c /Qvc8 /Qlocation,link,"C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" "D:\fortran\water\water\water.F90"
D:\fortran\water\water\water.F90(315) : (col. 2) remark: LOOP WAS AUTO-PARALLELIZED.
D:\fortran\water\water\water.F90(317) : (col. 2) remark: LOOP WAS AUTO-PARALLELIZED.
D:\fortran\water\water\water.F90(317) : (col. 67) remark: LOOP WAS AUTO-PARALLELIZED.
D:\fortran\water\water\water.F90(320) : (col. 2) remark: LOOP WAS AUTO-PARALLELIZED.
D:\fortran\water\water\water.F90(341) : (col. 2) remark: LOOP WAS AUTO-PARALLELIZED.
D:\fortran\water\water\water.F90(341) : (col. 10) remark: LOOP WAS AUTO-PARALLELIZED.
D:\fortran\water\water\water.F90(345) : (col. 10) remark: LOOP WAS AUTO-PARALLELIZED.
D:\fortran\water\water\water.F90(346) : (col. 28) remark: LOOP WAS AUTO-PARALLELIZED.
D:\fortran\water\water\water.F90(348) : (col. 48) remark: LOOP WAS AUTO-PARALLELIZED.
D:\fortran\water\water\water.F90(349) : (col. 60) remark: LOOP WAS AUTO-PARALLELIZED.
D:\fortran\water\water\water.F90(354) : (col. 2) remark: LOOP WAS AUTO-PARALLELIZED.
D:\fortran\water\water\water.F90(315) : (col. 2) remark: LOOP WAS VECTORIZED.
D:\fortran\water\water\water.F90(317) : (col. 2) remark: LOOP WAS VECTORIZED.
D:\fortran\water\water\water.F90(317) : (col. 67) remark: LOOP WAS VECTORIZED.
D:\fortran\water\water\water.F90(345) : (col. 10) remark: LOOP WAS VECTORIZED.
D:\fortran\water\water\water.F90(26) : (col. 4) remark: LOOP WAS AUTO-PARALLELIZED.
D:\fortran\water\water\water.F90(76) : (col. 17) remark: LOOP WAS AUTO-PARALLELIZED.
water build succeeded.
现在有几个问题:
1. 我用omp_get_wtime()测试了一下运行的时间
win32 x64
自动并行选项关闭时: 0.00220 0.00146
自动并行选项开启时 0.00195 0.00146
在工程上,很多应用程序是串行程序,如果我把这些程序改并行的,应该能提高效率,基于这个目的,我编写了这个串行代码,然后试着把它改成并行,可是手动插入并行代码时出现了很多错误,我调试了好长时间,可是并不理想,于是我想到了自动并行,并比较它们的差异,现在这个数据不知道能不能说明问题?并行成功了??
2. 可是照我原来的设计,它自动并行后会在自动在代码里的加上一些语句如:!$omp ...但是没有出现这些语句,不知道我这个设想对不对?
3 :为什么在这些代码的上面会有"Build started: Project: water, Configuration: Debug|Win32",我明明是调成x64,但是出现的是这个,不知道问题出在哪里.
4: 我在重新安装IVF9.1时,在安装说明时提示此软件包含有Intel (R)Extended memory 64 technology,请问这个是不是你所说的IVF64位的版本?
问题有点多啊,麻烦您啊!!
7 楼
f2003 [专家分:7960] 发布于 2008-05-01 22:10:00
4)对。ivf有3个版本,ia32,x64(也叫em64t)和ia64。 后两个是64位的,但ia64是指安腾处理器,平常很少见,与我们的pc机不兼容。所以一般说64位ivf版本就是x64的,其运行需要64位的windows。
2)你理解错了,自动并行不会修改源码的。自动并行是生成并行可执行二进制代码,也并非通过转化成omp程序来生成二进制代码,而是直接通过调用一些底层的线程库。omp是比较高层的并行方法。
程序员应该首选omp,而不是自动并行或者自行调用底层的线程库。
1)自动并行有苛刻的条件,速度平均有个15%-30%的提高。ivf10据说较大改善了自动并行,不过现阶段不要对编译器自动并行寄予太大希望。
3)我在linux下使用ifort,windows下不熟悉,我只能猜。你的win是32位的?还有,我听说ivf不是一个完整的编译器?连接器依靠vs,要是系统里没有64位的连接器,显然还是制造不了64位程序。我的说法不一定正确,听听其他人的意见。
我还补充几条:
5)每本并行优化书都可能提到:等串行程序编完了然后再并行化是一个错误的观点。那时候可能已经太复杂了。程序员从一开始就要think in parallel并行化思维。数据依赖性对并行化影响较大,要把无关的数据和操作分开,组织好数据,为并行打好基础。我觉得现代oop程序更易于并行化,而老的Fortran77要并行化反而不易,尤其是那些数据大量堆垒、操作复杂、大量使用common的程序。
6)有兴趣到这里看看,专门的并行讨论区。http://forum.csdn.net/SList/IntelMulti-core/UnClosedList/
7)你的程序多处已经多处自动并行化、矢量化(即使用了sse指令)了,与很多编译器相比已经不错了。我认为,只是我个人爱好,编译器就要最新版本的ivf,即使命令行也无所谓。然后再找个其它编译器如cvf,gfortran,pgi,帮助验证正确性。
8)你的编译选项中QaT可以去掉,如果你的程序只在自己的机器上运行不需要兼容其他cpu。
ivf编译选项里那个pgo优化有点意思,也能提高点速度,值得玩玩。
9)intel的一些资源:
intel优化白皮书http://www.intel.com/design/processor/manuals/248966.pdf
intel一些其他白皮书http://www.intel.com/cd/software/products/asmo-na/eng/338676.htm
isn论坛http://softwarecommunity.intel.com/isn/Community/en%2DUS/Forums/
10)我也是菜鸟,大家一起切磋,共同提高。
8 楼
cxcctv [专家分:240] 发布于 2008-05-02 08:51:00
谢谢你!原以为把串行写出来后改成并行就行,现在看来不行了,不能满足课题组的要求,看样子我的程序要动大手术了,过几天我把并行程序写出后大家共同讨论!!
9 楼
cxcctv [专家分:240] 发布于 2008-05-02 09:29:00
f2003大侠的这句话:"我听说ivf不是一个完整的编译器?连接器依靠vs,要是系统里没有64位的连接器,显然还是制造不了64位程序".
不知道有没有高手知道这个连接器? 多谢高手们留言!
[em2]
我来回复