/*
 * N x N matrix product accumulated into c, loops nested i -> j -> k.
 * Each c[i][j] receives the sum over k of a[i][k] * b[k][j], added to
 * whatever c[i][j] already holds (presumably zero on entry -- confirm
 * against the initialization code, which is outside this excerpt).
 */
for (i = 0; i < N; i++) {
    for (j = 0; j < N; j++) {
        /*
         * k is the innermost index here, so a is walked along its second
         * subscript while b is walked along its FIRST subscript; the
         * target element c[i][j] stays fixed for the whole inner loop.
         */
        for (k = 0; k < N; k++) {
            c[i][j] += a[i][k] * b[k][j];
        }
    }
}
stokes# cc mm.c
stokes# time a.out
256 5.537035
6.09u 0.03s 0:06.14 99.6%
stokes#
stokes# cc -xO3 mm.c
stokes# !t
time a.out
256 16.448251
2.05u 0.02s 0:02.07 100.0%
stokes#
N = 257
stokes# !cc
cc -xO3 mm.c
stokes# !t
time a.out
257 17.774443
1.93u 0.01s 0:01.96 98.9%
N = 512
stokes# !cc
cc -xO3 mm.c
stokes# !t
time a.out
512 10.943149
24.61u 0.07s 0:24.69 99.9%
stokes#
N = 513
stokes# !cc
cc -xO3 mm.c
stokes# !t
time a.out
513 12.217710
22.16u 0.06s 0:22.44 99.0%
stokes#
N = 513
stokes# cc -xtarget=ultra2 -xcache=16/32/1:4096/64/1 -xO3 mm.c
stokes# !t
time a.out
513 12.239864
22.12u 0.07s 0:22.20 99.9%
stokes#
stokes# cc -fast mm.c
stokes# !t
time a.out
513 17.682475
15.33u 0.07s 0:15.41 99.9%
stokes# setenv PARALLEL 4
stokes# cc -xparallel mm.c
cc: Warning: Optimizer level changed from 0 to 3 to support parallelized code.
stokes# !t
time a.out
513 45.152407
6.11u 0.06s 0:01.87 329.9%
stokes#
stokes# cc -xO3 -xparallel -xloopinfo mm.c
"mm.c", line 15: not parallelized, unsafe dependence (fi)
"mm.c", line 18: not parallelized, unsafe dependence (fj)
"mm.c", line 29: not parallelized, unsafe dependence (fi)
"mm.c", line 32: not parallelized, unsafe dependence (fj)
"mm.c", line 39: not parallelized, unsafe dependence (b)
"mm.c", line 41: PARALLELIZED, and serial version generated
"mm.c", line 49: PARALLELIZED
"mm.c", line 51: not parallelized, not profitable
"mm.c", line 53: not parallelized, unsafe dependence (c)
513 44.629982
6.15u 0.08s 0:01.87 333.1%
/*
 * Same N x N matrix product as the i -> j -> k version, with the loops
 * reordered to k -> i -> j. Every c[i][j] still accumulates
 * a[i][k] * b[k][j] for each k; only the order of the updates differs.
 */
for (k = 0; k < N; k++) {
    for (i = 0; i < N; i++) {
        /*
         * j is the innermost index here, so both c and b are walked along
         * their second subscript, and a[i][k] does not change during the
         * inner loop.
         */
        for (j = 0; j < N; j++) {
            c[i][j] += a[i][k] * b[k][j];
        }
    }
}
stokes# cc -xO3 mm.c
stokes# !t
time a.out
513 64.906585
4.22u 0.06s 0:04.29 99.7%