performance issue in Vc::Memory
Sandro Wenzel
[please enable javascript to see the address]
Wed Mar 12 20:36:24 CET 2014
Dear All,
By inspecting some assembly, I have recently made a surprising observation
on the behaviour of Vc::Memory.
The problem appears when I try to copy between Vc::Memory objects (see
function bar2): I realized that this copy does not use vector
instructions as I would have expected. In contrast, when I implement the
copy directly on vectors (as in function bar4), I get the expected
assembly.
I compile the examples below with
g++-[4.7|4.8] -O3 -mavx ...
with Vc0.8
#include <Vc/Vc>
// result ok
// Baseline: assigns a single Vc vector to another. In the disassembly in this
// post this becomes one vmovapd load from a and one vmovapd store to b.
void bar1( Vc::Vector<double> const & a, Vc::Vector<double> & b)
{
b=a;
}
// result surprising
// Copies one 5-entry Vc::Memory into another using Memory's assignment.
// This is the case being reported: in the disassembly in this post the copy
// is emitted as eight 8-byte mov load/store pairs through %rax (offsets
// 0x0..0x38) instead of vmovapd vector moves.
void bar2( Vc::Memory<Vc::Vector<double>, 5> const & a,
Vc::Memory<Vc::Vector<double>, 5> & b)
{
b=a;
}
// result ok
// Adds the contents of a onto b with Memory's compound assignment. Unlike the
// plain copy in bar2, the disassembly in this post shows two
// vmovapd / vaddpd / vmovapd sequences, i.e. vector instructions are used.
void bar3( Vc::Memory<Vc::Vector<double>, 5> const & a,
Vc::Memory<Vc::Vector<double>, 5> & b)
{
b+=a;
}
// result ok
// Hand-written replacement for the copy in bar2: walks both Memory objects
// vector by vector and copies each one explicitly. In the disassembly in this
// post this yields two vmovapd load/store pairs, which is the code one would
// expect bar2 to produce as well.
void bar4( Vc::Memory<Vc::Vector<double>, 5> const & a,
Vc::Memory<Vc::Vector<double>, 5> & b)
{
    // Number of vectors needed to cover the 5 entries, rounded up. The earlier
    // form 1+5/Size visits one vector too many whenever Size divides 5
    // (e.g. Size == 1); for Size == 4 both forms give 2.
    const int vectorCount =
        (5 + Vc::Vector<double>::Size - 1) / Vc::Vector<double>::Size;

    for( int i=0;i<vectorCount;++i )
    {
        // This intermediate is forced by some compilers because a direct
        // assignment b.vector(i)=a.vector(i) is not understood.
        Vc::Vector<double> tmp=a.vector(i);
        b.vector(i) = tmp;
    }
}
/// assembly follows:
0000000000000000 <_Z4bar1RKN2Vc3AVX6VectorIdEERS2_>:
0: c5 fd 28 07 vmovapd (%rdi),%ymm0
4: c5 fd 29 06 vmovapd %ymm0,(%rsi)
8: c5 f8 77 vzeroupper
b: c3 retq
c: 0f 1f 40 00 nopl 0x0(%rax)
0000000000000010 <_Z4bar2RKN2Vc6MemoryINS_3AVX6VectorIdEELm5ELm0EEERS4_>:
10: 48 8b 07 mov (%rdi),%rax
13: 48 89 06 mov %rax,(%rsi)
16: 48 8b 47 08 mov 0x8(%rdi),%rax
1a: 48 89 46 08 mov %rax,0x8(%rsi)
1e: 48 8b 47 10 mov 0x10(%rdi),%rax
22: 48 89 46 10 mov %rax,0x10(%rsi)
26: 48 8b 47 18 mov 0x18(%rdi),%rax
2a: 48 89 46 18 mov %rax,0x18(%rsi)
2e: 48 8b 47 20 mov 0x20(%rdi),%rax
32: 48 89 46 20 mov %rax,0x20(%rsi)
36: 48 8b 47 28 mov 0x28(%rdi),%rax
3a: 48 89 46 28 mov %rax,0x28(%rsi)
3e: 48 8b 47 30 mov 0x30(%rdi),%rax
42: 48 89 46 30 mov %rax,0x30(%rsi)
46: 48 8b 47 38 mov 0x38(%rdi),%rax
4a: 48 89 46 38 mov %rax,0x38(%rsi)
4e: c3 retq
4f: 90 nop
0000000000000050 <_Z4bar3RKN2Vc6MemoryINS_3AVX6VectorIdEELm5ELm0EEERS4_>:
50: c5 fd 28 06 vmovapd (%rsi),%ymm0
54: c5 fd 58 07 vaddpd (%rdi),%ymm0,%ymm0
58: c5 fd 29 06 vmovapd %ymm0,(%rsi)
5c: c5 fd 28 46 20 vmovapd 0x20(%rsi),%ymm0
61: c5 fd 58 47 20 vaddpd 0x20(%rdi),%ymm0,%ymm0
66: c5 fd 29 46 20 vmovapd %ymm0,0x20(%rsi)
6b: c5 f8 77 vzeroupper
6e: c3 retq
6f: 90 nop
0000000000000070 <_Z4bar4RKN2Vc6MemoryINS_3AVX6VectorIdEELm5ELm0EEERS4_>:
70: c5 fd 28 07 vmovapd (%rdi),%ymm0
74: c5 fd 29 06 vmovapd %ymm0,(%rsi)
78: c5 fd 28 47 20 vmovapd 0x20(%rdi),%ymm0
7d: c5 fd 29 46 20 vmovapd %ymm0,0x20(%rsi)
82: c5 f8 77 vzeroupper
85: c3 retq
-------------
Best
Sandro
--
Dr. Sandro Wenzel
PH / SFT
CERN
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://compeng.uni-frankfurt.de/pipermail/vc/attachments/20140312/8f05b022/attachment.html>
More information about the Vc
mailing list