gcc Auto-vectorization
#include <stdio.h>
/*
 * doFilter - store src[i] + 10 into dst[i] for each of the n elements.
 *
 * src: input samples; declared non-aliasing (__restrict) and promised to
 *      be 16-byte aligned through __builtin_assume_aligned.
 * dst: output buffer; carries the same non-aliasing and alignment promises.
 * n:   number of elements to process; nothing is written when n is 0.
 *
 * The addition is carried out in int (integer promotion) and the result
 * is converted back to short when it is stored.
 */
void doFilter(const short* __restrict src,
short* __restrict dst,
const size_t n)
{
    /* Tell the compiler that both buffers start on a 16-byte boundary. */
    const short* __restrict in = (const short*)__builtin_assume_aligned(src, 16);
    short* __restrict out = (short*)__builtin_assume_aligned(dst, 16);

    for (size_t idx = 0; idx < n; idx++) {
        out[idx] = (short)(in[idx] + 10);
    }
}
$ g++ -m32 -S -O3 -ftree-vectorizer-verbose=1 -fverbose-asm -msse2 ./vec.cpp
Analyzing loop at ./vec.cpp:10
Vectorizing loop at ./vec.cpp:10
./vec.cpp:10: note: === vect_do_peeling_for_loop_bound ===Setting upper bound of nb iterations for epilogue loop to 6
./vec.cpp:10: note: LOOP VECTORIZED.
./vec.cpp:3: note: vectorized 1 loops in function.
./vec.cpp:10: note: Completely unroll loop 6 times
movl 28(%esp), %eax # n, n
movl 20(%esp), %ebx # src, src
movl 24(%esp), %ecx # dst, dst
testl %eax, %eax # n
je .L1 #,
movl %eax, %edi # n, bnd.4
shrl $3, %edi #, bnd.4 (nは3ビット右シフトしてediに入れておく)
leal 0(,%edi,8), %edx #, i
testl %edx, %edx # i
je .L10 #,
cmpl $7, %eax #, n
jbe .L10 #,
xorl %esi, %esi # ivtmp.22
xorl %ebp, %ebp # D.2999 (ループ変数ebpを0にする)
movdqa .LC0, %xmm0 #, tmp162
.L9:
movdqa (%ebx,%esi), %xmm1 # MEM[base: src_4, index: ivtmp.22_41, offset: 0B], vect_var_.13
addl $1, %ebp #, D.2999
paddw %xmm0, %xmm1 # tmp162, vect_var_.13
movdqa %xmm1, (%ecx,%esi) # vect_var_.13, MEM[base: dst_6, index: ivtmp.22_41, offset: 0B]
addl $16, %esi #, ivtmp.22
cmpl %ebp, %edi # D.2999, bnd.4 (ループ変数とn >> 3の比較)
ja .L9 #,