* 尋找適合 autovec 的範例。
* gcc/gcc/testsuite/gcc.dg/vect/
* testsuite/gfortran.dg/vect
* testsuite/g++.dg/vect
* 研究 [[x86#MMX & SSE|x86]] arm GCC autovec 和 intrinsic
* 研究 [[clang]] LLVM 輸出
* 或許改用 dragonegg 輸出
* 是否還會經過 llvm pass?
* 觀察 clang 的 llvm ir 輸出,和 qemu-simd 輸出的 simd ir 比較以得知對應關係
* 開啟 user mode simd
* [[http://gcc.gnu.org/projects/tree-ssa/vectorization.html|Auto-vectorization in GCC]]
* 從中取得範例
* [[http://llvm.org/docs/Vectorizers.html|Auto-Vectorization in LLVM]]
* test-suite/SingleSource/UnitTests/Vectorizer
* [[https://twiki.cern.ch/twiki/bin/view/CMSPublic/WorkBookWritingAutovectorizableCode|10.3.2 Writing Autovectorizable Code]]
* [[http://hal.inria.fr/inria-00589692/|Vapor SIMD: Auto-Vectorize Once, Run Everywhere]]
====== GCC ======
/*
 * NOTE(review): the header names on the two #include lines were lost when
 * this page was extracted. They are restored here as <stdlib.h> and <math.h>,
 * presumably matching the Lockless article this example is taken from --
 * confirm against the original. test4 itself uses nothing from either header.
 */
#include <stdlib.h>
#include <math.h>

/* Number of doubles processed by test4 (1L << 16 = 65536). */
#define SIZE (1L << 16)

/*
 * test4 - add b[i] into a[i] for the first SIZE elements.
 *
 * a: destination array, updated in place; must hold at least SIZE doubles
 *    and be 16-byte aligned.
 * b: source array, read only by this function; must hold at least SIZE
 *    doubles and be 16-byte aligned.
 *
 * Both pointers are restrict-qualified, so the caller promises that the two
 * arrays do not overlap.
 */
void test4(double * restrict a, double * restrict b)
{
    int i;

    /*
     * Tell the compiler that both arrays are 16-byte aligned. With this
     * promise GCC emits the aligned SSE2 move (movapd) in the listing that
     * follows this snippet; passing a misaligned pointer is undefined.
     */
    double *x = __builtin_assume_aligned(a, 16);
    double *y = __builtin_assume_aligned(b, 16);

    for (i = 0; i < SIZE; i++)
    {
        x[i] += y[i];
    }
}
$ gcc -O3 -std=gnu99 -c test4.c -o test4
$ objdump -d test4
0000000000000000 <test4>:
0: 31 c0 xor %eax,%eax
2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
8: 66 0f 28 04 07 movapd (%rdi,%rax,1),%xmm0
d: 66 0f 58 04 06 addpd (%rsi,%rax,1),%xmm0
12: 66 0f 29 04 07 movapd %xmm0,(%rdi,%rax,1)
17: 48 83 c0 10 add $0x10,%rax
1b: 48 3d 00 00 08 00 cmp $0x80000,%rax
21: 75 e5 jne 8 <test4+0x8>
23: f3 c3 repz retq
* [[http://locklessinc.com/articles/vectorize/|Auto-vectorization with gcc 4.7]]
====== LLVM ======
/*
 * NOTE(review): the header names on the two #include lines were lost when
 * this page was extracted. They are restored here as <stdlib.h> and <math.h>,
 * presumably matching the Lockless article this example is taken from --
 * confirm against the original. test4 itself uses nothing from either header.
 */
#include <stdlib.h>
#include <math.h>

/* Number of doubles processed by test4 (1L << 16 = 65536). */
#define SIZE (1L << 16)

/*
 * test4 - add b[i] into a[i] for the first SIZE elements.
 *
 * a: destination array, updated in place; must hold at least SIZE doubles.
 * b: source array, read only by this function; must hold at least SIZE
 *    doubles.
 *
 * Both pointers are restrict-qualified, so the caller promises that the two
 * arrays do not overlap. No alignment beyond that of double is promised,
 * which is why the clang listing that follows uses unaligned moves (movupd).
 */
void test4(double * restrict a, double * restrict b)
{
    int i;

    for (i = 0; i < SIZE; i++)
    {
        a[i] += b[i];
    }
}
$ clang -O3 -c test4.c -o test4
$ objdump -d test4
0000000000000000 <test4>:
0: 31 c0 xor %eax,%eax
2: 66 66 66 66 66 2e 0f data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
9: 1f 84 00 00 00 00 00
10: 66 0f 10 0c c7 movupd (%rdi,%rax,8),%xmm1
15: 66 0f 10 04 c6 movupd (%rsi,%rax,8),%xmm0
1a: 66 0f 58 c1 addpd %xmm1,%xmm0
1e: 66 0f 10 4c c7 10 movupd 0x10(%rdi,%rax,8),%xmm1
24: 66 0f 10 54 c6 10 movupd 0x10(%rsi,%rax,8),%xmm2
2a: 66 0f 11 04 c7 movupd %xmm0,(%rdi,%rax,8)
2f: 66 0f 58 d1 addpd %xmm1,%xmm2
33: 66 0f 11 54 c7 10 movupd %xmm2,0x10(%rdi,%rax,8)
39: 48 83 c0 04 add $0x4,%rax
3d: 48 3d 00 00 01 00 cmp $0x10000,%rax
43: 75 cb jne 10 <test4+0x10>
45: c3 retq
$ clang -O3 -S -emit-llvm test4.c -o test4.ll
; Vectorized IR for test4 as printed by `clang -O3 -S -emit-llvm`.
; Both pointer arguments are marked noalias (the C source declares them restrict).
define void @test4(double* noalias nocapture %a, double* noalias nocapture %b) #0 {
vector.ph:
br label %vector.body
; Each trip through vector.body handles 4 doubles as two <2 x double> vectors.
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; Load b[index .. index+1]. `align 8` means only 8-byte alignment is assumed,
; not 16-byte vector alignment.
%0 = getelementptr inbounds double* %b, i64 %index
%1 = bitcast double* %0 to <2 x double>*
%wide.load = load <2 x double>* %1, align 8
; %index starts at 0 and advances by 4, so `or %index, 2` equals %index + 2.
; Load b[index+2 .. index+3].
%.sum15 = or i64 %index, 2
%2 = getelementptr double* %b, i64 %.sum15
%3 = bitcast double* %2 to <2 x double>*
%wide.load10 = load <2 x double>* %3, align 8
; Load a[index .. index+1].
%4 = getelementptr inbounds double* %a, i64 %index
%5 = bitcast double* %4 to <2 x double>*
%wide.load11 = load <2 x double>* %5, align 8
; Load a[index+2 .. index+3].
%.sum16 = or i64 %index, 2
%6 = getelementptr double* %a, i64 %.sum16
%7 = bitcast double* %6 to <2 x double>*
%wide.load12 = load <2 x double>* %7, align 8
; Two vector additions, each summing a pair of doubles from b and a.
%8 = fadd <2 x double> %wide.load, %wide.load11
%9 = fadd <2 x double> %wide.load10, %wide.load12
; Write both sums back through the pointers into a (%5 and %7).
store <2 x double> %8, <2 x double>* %5, align 8
store <2 x double> %9, <2 x double>* %7, align 8
; Advance by 4 elements; leave the loop once all 65536 have been processed.
%index.next = add i64 %index, 4
%10 = icmp eq i64 %index.next, 65536
br i1 %10, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void
}
%8 = fadd <2 x double> %wide.load, %wide.load11
%9 = fadd <2 x double> %wide.load10, %wide.load12
* [[http://llvm.org/docs/LangRef.html#fadd-instruction|fadd]]
* [[http://llvm.org/docs/LangRef.html#vector-type|Vector Type]]
* clang 預設不假設 16-byte 對齊 (unaligned):組合語言使用 movupd,IR 中的 load/store 只標示 align 8。