• 尋找適合 autovec 的範例。
    • gcc/gcc/testsuite/gcc.dg/vect/
    • testsuite/gfortran.dg/vect
    • testsuite/g++.dg/vect
  • 研究 x86 arm GCC autovec 和 intrinsic
  • 研究 clang LLVM 輸出
    • 或許改用 dragonegg 輸出
    • 是否還會經過 llvm pass?
  • 觀察 clang 的 llvm ir 輸出,和 qemu-simd 輸出的 simd ir 比較以得知對應關係
  • 開啟 user mode simd

GCC

#include <stdlib.h>
#include <math.h>
 
#define SIZE    (1L << 16)
 
/*
 * test4 - element-wise accumulate: a[i] += b[i] for every i in [0, SIZE).
 *
 * a: destination array, read and written in place.
 * b: source array, only read.
 *
 * Both parameters are restrict-qualified, so the caller promises that the
 * two arrays do not overlap.
 */
void test4(double * restrict a, double * restrict b)
{
        int i;
 
        /*
         * Tell GCC to assume both arrays start on a 16-byte boundary.
         * The objdump listing that follows this function shows the loop
         * using the aligned movapd form for its loads and stores.
         */
        double *x = __builtin_assume_aligned(a, 16);
        double *y = __builtin_assume_aligned(b, 16);
 
        /* Plain scalar loop in the source; vectorizing it is left to the compiler. */
        for (i = 0; i < SIZE; i++)
        {
                x[i] += y[i];
        }
}
$ gcc -O3 -std=gnu99 -c test4.c -o test4
0000000000000000 <test4>:
   0:   31 c0                   xor    %eax,%eax
   2:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
   8:   66 0f 28 04 07          movapd (%rdi,%rax,1),%xmm0
   d:   66 0f 58 04 06          addpd  (%rsi,%rax,1),%xmm0
  12:   66 0f 29 04 07          movapd %xmm0,(%rdi,%rax,1)
  17:   48 83 c0 10             add    $0x10,%rax
  1b:   48 3d 00 00 08 00       cmp    $0x80000,%rax
  21:   75 e5                   jne    8 <test4+0x8>
  23:   f3 c3                   repz retq 

LLVM

#include <stdlib.h>
#include <math.h>
 
#define SIZE    (1L << 16)
 
/*
 * test4 - element-wise accumulate: a[i] += b[i] for every i in [0, SIZE).
 *
 * a: destination array, read and written in place.
 * b: source array, only read.
 *
 * Both parameters are restrict-qualified, so the caller promises that the
 * two arrays do not overlap. This variant gives the compiler no alignment
 * hint; the clang listing that follows uses the unaligned movupd form.
 */
void test4(double * restrict a, double * restrict b)
{
        int i;
 
        /* Plain scalar loop in the source; vectorizing it is left to the compiler. */
        for (i = 0; i < SIZE; i++)
        {
                a[i] += b[i];
        }
}
$ clang -O3 -c test4.c -o test4
0000000000000000 <test4>:
   0:   31 c0                   xor    %eax,%eax
   2:   66 66 66 66 66 2e 0f    data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
   9:   1f 84 00 00 00 00 00 
  10:   66 0f 10 0c c7          movupd (%rdi,%rax,8),%xmm1
  15:   66 0f 10 04 c6          movupd (%rsi,%rax,8),%xmm0
  1a:   66 0f 58 c1             addpd  %xmm1,%xmm0
  1e:   66 0f 10 4c c7 10       movupd 0x10(%rdi,%rax,8),%xmm1
  24:   66 0f 10 54 c6 10       movupd 0x10(%rsi,%rax,8),%xmm2
  2a:   66 0f 11 04 c7          movupd %xmm0,(%rdi,%rax,8)
  2f:   66 0f 58 d1             addpd  %xmm1,%xmm2
  33:   66 0f 11 54 c7 10       movupd %xmm2,0x10(%rdi,%rax,8)
  39:   48 83 c0 04             add    $0x4,%rax
  3d:   48 3d 00 00 01 00       cmp    $0x10000,%rax
  43:   75 cb                   jne    10 <test4+0x10>
  45:   c3                      retq   
$ clang -O3 -S -emit-llvm test4.c -o test4.ll
define void @test4(double* noalias nocapture %a, double* noalias nocapture %b) #0 {
vector.ph:
  br label %vector.body
 
vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds double* %b, i64 %index
  %1 = bitcast double* %0 to <2 x double>*
  %wide.load = load <2 x double>* %1, align 8
  %.sum15 = or i64 %index, 2
  %2 = getelementptr double* %b, i64 %.sum15
  %3 = bitcast double* %2 to <2 x double>*
  %wide.load10 = load <2 x double>* %3, align 8
  %4 = getelementptr inbounds double* %a, i64 %index
  %5 = bitcast double* %4 to <2 x double>*
  %wide.load11 = load <2 x double>* %5, align 8
  %.sum16 = or i64 %index, 2
  %6 = getelementptr double* %a, i64 %.sum16
  %7 = bitcast double* %6 to <2 x double>*
  %wide.load12 = load <2 x double>* %7, align 8
  %8 = fadd <2 x double> %wide.load, %wide.load11
  %9 = fadd <2 x double> %wide.load10, %wide.load12
  store <2 x double> %8, <2 x double>* %5, align 8
  store <2 x double> %9, <2 x double>* %7, align 8
  %index.next = add i64 %index, 4
  %10 = icmp eq i64 %index.next, 65536
  br i1 %10, label %for.end, label %vector.body
 
for.end:                                          ; preds = %vector.body
  ret void
}
  %8 = fadd <2 x double> %wide.load, %wide.load11
  %9 = fadd <2 x double> %wide.load10, %wide.load12
登录