#include <stdlib.h>
#include <math.h>

#define SIZE (1L << 16)

/*
 * Element-wise accumulate: a[i] += b[i] for SIZE doubles.
 *
 * `restrict` promises the two arrays do not alias, and
 * __builtin_assume_aligned tells the compiler both pointers are
 * 16-byte aligned — together these let the vectorizer emit aligned
 * packed loads/stores with no runtime alias or alignment checks.
 * Callers MUST pass 16-byte-aligned, non-overlapping buffers.
 */
void test4(double * restrict a, double * restrict b)
{
    double *dst = __builtin_assume_aligned(a, 16);
    double *src = __builtin_assume_aligned(b, 16);

    for (int i = 0; i < SIZE; i++)
        dst[i] += src[i];
}
$ gcc -O3 -std=gnu99 -c test4.c -o test4
0000000000000000 <test4>: 0: 31 c0 xor %eax,%eax 2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) 8: 66 0f 28 04 07 movapd (%rdi,%rax,1),%xmm0 d: 66 0f 58 04 06 addpd (%rsi,%rax,1),%xmm0 12: 66 0f 29 04 07 movapd %xmm0,(%rdi,%rax,1) 17: 48 83 c0 10 add $0x10,%rax 1b: 48 3d 00 00 08 00 cmp $0x80000,%rax 21: 75 e5 jne 8 <test4+0x8> 23: f3 c3 repz retq
#include <stdlib.h>
#include <math.h>

#define SIZE (1L << 16)

/*
 * Element-wise accumulate: a[i] += b[i] for SIZE doubles.
 *
 * `restrict` rules out aliasing between the two arrays, but — unlike
 * the __builtin_assume_aligned variant — nothing is promised about
 * alignment, so the compiler must use unaligned vector accesses.
 * Callers MUST pass non-overlapping buffers of at least SIZE doubles.
 */
void test4(double * restrict a, double * restrict b)
{
    for (int i = 0; i < SIZE; i++)
        a[i] += b[i];
}
$ clang -O3 -c test4.c -o test4
0000000000000000 <test4>: 0: 31 c0 xor %eax,%eax 2: 66 66 66 66 66 2e 0f data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1) 9: 1f 84 00 00 00 00 00 10: 66 0f 10 0c c7 movupd (%rdi,%rax,8),%xmm1 15: 66 0f 10 04 c6 movupd (%rsi,%rax,8),%xmm0 1a: 66 0f 58 c1 addpd %xmm1,%xmm0 1e: 66 0f 10 4c c7 10 movupd 0x10(%rdi,%rax,8),%xmm1 24: 66 0f 10 54 c6 10 movupd 0x10(%rsi,%rax,8),%xmm2 2a: 66 0f 11 04 c7 movupd %xmm0,(%rdi,%rax,8) 2f: 66 0f 58 d1 addpd %xmm1,%xmm2 33: 66 0f 11 54 c7 10 movupd %xmm2,0x10(%rdi,%rax,8) 39: 48 83 c0 04 add $0x4,%rax 3d: 48 3d 00 00 01 00 cmp $0x10000,%rax 43: 75 cb jne 10 <test4+0x10> 45: c3 retq
$ clang -O3 -S -emit-llvm test4.c -o test4.ll
define void @test4(double* noalias nocapture %a, double* noalias nocapture %b) #0 { vector.ph: br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %0 = getelementptr inbounds double* %b, i64 %index %1 = bitcast double* %0 to <2 x double>* %wide.load = load <2 x double>* %1, align 8 %.sum15 = or i64 %index, 2 %2 = getelementptr double* %b, i64 %.sum15 %3 = bitcast double* %2 to <2 x double>* %wide.load10 = load <2 x double>* %3, align 8 %4 = getelementptr inbounds double* %a, i64 %index %5 = bitcast double* %4 to <2 x double>* %wide.load11 = load <2 x double>* %5, align 8 %.sum16 = or i64 %index, 2 %6 = getelementptr double* %a, i64 %.sum16 %7 = bitcast double* %6 to <2 x double>* %wide.load12 = load <2 x double>* %7, align 8 %8 = fadd <2 x double> %wide.load, %wide.load11 %9 = fadd <2 x double> %wide.load10, %wide.load12 store <2 x double> %8, <2 x double>* %5, align 8 store <2 x double> %9, <2 x double>* %7, align 8 %index.next = add i64 %index, 4 %10 = icmp eq i64 %index.next, 65536 br i1 %10, label %for.end, label %vector.body for.end: ; preds = %vector.body ret void }
%8 = fadd <2 x double> %wide.load, %wide.load11 %9 = fadd <2 x double> %wide.load10, %wide.load12