This project provides a unified compute & graphics host API, as well as a unified compute & graphics C++ device language and library to enable same-source CUDA/Host/Metal/OpenCL/Vulkan programming and execution.
The unified host API is implemented in compute and graphics. All backends (CUDA/Host/Metal/OpenCL/Vulkan) currently provide compute support, while graphics support is limited to Metal and Vulkan.
To provide a unified device language, a clang/LLVM/libc++ 14.0 toolchain has been modified.
Certain parts of libfloor are used by both host and device code (math and constexpr). Additional device library code is located in device.
Advanced examples can be found in the floor_examples repository.
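To give a rough idea of what the host side of this same-source model looks like, here is a minimal sketch of allocating buffers and launching a kernel. The type and function names used here (compute_context, compute_queue, create_buffer, execute, uint1) are illustrative assumptions rather than the verbatim libfloor API; the floor_examples repository shows real, complete usage.

// illustrative host-side sketch: names/signatures are assumptions, not the verbatim libfloor API
void launch_nbody(compute_context& ctx,          // hypothetical unified context type
                  compute_queue& queue,          // hypothetical device queue
                  compute_kernel& nbody_kernel,  // the compiled "simplified_nbody" kernel
                  const uint32_t body_count,
                  const float time_delta) {
    // one float4 (position + mass) and one float3 (velocity) per body
    auto in_positions = ctx.create_buffer(queue, sizeof(float) * 4u * body_count);
    auto out_positions = ctx.create_buffer(queue, sizeof(float) * 4u * body_count);
    auto velocities = ctx.create_buffer(queue, sizeof(float) * 3u * body_count);
    // global work size = #bodies, local work size must match kernel_local_size(256, 1, 1)
    queue.execute(nbody_kernel,
                  uint1 { body_count },  // global size
                  uint1 { 256u },        // local size
                  in_positions, out_positions, velocities, time_delta);
}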
Let’s take the fairly simple C++ kernel below, which computes the body/body interactions in an N-body simulation, and compile it for each backend. Note that loop unrolling is omitted for conciseness.
// define global constants
static constexpr constant const uint32_t NBODY_TILE_SIZE { 256u };
static constexpr constant const float NBODY_DAMPING { 0.999f };
static constexpr constant const float NBODY_SOFTENING { 0.01f };

// define a kernel with a required local size of (NBODY_TILE_SIZE = 256, 1, 1)
kernel kernel_local_size(NBODY_TILE_SIZE, 1, 1)
void simplified_nbody(buffer<const float4> in_positions, // read-only global memory buffer
                      buffer<float4> out_positions,      // read-write global memory buffer
                      buffer<float3> inout_velocities,   // read-write global memory buffer
                      param<float> time_delta) {         // read-only parameter
    // each work-item represents/computes one body
    const auto position = in_positions[global_id.x];
    auto velocity = inout_velocities[global_id.x];
    float3 acceleration; // vectors are automatically zero-initialized
    local_buffer<float4, NBODY_TILE_SIZE> local_body_positions; // local memory array allocation
    // loop over all bodies
    for (uint32_t i = 0, tile = 0, count = global_size.x; i < count; i += NBODY_TILE_SIZE, ++tile) {
        // move resp. body position/mass from global to local memory
        local_body_positions[local_id.x] = in_positions[tile * NBODY_TILE_SIZE + local_id.x];
        local_barrier(); // barrier across all work-items in this work-group
        // loop over bodies in this work-group
        for (uint32_t j = 0; j < NBODY_TILE_SIZE; ++j) {
            const auto r = local_body_positions[j].xyz - position.xyz;
            const auto dist_sq = r.dot(r) + (NBODY_SOFTENING * NBODY_SOFTENING);
            const auto inv_dist = rsqrt(dist_sq);
            const auto s = local_body_positions[j].w * (inv_dist * inv_dist * inv_dist); // .w is mass
            acceleration += r * s;
        }
        local_barrier();
    }
    velocity = (velocity + acceleration * time_delta) * NBODY_DAMPING;
    out_positions[global_id.x].xyz += velocity * time_delta; // update XYZ position
    inout_velocities[global_id.x] = velocity; // update velocity
}
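For reference, the quantity accumulated by the inner loop is the softened gravitational acceleration (with the gravitational constant folded into the masses stored in the .w components), and the last three lines perform a damped Euler integration step:

\[
\mathbf{a}_i \;=\; \sum_{j} \frac{m_j \,(\mathbf{p}_j - \mathbf{p}_i)}{\bigl(\lVert \mathbf{p}_j - \mathbf{p}_i \rVert^2 + \varepsilon^2\bigr)^{3/2}},
\qquad
\mathbf{v}_i \leftarrow d \,(\mathbf{v}_i + \mathbf{a}_i \,\Delta t),
\qquad
\mathbf{p}_i \leftarrow \mathbf{p}_i + \mathbf{v}_i \,\Delta t
\]

where ε = NBODY_SOFTENING and d = NBODY_DAMPING.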
The generated output for each backend is listed below.
CUDA / PTX
You can download the PTX file here and the CUBIN file here (note that building CUBINs is optional and requires ptxas).
//
// Generated by LLVM NVPTX Back-End
//
.version 7.2
.target sm_86
.address_size 64
// _ZZ16simplified_nbodyE20local_body_positions has been demoted
.const .align 4 .f32 _ZL13NBODY_DAMPING = 0f3F7FBE77;
// -- Begin function simplified_nbody
// @simplified_nbody
.visible .entry simplified_nbody(
.param .u64 simplified_nbody_param_0,
.param .u64 simplified_nbody_param_1,
.param .u64 simplified_nbody_param_2,
.param .f32 simplified_nbody_param_3
)
.reqntid 256, 1, 1
{
.reg .pred %p<3>;
.reg .f32 %f<72>;
.reg .b32 %r<25>;
.reg .b64 %rd<18>;
// demoted variable
.shared .align 4 .b8 _ZZ16simplified_nbodyE20local_body_positions[4096];
// %bb.0:
mov.u32 %r1, %tid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %ntid.x;
mad.lo.s32 %r13, %r12, %r11, %r1;
cvt.u64.u32 %rd3, %r13;
mul.wide.u32 %rd7, %r13, 12;
ld.param.u64 %rd8, [simplified_nbody_param_2];
cvta.to.global.u64 %rd9, %rd8;
add.s64 %rd4, %rd9, %rd7;
ld.global.f32 %f6, [%rd4+8];
add.s64 %rd6, %rd4, 8;
ld.global.f32 %f5, [%rd4+4];
add.s64 %rd5, %rd4, 4;
ld.global.f32 %f4, [%rd4];
mul.wide.u32 %rd10, %r13, 16;
ld.param.u64 %rd11, [simplified_nbody_param_0];
cvta.to.global.u64 %rd2, %rd11;
add.s64 %rd12, %rd2, %rd10;
ld.global.nc.f32 %f3, [%rd12+8];
ld.global.nc.f32 %f2, [%rd12+4];
ld.global.nc.f32 %f1, [%rd12];
mov.u32 %r14, %nctaid.x;
mul.lo.s32 %r2, %r14, %r12;
shl.b32 %r15, %r1, 4;
mov.u32 %r16, _ZZ16simplified_nbodyE20local_body_positions;
add.s32 %r3, %r16, %r15;
ld.param.u64 %rd13, [simplified_nbody_param_1];
cvta.to.global.u64 %rd1, %rd13;
mov.f32 %f69, 0f00000000;
mov.u32 %r10, 0;
ld.param.f32 %f16, [simplified_nbody_param_3];
mov.u32 %r22, %r10;
mov.u32 %r23, %r10;
mov.f32 %f70, %f69;
mov.f32 %f71, %f69;
LBB0_1: // =>This Loop Header: Depth=1
// Child Loop BB0_2 Depth 2
shl.b32 %r18, %r23, 8;
add.s32 %r19, %r18, %r1;
mul.wide.u32 %rd14, %r19, 16;
add.s64 %rd15, %rd2, %rd14;
ld.global.nc.f32 %f18, [%rd15];
st.shared.f32 [%r3], %f18;
ld.global.nc.f32 %f19, [%rd15+4];
st.shared.f32 [%r3+4], %f19;
ld.global.nc.f32 %f20, [%rd15+8];
st.shared.f32 [%r3+8], %f20;
ld.global.nc.f32 %f21, [%rd15+12];
st.shared.f32 [%r3+12], %f21;
bar.sync 0;
mov.u32 %r24, %r10;
LBB0_2: // Parent Loop BB0_1 Depth=1
// => This Inner Loop Header: Depth=2
add.s32 %r21, %r16, %r24;
ld.shared.f32 %f22, [%r21+4];
sub.f32 %f23, %f22, %f2;
ld.shared.f32 %f24, [%r21];
sub.f32 %f25, %f24, %f1;
fma.rn.f32 %f26, %f25, %f25, 0f38D1B717;
fma.rn.f32 %f27, %f23, %f23, %f26;
ld.shared.f32 %f28, [%r21+8];
sub.f32 %f29, %f28, %f3;
fma.rn.f32 %f30, %f29, %f29, %f27;
rsqrt.approx.ftz.f32 %f31, %f30;
mul.f32 %f32, %f31, %f31;
mul.f32 %f33, %f32, %f31;
ld.shared.f32 %f34, [%r21+12];
mul.f32 %f35, %f33, %f34;
fma.rn.f32 %f36, %f35, %f29, %f69;
ld.shared.f32 %f37, [%r21+20];
sub.f32 %f38, %f37, %f2;
ld.shared.f32 %f39, [%r21+16];
sub.f32 %f40, %f39, %f1;
fma.rn.f32 %f41, %f40, %f40, 0f38D1B717;
fma.rn.f32 %f42, %f38, %f38, %f41;
ld.shared.f32 %f43, [%r21+24];
sub.f32 %f44, %f43, %f3;
fma.rn.f32 %f45, %f44, %f44, %f42;
rsqrt.approx.ftz.f32 %f46, %f45;
mul.f32 %f47, %f46, %f46;
mul.f32 %f48, %f47, %f46;
ld.shared.f32 %f49, [%r21+28];
mul.f32 %f50, %f48, %f49;
fma.rn.f32 %f69, %f50, %f44, %f36;
fma.rn.f32 %f51, %f35, %f23, %f70;
fma.rn.f32 %f70, %f50, %f38, %f51;
fma.rn.f32 %f52, %f35, %f25, %f71;
fma.rn.f32 %f71, %f50, %f40, %f52;
add.s32 %r24, %r24, 32;
setp.eq.s32 %p1, %r24, 4096;
@%p1 bra LBB0_3;
bra.uni LBB0_2;
LBB0_3: // in Loop: Header=BB0_1 Depth=1
add.s32 %r22, %r22, 256;
setp.lt.u32 %p2, %r22, %r2;
bar.sync 0;
add.s32 %r23, %r23, 1;
@%p2 bra LBB0_1;
// %bb.4:
fma.rn.f32 %f53, %f71, %f16, %f4;
ld.const.f32 %f54, [_ZL13NBODY_DAMPING];
mul.f32 %f55, %f54, %f53;
shl.b64 %rd16, %rd3, 4;
add.s64 %rd17, %rd1, %rd16;
ld.global.f32 %f56, [%rd17];
fma.rn.f32 %f57, %f55, %f16, %f56;
st.global.f32 [%rd17], %f57;
fma.rn.f32 %f58, %f70, %f16, %f5;
mul.f32 %f59, %f54, %f58;
ld.global.f32 %f60, [%rd17+4];
fma.rn.f32 %f61, %f59, %f16, %f60;
st.global.f32 [%rd17+4], %f61;
fma.rn.f32 %f62, %f69, %f16, %f6;
mul.f32 %f63, %f54, %f62;
ld.global.f32 %f64, [%rd17+8];
fma.rn.f32 %f65, %f63, %f16, %f64;
st.global.f32 [%rd17+8], %f65;
st.global.f32 [%rd4], %f55;
st.global.f32 [%rd5], %f59;
st.global.f32 [%rd6], %f63;
ret;
// -- End function
}
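Independently of libfloor's own CUDA backend, the PTX above is an ordinary driver API module: as a minimal sketch (error handling and context setup omitted; this is not floor's host code), it could be JIT-compiled and launched like this:

#include <cuda.h>
#include <cstdint>
#include <string>

// minimal sketch: JIT the generated PTX and launch "simplified_nbody" via the CUDA driver API
// (assumes cuInit() has been called and a context is current on the target device)
void launch_generated_ptx(const std::string& ptx, const uint32_t body_count,
                          CUdeviceptr in_positions, CUdeviceptr out_positions,
                          CUdeviceptr inout_velocities, float time_delta) {
    CUmodule mod = nullptr;
    CUfunction func = nullptr;
    cuModuleLoadData(&mod, ptx.c_str());                 // JIT-compile the PTX for the current device
    cuModuleGetFunction(&func, mod, "simplified_nbody"); // entry point from ".visible .entry" above
    // kernel parameters match the .param list: three u64 buffer pointers + one f32
    void* args[] = { &in_positions, &out_positions, &inout_velocities, &time_delta };
    // ".reqntid 256, 1, 1" -> the block size must be exactly 256
    const uint32_t local_size = 256u;
    const uint32_t group_count = (body_count + local_size - 1u) / local_size;
    cuLaunchKernel(func, group_count, 1u, 1u, // grid dimensions
                   local_size, 1u, 1u,        // block dimensions
                   0u, nullptr,               // no dynamic shared memory, default stream
                   args, nullptr);
    cuModuleUnload(mod);
}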
Host-Compute (x86 CPU)
Note that the compiler would usually directly output a .bin file (ELF format). The output below comes from disassembling it with objdump -d.
Also note that this has been compiled for the x86-4 target (AVX-512).
nbody.bin: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <simplified_nbody>:
0: 55 push %rbp
1: 48 89 e5 mov %rsp,%rbp
4: 41 57 push %r15
6: 41 56 push %r14
8: 41 55 push %r13
a: 41 54 push %r12
c: 53 push %rbx
d: 48 83 e4 c0 and $0xffffffffffffffc0,%rsp
11: 48 81 ec 40 04 00 00 sub $0x440,%rsp
18: 48 89 4c 24 50 mov %rcx,0x50(%rsp)
1d: 48 89 74 24 70 mov %rsi,0x70(%rsp)
22: 48 89 fb mov %rdi,%rbx
25: 48 8d 05 f9 ff ff ff lea -0x7(%rip),%rax # 25 <simplified_nbody+0x25>
2c: 48 bf 00 00 00 00 00 movabs $0x0,%rdi
33: 00 00 00
36: 48 01 c7 add %rax,%rdi
39: 48 b8 00 00 00 00 00 movabs $0x0,%rax
40: 00 00 00
43: 48 8b 04 07 mov (%rdi,%rax,1),%rax
47: 8b 00 mov (%rax),%eax
49: 48 8d 0c 40 lea (%rax,%rax,2),%rcx
4d: 48 89 c6 mov %rax,%rsi
50: 48 c1 e6 04 shl $0x4,%rsi
54: 48 8d 04 8a lea (%rdx,%rcx,4),%rax
58: 48 89 44 24 68 mov %rax,0x68(%rsp)
5d: c5 fa 10 04 8a vmovss (%rdx,%rcx,4),%xmm0
62: c5 fa 11 44 24 10 vmovss %xmm0,0x10(%rsp)
68: c5 fa 10 44 8a 04 vmovss 0x4(%rdx,%rcx,4),%xmm0
6e: c5 fa 11 44 24 14 vmovss %xmm0,0x14(%rsp)
74: c5 fa 10 44 8a 08 vmovss 0x8(%rdx,%rcx,4),%xmm0
7a: c5 fa 11 44 24 18 vmovss %xmm0,0x18(%rsp)
80: 48 b8 00 00 00 00 00 movabs $0x0,%rax
87: 00 00 00
8a: 48 8b 04 07 mov (%rdi,%rax,1),%rax
8e: 8b 00 mov (%rax),%eax
90: 89 44 24 1c mov %eax,0x1c(%rsp)
94: 85 c0 test %eax,%eax
96: 48 89 74 24 60 mov %rsi,0x60(%rsp)
9b: 48 89 7c 24 58 mov %rdi,0x58(%rsp)
a0: 0f 84 44 05 00 00 je 5ea <simplified_nbody+0x5ea>
a6: c5 fa 10 04 33 vmovss (%rbx,%rsi,1),%xmm0
ab: c5 fa 10 4c 33 04 vmovss 0x4(%rbx,%rsi,1),%xmm1
b1: c5 fa 10 54 33 08 vmovss 0x8(%rbx,%rsi,1),%xmm2
b7: 62 f2 7d 48 18 c0 vbroadcastss %xmm0,%zmm0
bd: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x3c0(%rsp)
c4: 0f
c5: 62 f2 7d 48 18 c1 vbroadcastss %xmm1,%zmm0
cb: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x380(%rsp)
d2: 0e
d3: 62 f2 7d 48 18 c2 vbroadcastss %xmm2,%zmm0
d9: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x340(%rsp)
e0: 0d
e1: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
e5: c5 f8 29 44 24 30 vmovaps %xmm0,0x30(%rsp)
eb: 45 31 ff xor %r15d,%r15d
ee: 48 b8 00 00 00 00 00 movabs $0x0,%rax
f5: 00 00 00
f8: 48 8b 04 07 mov (%rdi,%rax,1),%rax
fc: 48 89 44 24 78 mov %rax,0x78(%rsp)
101: 49 be 00 00 00 00 00 movabs $0x0,%r14
108: 00 00 00
10b: 49 01 fe add %rdi,%r14
10e: 48 b8 00 00 00 00 00 movabs $0x0,%rax
115: 00 00 00
118: 4c 8b 2c 07 mov (%rdi,%rax,1),%r13
11c: 48 b8 00 00 00 00 00 movabs $0x0,%rax
123: 00 00 00
126: 62 f1 7c 48 28 04 07 vmovaps (%rdi,%rax,1),%zmm0
12d: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x300(%rsp)
134: 0c
135: 48 b8 00 00 00 00 00 movabs $0x0,%rax
13c: 00 00 00
13f: 62 f1 7c 48 28 04 07 vmovaps (%rdi,%rax,1),%zmm0
146: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x2c0(%rsp)
14d: 0b
14e: 48 b8 00 00 00 00 00 movabs $0x0,%rax
155: 00 00 00
158: 62 f1 7c 48 28 04 07 vmovaps (%rdi,%rax,1),%zmm0
15f: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x280(%rsp)
166: 0a
167: 48 b8 00 00 00 00 00 movabs $0x0,%rax
16e: 00 00 00
171: 62 f1 7c 48 28 04 07 vmovaps (%rdi,%rax,1),%zmm0
178: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x240(%rsp)
17f: 09
180: 48 b8 00 00 00 00 00 movabs $0x0,%rax
187: 00 00 00
18a: 62 f1 7c 48 28 04 07 vmovaps (%rdi,%rax,1),%zmm0
191: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x200(%rsp)
198: 08
199: 48 b8 00 00 00 00 00 movabs $0x0,%rax
1a0: 00 00 00
1a3: 62 f1 7c 48 28 04 07 vmovaps (%rdi,%rax,1),%zmm0
1aa: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x1c0(%rsp)
1b1: 07
1b2: 48 b8 00 00 00 00 00 movabs $0x0,%rax
1b9: 00 00 00
1bc: 62 f1 7c 48 28 04 07 vmovaps (%rdi,%rax,1),%zmm0
1c3: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x180(%rsp)
1ca: 06
1cb: 48 b8 00 00 00 00 00 movabs $0x0,%rax
1d2: 00 00 00
1d5: 62 f1 7c 48 28 04 07 vmovaps (%rdi,%rax,1),%zmm0
1dc: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x140(%rsp)
1e3: 05
1e4: 48 b8 00 00 00 00 00 movabs $0x0,%rax
1eb: 00 00 00
1ee: 62 f2 7d 48 18 04 07 vbroadcastss (%rdi,%rax,1),%zmm0
1f5: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x100(%rsp)
1fc: 04
1fd: 48 b8 00 00 00 00 00 movabs $0x0,%rax
204: 00 00 00
207: 62 f2 7d 48 18 04 07 vbroadcastss (%rdi,%rax,1),%zmm0
20e: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0xc0(%rsp)
215: 03
216: 48 b8 00 00 00 00 00 movabs $0x0,%rax
21d: 00 00 00
220: 62 f2 7d 48 18 04 07 vbroadcastss (%rdi,%rax,1),%zmm0
227: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x80(%rsp)
22e: 02
22f: 45 31 e4 xor %r12d,%r12d
232: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
236: c5 f8 29 44 24 20 vmovaps %xmm0,0x20(%rsp)
23c: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
240: c5 f8 29 44 24 40 vmovaps %xmm0,0x40(%rsp)
246: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1)
24d: 00 00 00
250: 44 89 e0 mov %r12d,%eax
253: c1 e0 08 shl $0x8,%eax
256: 48 8b 4c 24 78 mov 0x78(%rsp),%rcx
25b: 8b 09 mov (%rcx),%ecx
25d: 01 c8 add %ecx,%eax
25f: 48 c1 e0 04 shl $0x4,%rax
263: 48 c1 e1 04 shl $0x4,%rcx
267: c5 f8 10 04 03 vmovups (%rbx,%rax,1),%xmm0
26c: c4 a1 78 29 04 31 vmovaps %xmm0,(%rcx,%r14,1)
272: c5 f8 77 vzeroupper
275: 41 ff d5 call *%r13
278: c5 e0 57 db vxorps %xmm3,%xmm3,%xmm3
27c: c4 e3 61 0c 44 24 30 vblendps $0x1,0x30(%rsp),%xmm3,%xmm0
283: 01
284: c5 f0 57 c9 vxorps %xmm1,%xmm1,%xmm1
288: c4 e3 61 0c 54 24 20 vblendps $0x1,0x20(%rsp),%xmm3,%xmm2
28f: 01
290: c4 e3 61 0c 64 24 40 vblendps $0x1,0x40(%rsp),%xmm3,%xmm4
297: 01
298: b8 00 00 00 00 mov $0x0,%eax
29d: c5 e0 57 db vxorps %xmm3,%xmm3,%xmm3
2a1: c5 d0 57 ed vxorps %xmm5,%xmm5,%xmm5
2a5: 62 61 7c 48 28 44 24 vmovaps 0x300(%rsp),%zmm24
2ac: 0c
2ad: 62 61 7c 48 28 4c 24 vmovaps 0x2c0(%rsp),%zmm25
2b4: 0b
2b5: 62 61 7c 48 28 54 24 vmovaps 0x280(%rsp),%zmm26
2bc: 0a
2bd: 62 61 7c 48 28 5c 24 vmovaps 0x240(%rsp),%zmm27
2c4: 09
2c5: 62 61 7c 48 28 64 24 vmovaps 0x200(%rsp),%zmm28
2cc: 08
2cd: 62 61 7c 48 28 6c 24 vmovaps 0x1c0(%rsp),%zmm29
2d4: 07
2d5: 62 61 7c 48 28 74 24 vmovaps 0x180(%rsp),%zmm30
2dc: 06
2dd: 62 61 7c 48 28 7c 24 vmovaps 0x140(%rsp),%zmm31
2e4: 05
2e5: 62 e1 7c 48 28 6c 24 vmovaps 0x100(%rsp),%zmm21
2ec: 04
2ed: 62 e1 7c 48 28 74 24 vmovaps 0xc0(%rsp),%zmm22
2f4: 03
2f5: 62 e1 7c 48 28 7c 24 vmovaps 0x80(%rsp),%zmm23
2fc: 02
2fd: 0f 1f 00 nopl (%rax)
300: 62 b1 7c 48 28 3c 30 vmovaps (%rax,%r14,1),%zmm7
307: 62 31 7c 48 28 44 30 vmovaps 0x40(%rax,%r14,1),%zmm8
30e: 01
30f: 62 31 7c 48 28 4c 30 vmovaps 0x80(%rax,%r14,1),%zmm9
316: 02
317: 62 31 7c 48 28 54 30 vmovaps 0xc0(%rax,%r14,1),%zmm10
31e: 03
31f: 62 31 7c 48 28 5c 30 vmovaps 0x140(%rax,%r14,1),%zmm11
326: 05
327: 62 31 7c 48 28 64 30 vmovaps 0x100(%rax,%r14,1),%zmm12
32e: 04
32f: 62 31 7c 48 28 6c 30 vmovaps 0x1c0(%rax,%r14,1),%zmm13
336: 07
337: 62 31 7c 48 28 74 30 vmovaps 0x180(%rax,%r14,1),%zmm14
33e: 06
33f: 62 d1 7c 48 28 f1 vmovaps %zmm9,%zmm6
345: 62 d2 3d 40 7f f2 vpermt2ps %zmm10,%zmm24,%zmm6
34b: 62 71 7c 48 28 ff vmovaps %zmm7,%zmm15
351: 62 52 35 40 7f f8 vpermt2ps %zmm8,%zmm25,%zmm15
357: 62 c1 7c 48 28 c6 vmovaps %zmm14,%zmm16
35d: 62 c2 3d 40 7f c5 vpermt2ps %zmm13,%zmm24,%zmm16
363: 62 c1 7c 48 28 cc vmovaps %zmm12,%zmm17
369: 62 73 85 48 23 fe e4 vshuff64x2 $0xe4,%zmm6,%zmm15,%zmm15
370: 62 c2 35 40 7f cb vpermt2ps %zmm11,%zmm25,%zmm17
376: 62 b3 f5 40 23 f0 e4 vshuff64x2 $0xe4,%zmm16,%zmm17,%zmm6
37d: 62 c1 7c 48 28 c1 vmovaps %zmm9,%zmm16
383: 62 c2 2d 40 7f c2 vpermt2ps %zmm10,%zmm26,%zmm16
389: 62 e1 7c 48 28 cf vmovaps %zmm7,%zmm17
38f: 62 c2 25 40 7f c8 vpermt2ps %zmm8,%zmm27,%zmm17
395: 62 c1 7c 48 28 d6 vmovaps %zmm14,%zmm18
39b: 62 c2 2d 40 7f d5 vpermt2ps %zmm13,%zmm26,%zmm18
3a1: 62 c1 7c 48 28 dc vmovaps %zmm12,%zmm19
3a7: 62 a3 f5 40 23 c0 e4 vshuff64x2 $0xe4,%zmm16,%zmm17,%zmm16
3ae: 62 c2 25 40 7f db vpermt2ps %zmm11,%zmm27,%zmm19
3b4: 62 a3 e5 40 23 ca e4 vshuff64x2 $0xe4,%zmm18,%zmm19,%zmm17
3bb: 62 c1 7c 48 28 d1 vmovaps %zmm9,%zmm18
3c1: 62 c2 1d 40 7f d2 vpermt2ps %zmm10,%zmm28,%zmm18
3c7: 62 e1 7c 48 28 df vmovaps %zmm7,%zmm19
3cd: 62 c2 15 40 7f d8 vpermt2ps %zmm8,%zmm29,%zmm19
3d3: 62 c1 7c 48 28 e6 vmovaps %zmm14,%zmm20
3d9: 62 c2 1d 40 7f e5 vpermt2ps %zmm13,%zmm28,%zmm20
3df: 62 a3 e5 40 23 d2 e4 vshuff64x2 $0xe4,%zmm18,%zmm19,%zmm18
3e6: 62 c1 7c 48 28 dc vmovaps %zmm12,%zmm19
3ec: 62 c2 15 40 7f db vpermt2ps %zmm11,%zmm29,%zmm19
3f2: 62 a3 e5 40 23 dc e4 vshuff64x2 $0xe4,%zmm20,%zmm19,%zmm19
3f9: 62 52 0d 40 7f ca vpermt2ps %zmm10,%zmm30,%zmm9
3ff: 62 d2 05 40 7f f8 vpermt2ps %zmm8,%zmm31,%zmm7
405: 62 52 0d 40 7f f5 vpermt2ps %zmm13,%zmm30,%zmm14
40b: 62 d3 c5 48 23 f9 e4 vshuff64x2 $0xe4,%zmm9,%zmm7,%zmm7
412: 62 52 05 40 7f e3 vpermt2ps %zmm11,%zmm31,%zmm12
418: 62 53 9d 48 23 c6 e4 vshuff64x2 $0xe4,%zmm14,%zmm12,%zmm8
41f: 62 71 7c 48 28 74 24 vmovaps 0x3c0(%rsp),%zmm14
426: 0f
427: 62 51 04 48 5c ce vsubps %zmm14,%zmm15,%zmm9
42d: 62 71 7c 48 28 7c 24 vmovaps 0x380(%rsp),%zmm15
434: 0e
435: 62 51 7c 40 5c d7 vsubps %zmm15,%zmm16,%zmm10
43b: 62 e1 7c 48 28 44 24 vmovaps 0x340(%rsp),%zmm16
442: 0d
443: 62 31 6c 40 5c d8 vsubps %zmm16,%zmm18,%zmm11
449: 62 51 7c 48 28 e1 vmovaps %zmm9,%zmm12
44f: 62 32 35 48 a8 e5 vfmadd213ps %zmm21,%zmm9,%zmm12
455: 62 52 2d 48 b8 e2 vfmadd231ps %zmm10,%zmm10,%zmm12
45b: 62 52 25 48 b8 e3 vfmadd231ps %zmm11,%zmm11,%zmm12
461: 62 52 7d 48 4e ec vrsqrt14ps %zmm12,%zmm13
467: 62 51 1c 48 59 e5 vmulps %zmm13,%zmm12,%zmm12
46d: 62 32 15 48 a8 e6 vfmadd213ps %zmm22,%zmm13,%zmm12
473: 62 31 14 48 59 ef vmulps %zmm23,%zmm13,%zmm13
479: 62 51 14 48 59 e4 vmulps %zmm12,%zmm13,%zmm12
47f: 62 d1 44 48 59 fc vmulps %zmm12,%zmm7,%zmm7
485: 62 51 1c 48 59 e4 vmulps %zmm12,%zmm12,%zmm12
48b: 62 f1 1c 48 59 ff vmulps %zmm7,%zmm12,%zmm7
491: 62 d2 45 48 b8 e1 vfmadd231ps %zmm9,%zmm7,%zmm4
497: 62 d2 45 48 b8 d2 vfmadd231ps %zmm10,%zmm7,%zmm2
49d: 62 d1 4c 48 5c f6 vsubps %zmm14,%zmm6,%zmm6
4a3: 62 51 74 40 5c cf vsubps %zmm15,%zmm17,%zmm9
4a9: 62 31 64 40 5c d0 vsubps %zmm16,%zmm19,%zmm10
4af: 62 d2 45 48 b8 c3 vfmadd231ps %zmm11,%zmm7,%zmm0
4b5: 62 f1 7c 48 28 fe vmovaps %zmm6,%zmm7
4bb: 62 b2 4d 48 a8 fd vfmadd213ps %zmm21,%zmm6,%zmm7
4c1: 62 d2 35 48 b8 f9 vfmadd231ps %zmm9,%zmm9,%zmm7
4c7: 62 d2 2d 48 b8 fa vfmadd231ps %zmm10,%zmm10,%zmm7
4cd: 62 72 7d 48 4e df vrsqrt14ps %zmm7,%zmm11
4d3: 62 d1 44 48 59 fb vmulps %zmm11,%zmm7,%zmm7
4d9: 62 b2 25 48 a8 fe vfmadd213ps %zmm22,%zmm11,%zmm7
4df: 62 31 24 48 59 df vmulps %zmm23,%zmm11,%zmm11
4e5: 62 f1 24 48 59 ff vmulps %zmm7,%zmm11,%zmm7
4eb: 62 71 3c 48 59 c7 vmulps %zmm7,%zmm8,%zmm8
4f1: 62 f1 44 48 59 ff vmulps %zmm7,%zmm7,%zmm7
4f7: 62 d1 44 48 59 f8 vmulps %zmm8,%zmm7,%zmm7
4fd: 62 f2 45 48 b8 ee vfmadd231ps %zmm6,%zmm7,%zmm5
503: 62 d2 45 48 b8 d9 vfmadd231ps %zmm9,%zmm7,%zmm3
509: 62 d2 45 48 b8 ca vfmadd231ps %zmm10,%zmm7,%zmm1
50f: 48 05 00 02 00 00 add $0x200,%rax
515: 48 3d 00 10 00 00 cmp $0x1000,%rax
51b: 0f 85 df fd ff ff jne 300 <simplified_nbody+0x300>
521: 62 f1 54 48 58 e4 vaddps %zmm4,%zmm5,%zmm4
527: 62 f3 fd 48 1b e5 01 vextractf64x4 $0x1,%zmm4,%ymm5
52e: 62 f1 5c 48 58 e5 vaddps %zmm5,%zmm4,%zmm4
534: c4 e3 7d 19 e5 01 vextractf128 $0x1,%ymm4,%xmm5
53a: 62 f1 5c 48 58 e5 vaddps %zmm5,%zmm4,%zmm4
540: c4 e3 79 05 ec 01 vpermilpd $0x1,%xmm4,%xmm5
546: 62 f1 5c 48 58 e5 vaddps %zmm5,%zmm4,%zmm4
54c: c5 fa 16 ec vmovshdup %xmm4,%xmm5
550: c5 d8 58 e5 vaddps %xmm5,%xmm4,%xmm4
554: c5 f8 29 64 24 40 vmovaps %xmm4,0x40(%rsp)
55a: 62 f1 64 48 58 d2 vaddps %zmm2,%zmm3,%zmm2
560: 62 f3 fd 48 1b d3 01 vextractf64x4 $0x1,%zmm2,%ymm3
567: 62 f1 6c 48 58 d3 vaddps %zmm3,%zmm2,%zmm2
56d: c4 e3 7d 19 d3 01 vextractf128 $0x1,%ymm2,%xmm3
573: 62 f1 6c 48 58 d3 vaddps %zmm3,%zmm2,%zmm2
579: c4 e3 79 05 da 01 vpermilpd $0x1,%xmm2,%xmm3
57f: 62 f1 6c 48 58 d3 vaddps %zmm3,%zmm2,%zmm2
585: c5 fa 16 da vmovshdup %xmm2,%xmm3
589: c5 e8 58 d3 vaddps %xmm3,%xmm2,%xmm2
58d: c5 f8 29 54 24 20 vmovaps %xmm2,0x20(%rsp)
593: 62 f1 74 48 58 c0 vaddps %zmm0,%zmm1,%zmm0
599: 62 f3 fd 48 1b c1 01 vextractf64x4 $0x1,%zmm0,%ymm1
5a0: 62 f1 7c 48 58 c1 vaddps %zmm1,%zmm0,%zmm0
5a6: c4 e3 7d 19 c1 01 vextractf128 $0x1,%ymm0,%xmm1
5ac: 62 f1 7c 48 58 c1 vaddps %zmm1,%zmm0,%zmm0
5b2: c4 e3 79 05 c8 01 vpermilpd $0x1,%xmm0,%xmm1
5b8: 62 f1 7c 48 58 c1 vaddps %zmm1,%zmm0,%zmm0
5be: c5 fa 16 c8 vmovshdup %xmm0,%xmm1
5c2: c5 f8 58 c1 vaddps %xmm1,%xmm0,%xmm0
5c6: c5 f8 29 44 24 30 vmovaps %xmm0,0x30(%rsp)
5cc: c5 f8 77 vzeroupper
5cf: 41 ff d5 call *%r13
5d2: 41 81 c7 00 01 00 00 add $0x100,%r15d
5d9: 41 83 c4 01 add $0x1,%r12d
5dd: 44 3b 7c 24 1c cmp 0x1c(%rsp),%r15d
5e2: 0f 82 68 fc ff ff jb 250 <simplified_nbody+0x250>
5e8: eb 1e jmp 608 <simplified_nbody+0x608>
5ea: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
5ee: c5 f8 29 44 24 40 vmovaps %xmm0,0x40(%rsp)
5f4: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
5f8: c5 f8 29 44 24 20 vmovaps %xmm0,0x20(%rsp)
5fe: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
602: c5 f8 29 44 24 30 vmovaps %xmm0,0x30(%rsp)
608: 48 8b 44 24 50 mov 0x50(%rsp),%rax
60d: c5 fa 10 00 vmovss (%rax),%xmm0
611: c5 f8 28 54 24 40 vmovaps 0x40(%rsp),%xmm2
617: c4 e2 79 a9 54 24 10 vfmadd213ss 0x10(%rsp),%xmm0,%xmm2
61e: c5 f8 28 5c 24 20 vmovaps 0x20(%rsp),%xmm3
624: c4 e2 79 a9 5c 24 14 vfmadd213ss 0x14(%rsp),%xmm0,%xmm3
62b: 48 b8 00 00 00 00 00 movabs $0x0,%rax
632: 00 00 00
635: 48 8b 4c 24 58 mov 0x58(%rsp),%rcx
63a: c5 fa 10 0c 01 vmovss (%rcx,%rax,1),%xmm1
63f: c5 f8 28 64 24 30 vmovaps 0x30(%rsp),%xmm4
645: c4 e2 79 a9 64 24 18 vfmadd213ss 0x18(%rsp),%xmm0,%xmm4
64c: c5 ea 59 d1 vmulss %xmm1,%xmm2,%xmm2
650: c5 e2 59 d9 vmulss %xmm1,%xmm3,%xmm3
654: c5 da 59 c9 vmulss %xmm1,%xmm4,%xmm1
658: 48 8b 44 24 70 mov 0x70(%rsp),%rax
65d: 48 8b 4c 24 60 mov 0x60(%rsp),%rcx
662: c5 fa 10 24 08 vmovss (%rax,%rcx,1),%xmm4
667: c4 e2 69 b9 e0 vfmadd231ss %xmm0,%xmm2,%xmm4
66c: c5 fa 11 24 08 vmovss %xmm4,(%rax,%rcx,1)
671: c5 fa 10 64 08 04 vmovss 0x4(%rax,%rcx,1),%xmm4
677: c4 e2 61 b9 e0 vfmadd231ss %xmm0,%xmm3,%xmm4
67c: c5 fa 11 64 08 04 vmovss %xmm4,0x4(%rax,%rcx,1)
682: c4 e2 71 a9 44 08 08 vfmadd213ss 0x8(%rax,%rcx,1),%xmm1,%xmm0
689: c5 fa 11 44 08 08 vmovss %xmm0,0x8(%rax,%rcx,1)
68f: 48 8b 44 24 68 mov 0x68(%rsp),%rax
694: c5 fa 11 10 vmovss %xmm2,(%rax)
698: c5 fa 11 58 04 vmovss %xmm3,0x4(%rax)
69d: c5 fa 11 48 08 vmovss %xmm1,0x8(%rax)
6a2: 48 8d 65 d8 lea -0x28(%rbp),%rsp
6a6: 5b pop %rbx
6a7: 41 5c pop %r12
6a9: 41 5d pop %r13
6ab: 41 5e pop %r14
6ad: 41 5f pop %r15
6af: 5d pop %rbp
6b0: c3 ret
Host-Compute (ARM CPU)
Note that the compiler would usually directly output a .bin file (ELF format). The output below comes from disassembling it with objdump -d.
Also note that this has been compiled for the arm-5 target (ARMv8.4-A + FP16, e.g. Apple M1/A14).
nbody_aarch64.bin: file format elf64-littleaarch64
Disassembly of section .text:
0000000000000000 <simplified_nbody>:
0: ff c3 04 d1 sub sp, sp, #304
4: 08 00 00 90 adrp x8, 0x0 <simplified_nbody+0x4>
8: ed 33 0a 6d stp d13, d12, [sp, #160]
c: eb 2b 0b 6d stp d11, d10, [sp, #176]
10: e9 23 0c 6d stp d9, d8, [sp, #192]
14: fd 7b 0d a9 stp x29, x30, [sp, #208]
18: fd 43 03 91 add x29, sp, #208
1c: fc 6f 0e a9 stp x28, x27, [sp, #224]
20: fa 67 0f a9 stp x26, x25, [sp, #240]
24: f8 5f 10 a9 stp x24, x23, [sp, #256]
28: f6 57 11 a9 stp x22, x21, [sp, #272]
2c: f4 4f 12 a9 stp x20, x19, [sp, #288]
30: 08 01 40 f9 ldr x8, [x8]
34: 17 01 40 b9 ldr w23, [x8]
38: 88 01 80 52 mov w8, #12
3c: f6 0a 08 9b madd x22, x23, x8, x2
40: 08 00 00 90 adrp x8, 0x0 <simplified_nbody+0x40>
44: f8 03 16 aa mov x24, x22
48: 08 01 40 f9 ldr x8, [x8]
4c: c8 02 40 fd ldr d8, [x22]
50: 09 8f 40 bc ldr s9, [x24, #8]!
54: 19 01 40 b9 ldr w25, [x8]
58: 79 0d 00 34 cbz w25, 0x204 <simplified_nbody+0x204>
5c: 03 e4 00 2f movi d3, #0000000000000000
60: 08 10 17 8b add x8, x0, x23, lsl #4
64: 1c 00 00 90 adrp x28, 0x0 <simplified_nbody+0x64>
68: e3 07 00 a9 stp x3, x1, [sp]
6c: 13 00 00 90 adrp x19, 0x0 <simplified_nbody+0x6c>
70: 14 00 00 90 adrp x20, 0x0 <simplified_nbody+0x70>
74: f5 03 00 aa mov x21, x0
78: fa 03 1f 2a mov w26, wzr
7c: 9c 03 40 f9 ldr x28, [x28]
80: a3 03 9a 3c stur q3, [x29, #-96]
84: 00 05 40 2d ldp s0, s1, [x8]
88: 02 09 40 bd ldr s2, [x8, #8]
8c: e8 e2 96 52 mov w8, #46871
90: 03 f6 03 4f fmov v3.4s, #1.00000000
94: 28 1a a7 72 movk w8, #14545, lsl #16
98: fb 03 1f 2a mov w27, wzr
9c: e3 17 80 3d str q3, [sp, #80]
a0: 03 04 04 4e dup v3.4s, v0.s[0]
a4: 00 0d 04 4e dup v0.4s, w8
a8: 73 02 40 f9 ldr x19, [x19]
ac: e0 8f 01 ad stp q0, q3, [sp, #48]
b0: 00 e4 00 2f movi d0, #0000000000000000
b4: 23 04 04 4e dup v3.4s, v1.s[0]
b8: e0 1b 80 3d str q0, [sp, #96]
bc: 00 e4 00 2f movi d0, #0000000000000000
c0: a0 03 9b 3c stur q0, [x29, #-80]
c4: 40 04 04 4e dup v0.4s, v2.s[0]
c8: 94 02 40 f9 ldr x20, [x20]
cc: e0 8f 00 ad stp q0, q3, [sp, #16]
d0: 88 03 40 b9 ldr w8, [x28]
d4: 09 21 1b 0b add w9, w8, w27, lsl #8
d8: a0 5a e9 3c ldr q0, [x21, w9, uxtw #4]
dc: 60 7a a8 3c str q0, [x19, x8, lsl #4]
e0: 80 02 3f d6 blr x20
e4: 00 e4 00 6f movi v0.2d, #0000000000000000
e8: a4 03 da 3c ldur q4, [x29, #-96]
ec: 02 e4 00 6f movi v2.2d, #0000000000000000
f0: e8 03 1f aa mov x8, xzr
f4: 03 e4 00 6f movi v3.2d, #0000000000000000
f8: eb ab 41 ad ldp q11, q10, [sp, #48]
fc: 80 04 04 6e mov v0.s[0], v4.s[0]
100: 01 e4 00 6f movi v1.2d, #0000000000000000
104: 05 e4 00 6f movi v5.2d, #0000000000000000
108: ff 93 42 ad ldp q31, q4, [sp, #80]
10c: 82 04 04 6e mov v2.s[0], v4.s[0]
110: a4 03 db 3c ldur q4, [x29, #-80]
114: ed b3 40 ad ldp q13, q12, [sp, #16]
118: 83 04 04 6e mov v3.s[0], v4.s[0]
11c: 04 e4 00 6f movi v4.2d, #0000000000000000
120: 69 02 08 8b add x9, x19, x8
124: 08 01 02 91 add x8, x8, #128
128: 67 1d ab 4e mov v7.16b, v11.16b
12c: 1f 05 40 f1 cmp x8, #1, lsl #12
130: 7b 1d ab 4e mov v27.16b, v11.16b
134: 30 09 df 4c ld4 { v16.4s, v17.4s, v18.4s, v19.4s }, [x9], #64
138: 06 d6 aa 4e fsub v6.4s, v16.4s, v10.4s
13c: 38 d6 ac 4e fsub v24.4s, v17.4s, v12.4s
140: 59 d6 ad 4e fsub v25.4s, v18.4s, v13.4s
144: c7 cc 26 4e fmla v7.4s, v6.4s, v6.4s
148: 07 cf 38 4e fmla v7.4s, v24.4s, v24.4s
14c: 34 09 40 4c ld4 { v20.4s, v21.4s, v22.4s, v23.4s }, [x9]
150: 27 cf 39 4e fmla v7.4s, v25.4s, v25.4s
154: e7 f8 a1 6e fsqrt v7.4s, v7.4s
158: 9a d6 aa 4e fsub v26.4s, v20.4s, v10.4s
15c: bc d6 ac 4e fsub v28.4s, v21.4s, v12.4s
160: dd d6 ad 4e fsub v29.4s, v22.4s, v13.4s
164: 5b cf 3a 4e fmla v27.4s, v26.4s, v26.4s
168: e7 ff 27 6e fdiv v7.4s, v31.4s, v7.4s
16c: 9b cf 3c 4e fmla v27.4s, v28.4s, v28.4s
170: bb cf 3d 4e fmla v27.4s, v29.4s, v29.4s
174: 7b fb a1 6e fsqrt v27.4s, v27.4s
178: fe dc 27 6e fmul v30.4s, v7.4s, v7.4s
17c: 67 de 27 6e fmul v7.4s, v19.4s, v7.4s
180: c7 df 27 6e fmul v7.4s, v30.4s, v7.4s
184: fb ff 3b 6e fdiv v27.4s, v31.4s, v27.4s
188: e3 cc 26 4e fmla v3.4s, v7.4s, v6.4s
18c: e2 cc 38 4e fmla v2.4s, v7.4s, v24.4s
190: e0 cc 39 4e fmla v0.4s, v7.4s, v25.4s
194: 70 df 3b 6e fmul v16.4s, v27.4s, v27.4s
198: f1 de 3b 6e fmul v17.4s, v23.4s, v27.4s
19c: 10 de 31 6e fmul v16.4s, v16.4s, v17.4s
1a0: 05 ce 3a 4e fmla v5.4s, v16.4s, v26.4s
1a4: 04 ce 3c 4e fmla v4.4s, v16.4s, v28.4s
1a8: 01 ce 3d 4e fmla v1.4s, v16.4s, v29.4s
1ac: a1 fb ff 54 b.ne 0x120 <simplified_nbody+0x120>
1b0: a3 d4 23 4e fadd v3.4s, v5.4s, v3.4s
1b4: 20 d4 20 4e fadd v0.4s, v1.4s, v0.4s
1b8: 82 d4 22 4e fadd v2.4s, v4.4s, v2.4s
1bc: 61 d4 20 6e faddp v1.4s, v3.4s, v0.4s
1c0: 42 d4 20 6e faddp v2.4s, v2.4s, v0.4s
1c4: 00 d4 20 6e faddp v0.4s, v0.4s, v0.4s
1c8: 21 d8 30 7e faddp s1, v1.2s
1cc: 00 d8 30 7e faddp s0, v0.2s
1d0: a0 07 3d ad stp q0, q1, [x29, #-96]
1d4: 41 d8 30 7e faddp s1, v2.2s
1d8: e1 1b 80 3d str q1, [sp, #96]
1dc: 80 02 3f d6 blr x20
1e0: 5a 03 04 11 add w26, w26, #256
1e4: 7b 07 00 11 add w27, w27, #1
1e8: 5f 03 19 6b cmp w26, w25
1ec: 23 f7 ff 54 b.lo 0xd0 <simplified_nbody+0xd0>
1f0: a2 07 7d ad ldp q2, q1, [x29, #-96]
1f4: e0 1b c0 3d ldr q0, [sp, #96]
1f8: e3 07 40 a9 ldp x3, x1, [sp]
1fc: 01 04 0c 6e mov v1.s[1], v0.s[0]
200: 03 00 00 14 b 0x20c <simplified_nbody+0x20c>
204: 01 e4 00 2f movi d1, #0000000000000000
208: 02 e4 00 2f movi d2, #0000000000000000
20c: e8 ce 97 52 mov w8, #48759
210: 60 00 40 bd ldr s0, [x3]
214: e8 ef a7 72 movk w8, #16255, lsl #16
218: 29 10 17 8b add x9, x1, x23, lsl #4
21c: f4 4f 52 a9 ldp x20, x19, [sp, #288]
220: 28 10 80 0f fmla v8.2s, v1.2s, v0.s[0]
224: 02 24 02 1f fmadd s2, s0, s2, s9
228: 01 0d 04 0e dup v1.2s, w8
22c: 03 01 27 1e fmov s3, w8
230: 24 01 40 fd ldr d4, [x9]
234: fa 67 4f a9 ldp x26, x25, [sp, #240]
238: 42 08 23 1e fmul s2, s2, s3
23c: 01 dd 21 2e fmul v1.2s, v8.2s, v1.2s
240: 23 09 40 bd ldr s3, [x9, #8]
244: fc 6f 4e a9 ldp x28, x27, [sp, #224]
248: 02 03 00 bd str s2, [x24]
24c: 24 10 80 0f fmla v4.2s, v1.2s, v0.s[0]
250: 40 0c 00 1f fmadd s0, s2, s0, s3
254: c1 02 00 fd str d1, [x22]
258: f6 57 51 a9 ldp x22, x21, [sp, #272]
25c: f8 5f 50 a9 ldp x24, x23, [sp, #256]
260: 24 01 00 fd str d4, [x9]
264: fd 7b 4d a9 ldp x29, x30, [sp, #208]
268: 20 09 00 bd str s0, [x9, #8]
26c: e9 23 4c 6d ldp d9, d8, [sp, #192]
270: eb 2b 4b 6d ldp d11, d10, [sp, #176]
274: ed 33 4a 6d ldp d13, d12, [sp, #160]
278: ff c3 04 91 add sp, sp, #304
27c: c0 03 5f d6 ret
Metal / AIR
Note that the compiler would usually directly output a .metallib file. The output below comes from disassembling it with metallib-dis (provided by the toolchain).
; ModuleID = 'bc_module'
source_filename = "simplified_nbody.cpp"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
target triple = "air64-apple-macosx11.0.0"
%class.vector4 = type { %union.anon }
%union.anon = type { %struct.anon }
%struct.anon = type { float, float, float, float }
%class.vector3 = type { %union.anon.8 }
%union.anon.8 = type { %struct.anon.9 }
%struct.anon.9 = type { float, float, float }
@_ZZ16simplified_nbodyE20local_body_positions = internal addrspace(3) unnamed_addr global [256 x %class.vector4] undef, align 16
; Function Attrs: convergent nounwind
define void @simplified_nbody(%class.vector4 addrspace(1)* noalias nocapture readonly, %class.vector4 addrspace(1)* noalias nocapture, %class.vector3 addrspace(1)* noalias nocapture, float addrspace(2)* noalias nocapture readonly dereferenceable(4), <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, i32, i32, i32, i32) local_unnamed_addr #0 !reqd_work_group_size !33 {
%15 = extractelement <3 x i32> %4, i32 0
%16 = zext i32 %15 to i64
%17 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %16, i32 0, i32 0, i32 0
%18 = load float, float addrspace(1)* %17, align 4
%19 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %16, i32 0, i32 0, i32 1
%20 = load float, float addrspace(1)* %19, align 4
%21 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %16, i32 0, i32 0, i32 2
%22 = load float, float addrspace(1)* %21, align 4
%23 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %16, i32 0, i32 0, i32 0
%24 = load float, float addrspace(1)* %23, align 4
%25 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %16, i32 0, i32 0, i32 1
%26 = load float, float addrspace(1)* %25, align 4
%27 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %16, i32 0, i32 0, i32 2
%28 = load float, float addrspace(1)* %27, align 4
%29 = extractelement <3 x i32> %5, i32 0
%30 = extractelement <3 x i32> %6, i32 0
%31 = zext i32 %30 to i64
%32 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %31, i32 0, i32 0, i32 0
%33 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %31, i32 0, i32 0, i32 1
%34 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %31, i32 0, i32 0, i32 2
%35 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %31, i32 0, i32 0, i32 3
br label %59
; <label>:36: ; preds = %76
%37 = load float, float addrspace(2)* %3, align 4
%38 = fmul fast float %37, %110
%39 = fmul fast float %37, %111
%40 = fmul fast float %37, %112
%41 = fadd fast float %38, %24
%42 = fadd fast float %39, %26
%43 = fadd fast float %40, %28
%44 = fmul fast float %41, 0x3FEFF7CEE0000000
%45 = fmul fast float %42, 0x3FEFF7CEE0000000
%46 = fmul fast float %43, 0x3FEFF7CEE0000000
%47 = fmul fast float %44, %37
%48 = fmul fast float %45, %37
%49 = fmul fast float %46, %37
%50 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %16, i32 0, i32 0, i32 0
%51 = load float, float addrspace(1)* %50, align 4, !tbaa !34
%52 = fadd fast float %47, %51
store float %52, float addrspace(1)* %50, align 4, !tbaa !34
%53 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %16, i32 0, i32 0, i32 1
%54 = load float, float addrspace(1)* %53, align 4, !tbaa !34
%55 = fadd fast float %48, %54
store float %55, float addrspace(1)* %53, align 4, !tbaa !34
%56 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %16, i32 0, i32 0, i32 2
%57 = load float, float addrspace(1)* %56, align 4, !tbaa !34
%58 = fadd fast float %57, %49
store float %58, float addrspace(1)* %56, align 4, !tbaa !34
store float %44, float addrspace(1)* %23, align 4, !tbaa !34
store float %45, float addrspace(1)* %25, align 4, !tbaa !34
store float %46, float addrspace(1)* %27, align 4, !tbaa !34
ret void
; <label>:59: ; preds = %76, %14
%60 = phi i32 [ 0, %14 ], [ %77, %76 ]
%61 = phi i32 [ 0, %14 ], [ %78, %76 ]
%62 = phi float [ 0.000000e+00, %14 ], [ %112, %76 ]
%63 = phi float [ 0.000000e+00, %14 ], [ %111, %76 ]
%64 = phi float [ 0.000000e+00, %14 ], [ %110, %76 ]
%65 = shl i32 %61, 8
%66 = add i32 %30, %65
%67 = zext i32 %66 to i64
%68 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %67, i32 0, i32 0, i32 0
%69 = load float, float addrspace(1)* %68, align 4
%70 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %67, i32 0, i32 0, i32 1
%71 = load float, float addrspace(1)* %70, align 4
%72 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %67, i32 0, i32 0, i32 2
%73 = load float, float addrspace(1)* %72, align 4
%74 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %67, i32 0, i32 0, i32 3
%75 = load float, float addrspace(1)* %74, align 4
store float %69, float addrspace(3)* %32, align 4, !tbaa !34
store float %71, float addrspace(3)* %33, align 4, !tbaa !34
store float %73, float addrspace(3)* %34, align 4, !tbaa !34
store float %75, float addrspace(3)* %35, align 4, !tbaa !34
tail call void @air.wg.barrier(i32 2, i32 1) #3
br label %80
; <label>:76: ; preds = %80
tail call void @air.wg.barrier(i32 2, i32 1) #3
%77 = add i32 %60, 256
%78 = add i32 %61, 1
%79 = icmp ult i32 %77, %29
br i1 %79, label %59, label %36
; <label>:80: ; preds = %80, %59
%81 = phi i32 [ 0, %59 ], [ %113, %80 ]
%82 = phi float [ %62, %59 ], [ %112, %80 ]
%83 = phi float [ %63, %59 ], [ %111, %80 ]
%84 = phi float [ %64, %59 ], [ %110, %80 ]
%85 = zext i32 %81 to i64
%86 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %85, i32 0, i32 0, i32 0
%87 = load float, float addrspace(3)* %86, align 4
%88 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %85, i32 0, i32 0, i32 1
%89 = load float, float addrspace(3)* %88, align 4
%90 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %85, i32 0, i32 0, i32 2
%91 = load float, float addrspace(3)* %90, align 4
%92 = fsub fast float %87, %18
%93 = fsub fast float %89, %20
%94 = fsub fast float %91, %22
%95 = fmul fast float %92, %92
%96 = fmul fast float %93, %93
%97 = fmul fast float %94, %94
%98 = fadd fast float %95, 0x3F1A36E2E0000000
%99 = fadd fast float %98, %96
%100 = fadd fast float %99, %97
%101 = tail call fast float @air.fast_rsqrt.f32(float %100) #4
%102 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %85, i32 0, i32 0, i32 3
%103 = load float, float addrspace(3)* %102, align 4, !tbaa !34
%104 = fmul fast float %101, %101
%105 = fmul fast float %104, %101
%106 = fmul fast float %105, %103
%107 = fmul fast float %106, %92
%108 = fmul fast float %106, %93
%109 = fmul fast float %106, %94
%110 = fadd fast float %107, %84
%111 = fadd fast float %108, %83
%112 = fadd fast float %109, %82
%113 = add nuw nsw i32 %81, 1
%114 = icmp eq i32 %113, 256
br i1 %114, label %76, label %80
}
; Function Attrs: convergent nounwind readnone
declare float @air.fast_rsqrt.f32(float) local_unnamed_addr #1
; Function Attrs: convergent noduplicate
declare void @air.wg.barrier(i32, i32) local_unnamed_addr #2
attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="true" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #1 = { convergent nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="true" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #2 = { convergent noduplicate "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="true" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #3 = { convergent noduplicate nounwind }
attributes #4 = { convergent nounwind readnone }
!air.kernel = !{!0}
!air.version = !{!18}
!air.language_version = !{!19}
!air.compile_options = !{!20, !21, !22}
!air.source_file_name = !{!23}
!llvm.module.flags = !{!24, !25, !26, !27, !28, !29, !30, !31}
!llvm.ident = !{!32}
!0 = !{void (%class.vector4 addrspace(1)*, %class.vector4 addrspace(1)*, %class.vector3 addrspace(1)*, float addrspace(2)*, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, i32, i32, i32, i32)* @simplified_nbody, !1, !2, !17}
!1 = !{}
!2 = !{!3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16}
!3 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.read", !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 16, !"air.arg_type_name", !"const vector4<float>", !"air.arg_name", !"in_positions"}
!4 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1, !"air.read_write", !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 16, !"air.arg_type_name", !"vector4<float>", !"air.arg_name", !"out_positions"}
!5 = !{i32 2, !"air.buffer", !"air.location_index", i32 2, i32 1, !"air.read_write", !"air.arg_type_size", i32 12, !"air.arg_type_align_size", i32 12, !"air.arg_type_name", !"vector3<float>", !"air.arg_name", !"inout_velocities"}
!6 = !{i32 3, !"air.buffer", !"air.buffer_size", i32 4, !"air.location_index", i32 3, i32 1, !"air.read", !"air.arg_type_size", i32 4, !"air.arg_type_align_size", i32 4, !"air.arg_type_name", !"float", !"air.arg_name", !"time_delta"}
!7 = !{i32 4, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__global_id__"}
!8 = !{i32 5, !"air.threads_per_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__global_size__"}
!9 = !{i32 6, !"air.thread_position_in_threadgroup", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__local_id__"}
!10 = !{i32 7, !"air.threads_per_threadgroup", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__local_size__"}
!11 = !{i32 8, !"air.threadgroup_position_in_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__group_id__"}
!12 = !{i32 9, !"air.threadgroups_per_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__group_size__"}
!13 = !{i32 10, !"air.simdgroup_index_in_threadgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__sub_group_id__"}
!14 = !{i32 11, !"air.thread_index_in_simdgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__sub_group_local_id__"}
!15 = !{i32 12, !"air.threads_per_simdgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__sub_group_size__"}
!16 = !{i32 13, !"air.simdgroups_per_threadgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__num_sub_groups__"}
!17 = !{!"air.max_work_group_size", i32 256}
!18 = !{i32 2, i32 3, i32 0}
!19 = !{!"Metal", i32 2, i32 3, i32 0}
!20 = !{!"air.compile.denorms_disable"}
!21 = !{!"air.compile.fast_math_enable"}
!22 = !{!"air.compile.framebuffer_fetch_enable"}
!23 = !{!"simplified_nbody.cpp"}
!24 = !{i32 7, !"air.max_device_buffers", i32 31}
!25 = !{i32 7, !"air.max_constant_buffers", i32 31}
!26 = !{i32 7, !"air.max_threadgroup_buffers", i32 31}
!27 = !{i32 7, !"air.max_textures", i32 128}
!28 = !{i32 7, !"air.max_read_write_textures", i32 8}
!29 = !{i32 7, !"air.max_samplers", i32 16}
!30 = !{i32 1, !"wchar_size", i32 4}
!31 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 0]}
!32 = !{!"Apple LLVM version 31001.143 (metalfe-31001.143)"}
!33 = !{i32 256, i32 1, i32 1}
!34 = !{!35, !35, i64 0}
!35 = !{!"omnipotent char", !36, i64 0}
!36 = !{!"Simple C++ TBAA"}
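For comparison with the kernel metadata above (!3 through !16): the buffer arguments and the implicit work-item values appended by floor correspond to attribute-tagged parameters in a hand-written Metal Shading Language kernel. The declaration below is only an illustrative reading aid for the metadata, not something floor emits:

// illustrative MSL equivalent of the entry point signature described by the AIR metadata above
#include <metal_stdlib>
using namespace metal;

kernel void simplified_nbody(device const float4* in_positions       [[buffer(0)]],
                             device float4* out_positions            [[buffer(1)]],
                             device packed_float3* inout_velocities  [[buffer(2)]],
                             constant float& time_delta              [[buffer(3)]],
                             uint3 global_id         [[thread_position_in_grid]],
                             uint3 global_size       [[threads_per_grid]],
                             uint3 local_id          [[thread_position_in_threadgroup]],
                             uint3 local_size        [[threads_per_threadgroup]],
                             uint3 group_id          [[threadgroup_position_in_grid]],
                             uint3 group_size        [[threadgroups_per_grid]],
                             uint sub_group_id       [[simdgroup_index_in_threadgroup]],
                             uint sub_group_local_id [[thread_index_in_simdgroup]],
                             uint sub_group_size     [[threads_per_simdgroup]],
                             uint num_sub_groups     [[simdgroups_per_threadgroup]]) {
    // body as in the unified C++ source above
}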
OpenCL / SPIR
Note that the compiler would usually directly output a .bc file. The output below comes from disassembling it with llvm-dis (provided by the toolchain). Also note that the bitcode file is exported in an LLVM 3.2 / SPIR 1.2 compatible format, but the output below uses LLVM 8.0 syntax.
; ModuleID = 'spir.bc'
source_filename = "spir.bc"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
target triple = "spir64-unknown-unknown"
%class.vector4 = type { %union.anon }
%union.anon = type { %struct.anon }
%struct.anon = type { float, float, float, float }
%class.vector3 = type { %union.anon.8 }
%union.anon.8 = type { %struct.anon.9 }
%struct.anon.9 = type { float, float, float }
@simplified_nbody.local_body_positions = internal unnamed_addr addrspace(3) global [256 x %class.vector4] zeroinitializer, align 4
define floor_kernel void @simplified_nbody(%class.vector4 addrspace(1)*, %class.vector4 addrspace(1)*, %class.vector3 addrspace(1)*, float) {
%5 = tail call floor_func i64 @_Z13get_global_idj(i32 0), !range !13
%6 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %5, i32 0, i32 0, i32 0
%7 = load float, float addrspace(1)* %6, align 4
%8 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %5, i32 0, i32 0, i32 1
%9 = load float, float addrspace(1)* %8, align 4
%10 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %5, i32 0, i32 0, i32 2
%11 = load float, float addrspace(1)* %10, align 4
%12 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %5, i32 0, i32 0, i32 0
%13 = load float, float addrspace(1)* %12, align 4
%14 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %5, i32 0, i32 0, i32 1
%15 = load float, float addrspace(1)* %14, align 4
%16 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %5, i32 0, i32 0, i32 2
%17 = load float, float addrspace(1)* %16, align 4
%18 = tail call floor_func i64 @_Z15get_global_sizej(i32 0), !range !14
%19 = trunc i64 %18 to i32, !range !15
%20 = tail call floor_func i64 @_Z12get_local_idj(i32 0), !range !16
%21 = trunc i64 %20 to i32, !range !17
%22 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 0
%23 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 1
%24 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 2
%25 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 3
br label %48
; <label>:26: ; preds = %65
%27 = fmul float %98, %3
%28 = fmul float %99, %3
%29 = fmul float %100, %3
%30 = fadd float %27, %13
%31 = fadd float %28, %15
%32 = fadd float %29, %17
%33 = fmul float %30, 0x3FEFF7CEE0000000
%34 = fmul float %31, 0x3FEFF7CEE0000000
%35 = fmul float %32, 0x3FEFF7CEE0000000
%36 = fmul float %33, %3
%37 = fmul float %34, %3
%38 = fmul float %35, %3
%39 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %5, i32 0, i32 0, i32 0
%40 = load float, float addrspace(1)* %39, align 4, !tbaa !18
%41 = fadd float %40, %36
store float %41, float addrspace(1)* %39, align 4, !tbaa !18
%42 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %5, i32 0, i32 0, i32 1
%43 = load float, float addrspace(1)* %42, align 4, !tbaa !18
%44 = fadd float %43, %37
store float %44, float addrspace(1)* %42, align 4, !tbaa !18
%45 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %5, i32 0, i32 0, i32 2
%46 = load float, float addrspace(1)* %45, align 4, !tbaa !18
%47 = fadd float %46, %38
store float %47, float addrspace(1)* %45, align 4, !tbaa !18
store float %33, float addrspace(1)* %12, align 4, !tbaa !18
store float %34, float addrspace(1)* %14, align 4, !tbaa !18
store float %35, float addrspace(1)* %16, align 4, !tbaa !18
ret void
; <label>:48: ; preds = %65, %4
%49 = phi i32 [ 0, %4 ], [ %66, %65 ]
%50 = phi i32 [ 0, %4 ], [ %67, %65 ]
%51 = phi float [ 0.000000e+00, %4 ], [ %100, %65 ]
%52 = phi float [ 0.000000e+00, %4 ], [ %99, %65 ]
%53 = phi float [ 0.000000e+00, %4 ], [ %98, %65 ]
%54 = shl i32 %50, 8
%55 = add i32 %54, %21
%56 = zext i32 %55 to i64
%57 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 0
%58 = load float, float addrspace(1)* %57, align 4
%59 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 1
%60 = load float, float addrspace(1)* %59, align 4
%61 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 2
%62 = load float, float addrspace(1)* %61, align 4
%63 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 3
%64 = load float, float addrspace(1)* %63, align 4
store float %58, float addrspace(3)* %22, align 4, !tbaa !18
store float %60, float addrspace(3)* %23, align 4, !tbaa !18
store float %62, float addrspace(3)* %24, align 4, !tbaa !18
store float %64, float addrspace(3)* %25, align 4, !tbaa !18
tail call floor_func void @_Z7barrierj(i32 1)
br label %69
; <label>:65: ; preds = %69
tail call floor_func void @_Z7barrierj(i32 1)
%66 = add i32 %49, 256
%67 = add i32 %50, 1
%68 = icmp ult i32 %66, %19
br i1 %68, label %48, label %26
; <label>:69: ; preds = %69, %48
%70 = phi i64 [ 0, %48 ], [ %101, %69 ]
%71 = phi float [ %51, %48 ], [ %100, %69 ]
%72 = phi float [ %52, %48 ], [ %99, %69 ]
%73 = phi float [ %53, %48 ], [ %98, %69 ]
%74 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 0
%75 = load float, float addrspace(3)* %74, align 4
%76 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 1
%77 = load float, float addrspace(3)* %76, align 4
%78 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 2
%79 = load float, float addrspace(3)* %78, align 4
%80 = fsub float %75, %7
%81 = fsub float %77, %9
%82 = fsub float %79, %11
%83 = fmul float %80, %80
%84 = fmul float %81, %81
%85 = fmul float %82, %82
%86 = fadd float %83, 0x3F1A36E2E0000000
%87 = fadd float %86, %84
%88 = fadd float %87, %85
%89 = tail call floor_func float @_Z5rsqrtf(float %88)
%90 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 3
%91 = load float, float addrspace(3)* %90, align 4, !tbaa !18
%92 = fmul float %89, %89
%93 = fmul float %92, %89
%94 = fmul float %93, %91
%95 = fmul float %94, %80
%96 = fmul float %94, %81
%97 = fmul float %94, %82
%98 = fadd float %95, %73
%99 = fadd float %96, %72
%100 = fadd float %97, %71
%101 = add nuw nsw i64 %70, 1
%102 = icmp eq i64 %101, 256
br i1 %102, label %65, label %69
}
declare floor_func i64 @_Z13get_global_idj(i32)
declare floor_func i64 @_Z15get_global_sizej(i32)
declare floor_func i64 @_Z12get_local_idj(i32)
declare floor_func float @_Z5rsqrtf(float)
declare floor_func void @_Z7barrierj(i32)
!opencl.kernels = !{!0}
!llvm.module.flags = !{!7}
!opencl.ocl.version = !{!8}
!opencl.spir.version = !{!8}
!opencl.enable.FP_CONTRACT = !{}
!opencl.used.extensions = !{!9}
!opencl.used.optional.core.features = !{!10}
!opencl.compiler.options = !{!11}
!llvm.ident = !{!12}
!0 = !{void (%class.vector4 addrspace(1)*, %class.vector4 addrspace(1)*, %class.vector3 addrspace(1)*, float)* @simplified_nbody, !1, !2, !3, !4, !5, !6}
!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 0}
!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"}
!3 = !{!"kernel_arg_type", !"compute_global_buffer<const float4>", !"compute_global_buffer<float4>", !"compute_global_buffer<float3>", !"param<float>"}
!4 = !{!"kernel_arg_base_type", !"struct __class vector4<float>*", !"struct __class vector4<float>*", !"struct __class vector3<float>*", !"float"}
!5 = !{!"kernel_arg_type_qual", !"restrict const", !"restrict", !"restrict", !"const"}
!6 = !{!"kernel_arg_name", !"in_positions", !"out_positions", !"inout_velocities", !"time_delta"}
!7 = !{i32 1, !"wchar_size", i32 4}
!8 = !{i32 1, i32 2}
!9 = !{!"cl_khr_byte_addressable_store", !"cl_khr_fp16", !"cl_khr_fp64", !"cl_khr_global_int32_base_atomics", !"cl_khr_global_int32_extended_atomics", !"cl_khr_local_int32_base_atomics", !"cl_khr_local_int32_extended_atomics", !"cl_khr_gl_msaa_sharing", !"cl_khr_mipmap_image", !"cl_khr_mipmap_image_writes"}
!10 = !{!"cl_doubles"}
!11 = !{!"-cl-kernel-arg-info", !"-cl-mad-enable", !"-cl-denorms-are-zero", !"-cl-unsafe-math-optimizations"}
!12 = !{!"clang version 8.0.0 (ssh://a2git/clang_bleeding_edge.git c39607838f2b421540b8e9ddf71e03101218afc2) (ssh://a2git/llvm_bleeding_edge.git 27830df56091d37ab3a605462417856d2d382d6d)"}
!13 = !{i64 0, i64 4294967295}
!14 = !{i64 1, i64 4294967295}
!15 = !{i32 1, i32 -1}
!16 = !{i64 0, i64 2048}
!17 = !{i32 0, i32 2048}
!18 = !{!19, !19, i64 0}
!19 = !{!"omnipotent char", !20, i64 0}
!20 = !{!"Simple C++ TBAA"}
OpenCL / SPIR-V
Note that the compiler would usually directly output a .spvc file (a simple container format for multiple SPIR-V binaries). The output below comes from disassembling it with spirv-dis (provided by the toolchain).
Also note that the output below has been generated with extended readability (--debug-asm).
; SPIR-V
; Version: 1.0
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 158
; Schema: 0
Capability Addresses
Capability Linkage
Capability Kernel
Capability Float16
Capability Int64
Capability ImageBasic
Capability ImageMipmap
Extension "cl_khr_3d_image_writes"
Extension "cl_khr_byte_addressable_store"
Extension "cl_khr_fp16"
Extension "cl_khr_fp64"
Extension "cl_khr_gl_msaa_sharing"
Extension "cl_khr_global_int32_base_atomics"
Extension "cl_khr_global_int32_extended_atomics"
Extension "cl_khr_local_int32_base_atomics"
Extension "cl_khr_local_int32_extended_atomics"
Extension "cl_khr_mipmap_image"
Extension "cl_khr_mipmap_image_writes"
%1 = ExtInstImport "OpenCL.std"
MemoryModel Physical64 OpenCL
EntryPoint Kernel %simplified_nbody "simplified_nbody" %simplified_nbody.local_body_positions
Source OpenCL_CPP 202000
Decorate %in_positions FuncParamAttr NoAlias
Decorate %out_positions FuncParamAttr NoAlias
Decorate %inout_velocities FuncParamAttr NoAlias
Decorate %in_positions FuncParamAttr NoCapture
Decorate %out_positions FuncParamAttr NoCapture
Decorate %inout_velocities FuncParamAttr NoCapture
Decorate %in_positions FuncParamAttr NoWrite
Decorate %time_delta FuncParamAttr NoWrite
Decorate %_Z7barrierj LinkageAttributes "_Z7barrierj" Import
Decorate %_Z5rsqrtf LinkageAttributes "_Z5rsqrtf" Import
Decorate %_Z12get_local_idj LinkageAttributes "_Z12get_local_idj" Import
Decorate %_Z13get_global_idj LinkageAttributes "_Z13get_global_idj" Import
Decorate %_Z15get_global_sizej LinkageAttributes "_Z15get_global_sizej" Import
Decorate %simplified_nbody.local_body_positions Alignment 4
%ulong = TypeInt 64 0
%uint = TypeInt 32 0
%256ul = Constant %ulong 256
%0u = Constant %uint 0
%1u = Constant %uint 1
%2u = Constant %uint 2
%0ul = Constant %ulong 0
%3u = Constant %uint 3
%8u = Constant %uint 8
%1ul = Constant %ulong 1
%256u = Constant %uint 256
%float = TypeFloat 32
%struct.anon = TypeStruct %float %float %float %float
%union.anon = TypeStruct %struct.anon
%class.vector4 = TypeStruct %union.anon
%class.vector4[256ul] = TypeArray %class.vector4 %256ul
%(Workgroup)class.vector4[256ul]* = TypePointer Workgroup %class.vector4[256ul]
%ulong(#1) = TypeFunction %ulong %uint
%float(#1) = TypeFunction %float %float
%void = TypeVoid
%void(#1) = TypeFunction %void %uint
%(CrossWorkgroup)class.vector4* = TypePointer CrossWorkgroup %class.vector4
%struct.anon.9 = TypeStruct %float %float %float
%union.anon.8 = TypeStruct %struct.anon.9
%class.vector3 = TypeStruct %union.anon.8
%(CrossWorkgroup)class.vector3* = TypePointer CrossWorkgroup %class.vector3
%void(#4) = TypeFunction %void %(CrossWorkgroup)class.vector4* %(CrossWorkgroup)class.vector4* %(CrossWorkgroup)class.vector3* %float
%(CrossWorkgroup)float* = TypePointer CrossWorkgroup %float
%(Workgroup)float* = TypePointer Workgroup %float
%bool = TypeBool
%0.0f = Constant %float 0
%9.99999975e-05f = Constant %float 9.99999975e-05
%0.999000013f = Constant %float 0.999000013
%simplified_nbody.local_body_positions = Variable %(Workgroup)class.vector4[256ul]* Workgroup
function ulong _Z13get_global_idj ( %ulong(#1) ) Pure {
%14 = FunctionParameter %uint
}
function ulong _Z15get_global_sizej ( %ulong(#1) ) Pure {
%16 = FunctionParameter %uint
}
function ulong _Z12get_local_idj ( %ulong(#1) ) Pure {
%18 = FunctionParameter %uint
}
function float _Z5rsqrtf ( %float(#1) ) Pure {
%21 = FunctionParameter %float
}
function void _Z7barrierj ( %void(#1) ) {
%25 = FunctionParameter %uint
}
function void simplified_nbody ( %void(#4) ) {
%in_positions = FunctionParameter %(CrossWorkgroup)class.vector4*
%out_positions = FunctionParameter %(CrossWorkgroup)class.vector4*
%inout_velocities = FunctionParameter %(CrossWorkgroup)class.vector3*
%time_delta = FunctionParameter %float
37:
%39 = FunctionCall %ulong %_Z13get_global_idj %0u
%41 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %39 %0u %0u %0u
%42 = Load %float %41 Aligned 4
%44 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %39 %0u %0u %1u
%45 = Load %float %44 Aligned 4
%47 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %39 %0u %0u %2u
%48 = Load %float %47 Aligned 4
%49 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %inout_velocities %39 %0u %0u %0u
%50 = Load %float %49 Aligned 4
%51 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %inout_velocities %39 %0u %0u %1u
%52 = Load %float %51 Aligned 4
%53 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %inout_velocities %39 %0u %0u %2u
%54 = Load %float %53 Aligned 4
%55 = FunctionCall %ulong %_Z15get_global_sizej %0u
%56 = UConvert %uint %55
%57 = FunctionCall %ulong %_Z12get_local_idj %0u
%58 = UConvert %uint %57
%61 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %57 %0u %0u %0u
%62 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %57 %0u %0u %1u
%63 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %57 %0u %0u %2u
%65 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %57 %0u %0u %3u
Branch %66
66:
%69 = Phi %uint ( %0u <- %37, %67 <- %68 )
%71 = Phi %uint ( %0u <- %37, %70 <- %68 )
%74 = Phi %float ( %0.0f <- %37, %73 <- %68 )
%76 = Phi %float ( %0.0f <- %37, %75 <- %68 )
%78 = Phi %float ( %0.0f <- %37, %77 <- %68 )
%80 = ShiftLeftLogical %uint %71 %8u
%81 = IAdd %uint %80 %58
%82 = UConvert %ulong %81
%83 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %82 %0u %0u %0u
%84 = Load %float %83 Aligned 4
%85 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %82 %0u %0u %1u
%86 = Load %float %85 Aligned 4
%87 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %82 %0u %0u %2u
%88 = Load %float %87 Aligned 4
%89 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %82 %0u %0u %3u
%90 = Load %float %89 Aligned 4
Store %61 %84 Aligned 4
Store %62 %86 Aligned 4
Store %63 %88 Aligned 4
Store %65 %90 Aligned 4
%91 = FunctionCall %void %_Z7barrierj %1u
Branch %92
68:
%130 = FunctionCall %void %_Z7barrierj %1u
%67 = IAdd %uint %69 %256u
%70 = IAdd %uint %71 %1u
%134 = ULessThan %bool %67 %56
BranchConditional %134 %66 %135
92:
%94 = Phi %ulong ( %0ul <- %66, %93 <- %92 )
%95 = Phi %float ( %74 <- %66, %73 <- %92 )
%96 = Phi %float ( %76 <- %66, %75 <- %92 )
%97 = Phi %float ( %78 <- %66, %77 <- %92 )
%98 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %94 %0u %0u %0u
%99 = Load %float %98 Aligned 4
%100 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %94 %0u %0u %1u
%101 = Load %float %100 Aligned 4
%102 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %94 %0u %0u %2u
%103 = Load %float %102 Aligned 4
%104 = FSub %float %99 %42
%105 = FSub %float %101 %45
%106 = FSub %float %103 %48
%107 = FMul %float %104 %104
%108 = FMul %float %105 %105
%109 = FMul %float %106 %106
%111 = FAdd %float %107 %9.99999975e-05f
%112 = FAdd %float %111 %108
%113 = FAdd %float %112 %109
%114 = FunctionCall %float %_Z5rsqrtf %113
%115 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %94 %0u %0u %3u
%116 = Load %float %115 Aligned 4
%117 = FMul %float %114 %114
%118 = FMul %float %117 %114
%119 = FMul %float %118 %116
%120 = FMul %float %119 %104
%121 = FMul %float %119 %105
%122 = FMul %float %119 %106
%77 = FAdd %float %120 %97
%75 = FAdd %float %121 %96
%73 = FAdd %float %122 %95
%93 = IAdd %ulong %94 %1ul
%129 = IEqual %bool %93 %256ul
BranchConditional %129 %68 %92
135:
%136 = FMul %float %77 %time_delta
%137 = FMul %float %75 %time_delta
%138 = FMul %float %73 %time_delta
%139 = FAdd %float %136 %50
%140 = FAdd %float %137 %52
%141 = FAdd %float %138 %54
%143 = FMul %float %139 %0.999000013f
%144 = FMul %float %140 %0.999000013f
%145 = FMul %float %141 %0.999000013f
%146 = FMul %float %143 %time_delta
%147 = FMul %float %144 %time_delta
%148 = FMul %float %145 %time_delta
%149 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %out_positions %39 %0u %0u %0u
%150 = Load %float %149 Aligned 4
%151 = FAdd %float %150 %146
Store %149 %151 Aligned 4
%152 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %out_positions %39 %0u %0u %1u
%153 = Load %float %152 Aligned 4
%154 = FAdd %float %153 %147
Store %152 %154 Aligned 4
%155 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %out_positions %39 %0u %0u %2u
%156 = Load %float %155 Aligned 4
%157 = FAdd %float %156 %148
Store %155 %157 Aligned 4
Store %49 %143 Aligned 4
Store %51 %144 Aligned 4
Store %53 %145 Aligned 4
Return
}
Vulkan / SPIR-V
Note that the compiler would usually directly output a .spvc file (a simple container format for multiple SPIR-V binaries). The output below comes from disassembling it with spirv-dis (provided by the toolchain).
Also note that the output below has been generated with extended readability enabled (--debug-asm).
; SPIR-V
; Version: 1.5
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 169
; Schema: 0
Capability Matrix
Capability Shader
Capability Int64
Capability UniformBufferArrayDynamicIndexing
Capability SampledImageArrayDynamicIndexing
Capability StorageBufferArrayDynamicIndexing
Capability StorageImageArrayDynamicIndexing
Capability MultiView
Capability VariablePointersStorageBuffer
Capability VariablePointers
%1 = ExtInstImport "GLSL.std.450"
MemoryModel Logical GLSL450
EntryPoint GLCompute %simplified_nbody "simplified_nbody" %simplified_nbody.vulkan_uniform. %simplified_nbody.vulkan_uniform..1 %simplified_nbody.vulkan_uniform..2 %simplified_nbody.vulkan_uniform..3 %simplified_nbody.vulkan_builtin_input. %simplified_nbody.vulkan_builtin_input..4 %simplified_nbody.vulkan_builtin_input..5 %simplified_nbody.vulkan_builtin_input..6 %vulkan.immutable_samplers %_ZZ16simplified_nbodyE20local_body_positions
ExecutionMode %simplified_nbody LocalSize 128 1 1
Source GLSL 450
Decorate %52 SpecId 1
Decorate %53 SpecId 2
Decorate %54 SpecId 3
Decorate %enclose.class.vector4 Block
Decorate %enclose.class.vector4_0 Block
Decorate %enclose.class.vector3 Block
Decorate %enclose. Block
Decorate %class.vector3[] ArrayStride 12
Decorate %class.vector4[256l] ArrayStride 16
Decorate %class.vector4[] ArrayStride 16
Decorate %class.vector4[]_0 ArrayStride 16
Decorate %simplified_nbody.vulkan_builtin_input..6 BuiltIn NumWorkgroups
Decorate %simplified_nbody.vulkan_constant.workgroup_size BuiltIn WorkgroupSize
Decorate %simplified_nbody.vulkan_builtin_input..5 BuiltIn WorkgroupId
Decorate %simplified_nbody.vulkan_builtin_input..4 BuiltIn LocalInvocationId
Decorate %simplified_nbody.vulkan_builtin_input. BuiltIn GlobalInvocationId
Decorate %simplified_nbody.vulkan_uniform. NonWritable
Decorate %simplified_nbody.vulkan_uniform..3 NonWritable
Decorate %vulkan.immutable_samplers Binding 0
Decorate %simplified_nbody.vulkan_uniform. Binding 0
Decorate %simplified_nbody.vulkan_uniform..1 Binding 1
Decorate %simplified_nbody.vulkan_uniform..2 Binding 2
Decorate %simplified_nbody.vulkan_uniform..3 Binding 3
Decorate %vulkan.immutable_samplers DescriptorSet 0
Decorate %simplified_nbody.vulkan_uniform. DescriptorSet 1
Decorate %simplified_nbody.vulkan_uniform..1 DescriptorSet 1
Decorate %simplified_nbody.vulkan_uniform..2 DescriptorSet 1
Decorate %simplified_nbody.vulkan_uniform..3 DescriptorSet 1
MemberDecorate %class.vector4 0 Offset 0
MemberDecorate %union.anon 0 Offset 0
MemberDecorate %struct.anon 0 Offset 0
MemberDecorate %enclose.class.vector4 0 Offset 0
MemberDecorate %enclose.class.vector4_0 0 Offset 0
MemberDecorate %enclose.class.vector3 0 Offset 0
MemberDecorate %class.vector3 0 Offset 0
MemberDecorate %union.anon.8 0 Offset 0
MemberDecorate %struct.anon.9 0 Offset 0
MemberDecorate %enclose. 0 Offset 0
MemberDecorate %struct.anon 1 Offset 4
MemberDecorate %struct.anon.9 1 Offset 4
MemberDecorate %struct.anon 2 Offset 8
MemberDecorate %struct.anon.9 2 Offset 8
MemberDecorate %struct.anon 3 Offset 12
%uint = TypeInt 32 0
%ilong = TypeInt 64 1
%iint = TypeInt 32 1
%32u = Constant %uint 32
%256l = Constant %ilong 256
%52 = SpecConstant %uint 128
%53 = SpecConstant %uint 1
%54 = SpecConstant %uint 1
%0i = Constant %iint 0
%1i = Constant %iint 1
%2i = Constant %iint 2
%0l = Constant %ilong 0
%3i = Constant %iint 3
%8i = Constant %iint 8
%256i = Constant %iint 256
%32l = Constant %ilong 32
%1l = Constant %ilong 1
%Sampler = TypeSampler
%Sampler[32u] = TypeArray %Sampler %32u
%(UniformConstant)Sampler[32u]* = TypePointer UniformConstant %Sampler[32u]
%float = TypeFloat 32
%struct.anon = TypeStruct %float %float %float %float
%union.anon = TypeStruct %struct.anon
%class.vector4 = TypeStruct %union.anon
%class.vector4[256l] = TypeArray %class.vector4 %256l
%(Workgroup)class.vector4[256l]* = TypePointer Workgroup %class.vector4[256l]
%void = TypeVoid
%void() = TypeFunction %void
%class.vector4[] = TypeRuntimeArray %class.vector4
%enclose.class.vector4 = TypeStruct %class.vector4[]
%(StorageBuffer)enclose.class.vector4* = TypePointer StorageBuffer %enclose.class.vector4
%class.vector4[]_0 = TypeRuntimeArray %class.vector4
%enclose.class.vector4_0 = TypeStruct %class.vector4[]_0
%(StorageBuffer)enclose.class.vector4_0* = TypePointer StorageBuffer %enclose.class.vector4_0
%struct.anon.9 = TypeStruct %float %float %float
%union.anon.8 = TypeStruct %struct.anon.9
%class.vector3 = TypeStruct %union.anon.8
%class.vector3[] = TypeRuntimeArray %class.vector3
%enclose.class.vector3 = TypeStruct %class.vector3[]
%(StorageBuffer)enclose.class.vector3* = TypePointer StorageBuffer %enclose.class.vector3
%enclose. = TypeStruct %float
%(StorageBuffer)enclose.* = TypePointer StorageBuffer %enclose.
%<3xiint> = TypeVector %iint 3
%(Input)<3xiint>* = TypePointer Input %<3xiint>
%<3xuint> = TypeVector %uint 3
%(StorageBuffer)float* = TypePointer StorageBuffer %float
%(Workgroup)float* = TypePointer Workgroup %float
%bool = TypeBool
%simplified_nbody.vulkan_constant.workgroup_size = SpecConstantComposite %<3xuint> %52 %53 %54
%0.0f = Constant %float 0
%9.99999975e-05f = Constant %float 9.99999975e-05
%0.999000013f = Constant %float 0.999000013
%vulkan.immutable_samplers = Variable %(UniformConstant)Sampler[32u]* UniformConstant
%_ZZ16simplified_nbodyE20local_body_positions = Variable %(Workgroup)class.vector4[256l]* Workgroup
%simplified_nbody.vulkan_uniform. = Variable %(StorageBuffer)enclose.class.vector4* StorageBuffer
%simplified_nbody.vulkan_uniform..1 = Variable %(StorageBuffer)enclose.class.vector4_0* StorageBuffer
%simplified_nbody.vulkan_uniform..2 = Variable %(StorageBuffer)enclose.class.vector3* StorageBuffer
%simplified_nbody.vulkan_uniform..3 = Variable %(StorageBuffer)enclose.* StorageBuffer
%simplified_nbody.vulkan_builtin_input. = Variable %(Input)<3xiint>* Input
%simplified_nbody.vulkan_builtin_input..4 = Variable %(Input)<3xiint>* Input
%simplified_nbody.vulkan_builtin_input..5 = Variable %(Input)<3xiint>* Input
%simplified_nbody.vulkan_builtin_input..6 = Variable %(Input)<3xiint>* Input
function void simplified_nbody ( %void() ) {
45:
%56 = Bitcast %<3xiint> %simplified_nbody.vulkan_constant.workgroup_size
%57 = Load %<3xiint> %simplified_nbody.vulkan_builtin_input. Aligned 16
%58 = CompositeExtract %iint %57 0
%59 = SConvert %ilong %58
%62 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %59 %0i %0i %0i
%63 = Load %float %62 Aligned 4
%65 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %59 %0i %0i %1i
%66 = Load %float %65 Aligned 4
%68 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %59 %0i %0i %2i
%69 = Load %float %68 Aligned 4
%70 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..2 %0i %0i %59 %0i %0i %0i
%71 = Load %float %70 Aligned 4
%72 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..2 %0i %0i %59 %0i %0i %1i
%73 = Load %float %72 Aligned 4
%74 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..2 %0i %0i %59 %0i %0i %2i
%75 = Load %float %74 Aligned 4
%76 = CompositeExtract %iint %56 0
%77 = Load %<3xiint> %simplified_nbody.vulkan_builtin_input..6 Aligned 16
%78 = CompositeExtract %iint %77 0
%79 = IMul %iint %76 %78
%80 = Load %<3xiint> %simplified_nbody.vulkan_builtin_input..4 Aligned 16
%81 = CompositeExtract %iint %80 0
%82 = SConvert %ilong %81
%85 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %82 %0i %0i %0i
%86 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %82 %0i %0i %1i
%87 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %82 %0i %0i %2i
%89 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %82 %0i %0i %3i
Branch %46
46:
%91 = Phi %iint ( %0i <- %45, %90 <- %49 )
%93 = Phi %iint ( %0i <- %45, %92 <- %49 )
%96 = Phi %float ( %0.0f <- %45, %95 <- %49 )
%98 = Phi %float ( %0.0f <- %45, %97 <- %49 )
%100 = Phi %float ( %0.0f <- %45, %99 <- %49 )
%102 = ShiftLeftLogical %iint %93 %8i
%103 = IAdd %iint %81 %102
%104 = SConvert %ilong %103
%105 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %104 %0i %0i %0i
%106 = Load %float %105 Aligned 4
%107 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %104 %0i %0i %1i
%108 = Load %float %107 Aligned 4
%109 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %104 %0i %0i %2i
%110 = Load %float %109 Aligned 4
%111 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %104 %0i %0i %3i
%112 = Load %float %111 Aligned 4
Store %85 %106 Aligned 4
Store %86 %108 Aligned 4
Store %87 %110 Aligned 4
Store %89 %112 Aligned 4
ControlBarrier %2i %2i %256i
LoopMerge %50 %49 None
Branch %47
47:
%115 = Phi %ilong ( %0l <- %46, %114 <- %48 )
%116 = Phi %float ( %96 <- %46, %95 <- %48 )
%117 = Phi %float ( %98 <- %46, %97 <- %48 )
%118 = Phi %float ( %100 <- %46, %99 <- %48 )
LoopMerge %49 %48 None
Branch %48
48:
%120 = ShiftLeftLogical %ilong %115 %32l
%121 = ShiftRightArithmetic %ilong %120 %32l
%122 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %121 %0i %0i %0i
%123 = Load %float %122 Aligned 4
%124 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %121 %0i %0i %1i
%125 = Load %float %124 Aligned 4
%126 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %121 %0i %0i %2i
%127 = Load %float %126 Aligned 4
%128 = FSub %float %123 %63
%129 = FSub %float %125 %66
%130 = FSub %float %127 %69
%132 = ExtInst %float %1 Fma %128 %128 %9.99999975e-05f
%133 = ExtInst %float %1 Fma %129 %129 %132
%134 = ExtInst %float %1 Fma %130 %130 %133
%135 = ExtInst %float %1 InverseSqrt %134
%136 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %121 %0i %0i %3i
%137 = Load %float %136 Aligned 4
%138 = FMul %float %135 %135
%139 = FMul %float %138 %135
%140 = FMul %float %139 %137
%99 = ExtInst %float %1 Fma %140 %128 %118
%97 = ExtInst %float %1 Fma %140 %129 %117
%95 = ExtInst %float %1 Fma %140 %130 %116
%114 = IAdd %ilong %115 %1l
%147 = IEqual %bool %114 %256l
BranchConditional %147 %49 %47
49:
ControlBarrier %2i %2i %256i
%90 = IAdd %iint %91 %256i
%92 = IAdd %iint %93 %1i
%150 = ULessThan %bool %90 %79
BranchConditional %150 %46 %50
50:
%151 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..3 %0l %0i
%152 = Load %float %151 Aligned 4
%153 = ExtInst %float %1 Fma %152 %99 %71
%154 = ExtInst %float %1 Fma %152 %97 %73
%155 = ExtInst %float %1 Fma %152 %95 %75
%157 = FMul %float %153 %0.999000013f
%158 = FMul %float %154 %0.999000013f
%159 = FMul %float %155 %0.999000013f
%160 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..1 %0i %0i %59 %0i %0i %0i
%161 = Load %float %160 Aligned 4
%162 = ExtInst %float %1 Fma %157 %152 %161
Store %160 %162 Aligned 4
%163 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..1 %0i %0i %59 %0i %0i %1i
%164 = Load %float %163 Aligned 4
%165 = ExtInst %float %1 Fma %158 %152 %164
Store %163 %165 Aligned 4
%166 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..1 %0i %0i %59 %0i %0i %2i
%167 = Load %float %166 Aligned 4
%168 = ExtInst %float %1 Fma %159 %152 %167
Store %166 %168 Aligned 4
Store %70 %157 Aligned 4
Store %72 %158 Aligned 4
Store %74 %159 Aligned 4
Return
}
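For reference, the WorkgroupSize constant above is assembled from the specialization constants decorated with SpecId 1, 2 and 3 (defaulting to 128, 1, 1), so a Vulkan host application can choose the actual work-group size at pipeline-creation time. The following is a minimal, generic Vulkan sketch of how such an override could look; it is purely illustrative and not part of libfloor's host API (the function name and the pre-created device/module/layout handles are assumptions):

// Generic Vulkan host-side sketch (illustrative only, not libfloor's host API):
// override the WorkgroupSize specialization constants (SpecId 1/2/3 above)
// when creating the compute pipeline for the "simplified_nbody" entry point.
#include <vulkan/vulkan.h>
#include <cstdint>

VkPipeline create_nbody_pipeline(VkDevice device, VkShaderModule module, VkPipelineLayout layout) {
	// desired work-group size, e.g. 256 x 1 x 1 (matching the 256-entry local-memory tile above)
	const uint32_t local_size[3] { 256u, 1u, 1u };
	const VkSpecializationMapEntry entries[3] {
		{ 1u /* SpecId 1 -> x */, 0u * sizeof(uint32_t), sizeof(uint32_t) },
		{ 2u /* SpecId 2 -> y */, 1u * sizeof(uint32_t), sizeof(uint32_t) },
		{ 3u /* SpecId 3 -> z */, 2u * sizeof(uint32_t), sizeof(uint32_t) },
	};
	const VkSpecializationInfo spec_info { 3u, entries, sizeof(local_size), local_size };
	const VkComputePipelineCreateInfo pipeline_info {
		.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
		.stage = {
			.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
			.stage = VK_SHADER_STAGE_COMPUTE_BIT,
			.module = module,
			.pName = "simplified_nbody",
			.pSpecializationInfo = &spec_info,
		},
		.layout = layout,
	};
	VkPipeline pipeline = VK_NULL_HANDLE;
	vkCreateComputePipelines(device, VK_NULL_HANDLE, 1u, &pipeline_info, nullptr, &pipeline);
	return pipeline;
}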
Requirements:
- OS:
  - only AMD64/Intel64/ARM64 are supported
  - Windows: NT 6.1+
  - macOS: 10.13+
  - iOS: 12.0+
  - Linux: any current x64 distribution
  - other Unix: if the other requirements are met
- compiler/toolchain:
  - macOS/iOS: Xcode 12.5+
  - Windows (VS): VS2019 with the provided clang/LLVM, or Clang/LLVM for Windows
  - Windows (MinGW): MSYS2 with Clang/LLVM/libc++ 10.0+
- libraries and optional requirements:
  - SDL2 2.0.4+
  - (opt) OpenCL: requires an OpenCL 1.2+ SDK and CPU/GPU drivers (Intel, AMD)
  - (opt) CUDA: requires an sm_30+/Kepler+ GPU and CUDA 9.0+ drivers (the CUDA SDK is not required)
  - (opt) Metal: requires iOS 12.0+ and an A7+ CPU/GPU, or macOS 10.13+ and an appropriate GPU
  - (opt) Host-Compute: requires only the compiler/toolchain stated above
  - (opt) Vulkan: requires a 1.2.142+ ICD loader / headers / SDK
  - (opt) audio/OpenAL: requires OpenAL Soft
  - (opt) VR: requires OpenVR
Building with build.sh:
- run ./build.sh (use ./build.sh help to get a list of all options)
- configuration of optional parts:
  - to disable OpenCL: define FLOOR_NO_OPENCL or use ./build.sh no-opencl (see the sketch after this list)
  - to disable CUDA: define FLOOR_NO_CUDA or use ./build.sh no-cuda
  - to disable Metal (only affects macOS/iOS builds): define FLOOR_NO_METAL or use ./build.sh no-metal
  - to disable Host Compute: define FLOOR_NO_HOST_COMPUTE or use ./build.sh no-host-compute
  - to disable Vulkan: define FLOOR_NO_VULKAN or use ./build.sh no-vulkan
  - to disable network support (ssl/crypto/asio): define FLOOR_NO_NET or use ./build.sh no-net
  - to disable OpenAL: define FLOOR_NO_OPENAL or use ./build.sh no-openal
  - to disable VR: define FLOOR_NO_VR or use ./build.sh no-vr
  - to build with libstdc++ (GCC 10.0+) instead of libc++: use ./build.sh libstdc++
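As a rough illustration of the define-based configuration, the following is a minimal, hypothetical sketch (not code from libfloor) of how a code path can be selected at compile time depending on whether FLOOR_NO_OPENCL was set for the build:

// hypothetical sketch: reacting to the FLOOR_NO_OPENCL configuration define
#include <cstdio>

int main() {
#if !defined(FLOOR_NO_OPENCL)
	// OpenCL support was not disabled at build time
	std::printf("OpenCL backend compiled in\n");
#else
	// OpenCL support was compiled out (FLOOR_NO_OPENCL or ./build.sh no-opencl)
	std::printf("OpenCL backend disabled\n");
#endif
	return 0;
}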
Building with Xcode (macOS / iOS):
- open floor.xcodeproj and build
- some notes:
  - almost all optional parts of floor are enabled here and you’ll have to install all dependencies or disable them manually
  - Homebrew is the recommended way to install additional dependencies:
    /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
  - (opt) download OpenVR and manually install it:
    - mkdir -p {/usr/local/include/openvr,/usr/local/lib}
    - cp openvr/headers/* /usr/local/include/openvr/
    - cp openvr/bin/osx32/libopenvr_api.dylib /usr/local/lib/
  - command line tools might be necessary, install them with: xcode-select --install
  - on iOS, either copy dependencies into your iPhoneOS and iPhoneSimulator SDK, or into floor/ios/deps/{include,lib}
  - iOS linker flags for a depending project: -lSDL2 -lfloor -lcrypto -lssl
Building with Visual Studio (Windows):
- install Visual Studio 2019 16.9+
- install Clang / LLVM for Windows or select clang in the VS installer
- install the Vulkan SDK
- install vcpkg (somewhere, not within libfloor):
  - git clone https://github.com/Microsoft/vcpkg.git
  - cd vcpkg
  - .\bootstrap-vcpkg.bat -disableMetrics
  - .\vcpkg integrate install
- install the vcpkg packages:
  - vcpkg --triplet x64-windows install sdl2 opengl opengl-registry OpenCL vulkan openssl-windows asio openal-soft openvr
- in Visual Studio: open the folder floor (wait a little until build files are generated)
- select the Debug or Release configuration and build
  - NOTE: all dependencies (optional parts) are enabled here
Installation (Unix):
- sudo mkdir -p /opt/floor/include
- sudo ln -sf /path/to/floor /opt/floor/include/floor
- sudo ln -sf /path/to/floor/bin /opt/floor/lib
- alternatively: copy these files/folders there
Installation (Windows):
- create a %%ProgramFiles%%/floor folder (C:/Program Files/floor)
- inside this folder:
  - create a lib folder:
    - VS2019: copy everything from bin/ in there (dlls/lib/exp)
    - MinGW/MSYS2: copy libfloor_static.a/libfloord_static.a there
  - create an include folder and copy the original floor folder in there (containing all floor source code)
Toolchain:
- automated builds for Linux, macOS and Windows can be found at: https://libfloor.org/builds/toolchain
- NOTE: building the toolchain requires a Unix environment with all LLVM build dependencies installed - use MSYS2 on Windows
- NOTE: the absolute build path must not contain spaces
- compile the toolchain:
  - cd floor/etc/llvm140/ && ./build.sh
  - if successful, package it with ./pkg.sh (in addition to a .zip file, this also creates a folder with all necessary binaries and include files)
- install the toolchain:
  - Unix:
    - automatic:
      - development: run ./deploy_dev.sh from the floor/etc/llvm140/ folder (this will create symlinks to everything in floor and floor/etc/llvm140)
      - release: run ./deploy_pkg.sh from inside the toolchain package folder (floor/etc/llvm140/toolchain_140000_*; this will copy everything)
    - manual:
      - copy the toolchain folder as toolchain to /opt/floor/ (should then be /opt/floor/toolchain/{bin,clang,libcxx})
      - inside /opt/floor/toolchain, add a symlink to the floor include folder: sudo ln -sf ../include floor
  - Windows:
    - copy the toolchain folder as toolchain to %%ProgramFiles%%/floor (should then be %%ProgramFiles%%/floor/toolchain/{bin,clang,libcxx})
    - inside %%ProgramFiles%%/floor/toolchain, copy the floor folder from the include folder above it into this folder
- NOTE: this is the expected default setup - paths can be changed inside config.json (toolchain.generic.paths)
Additional notes:
- when using X11 forwarding, set these environment variables:
  - export LIBGL_ALWAYS_INDIRECT=yes
  - export SDL_VIDEO_X11_NODIRECTCOLOR=yes
- depending on how your Linux distribution handles the OpenCL headers and library, you might need to manually install OpenCL 1.2+ compatible ones
Related projects:
- floor_examples (dnn, nbody, warp, hlbvh, path tracer, other)
- libwarp (image-space warping library)
- oclraster (Flexible Rasterizer in OpenCL)
- a2elight (Albion 2 Engine)
- unibot (IRC bot)