Flo’s Open libRary

Table of Contents

What is it?
- Example
Requirements
Build Instructions
Installation
- Installation (Unix / macOS)
- Installation (Windows)
Compute/Graphics Toolchain
Misc Hints
Projects and Examples using libfloor

What is it?

This project provides a unified compute & graphics host API, as well as a unified compute & graphics C++ device language and library to enable same-source CUDA/Host/Metal/OpenCL/Vulkan programming and execution.

The unified host API is implemented at compute and graphics. All backends (CUDA/Host/Metal/OpenCL/Vulkan) currently provide compute support, while graphics support is limited to Metal and Vulkan.

To provide a unified device language, a clang/LLVM/libc++ 14.0 toolchain has been modified.

Certain parts of libfloor are used by both host and device code (math and constexpr). Additional device library code is located at device.

Advanced examples can be found in the floor_examples repository.

Example

Let’s take this fairly simple C++ kernel below that computes the body/body-interactions in a N-body simulation and compile it for each backend. Note that loop unrolling is omitted for conciseness.

// define global constants
static constexpr constant const uint32_t NBODY_TILE_SIZE { 256u };
static constexpr constant const float NBODY_DAMPING { 0.999f };
static constexpr constant const float NBODY_SOFTENING { 0.01f };
// define a kernel with a required local size of (NBODY_TILE_SIZE = 256, 1, 1)
kernel kernel_local_size(NBODY_TILE_SIZE, 1, 1)
void simplified_nbody(buffer<const float4> in_positions, // read-only global memory buffer
                      buffer<float4> out_positions, // read-write global memory buffer
                      buffer<float3> inout_velocities, // read-write global memory buffer
                      param<float> time_delta) { // read-only parameter
  // each work-item represents/computes one body
  const auto position = in_positions[global_id.x];
  auto velocity = inout_velocities[global_id.x];
  float3 acceleration; // vectors are automatically zero-initialized
  local_buffer<float4, NBODY_TILE_SIZE> local_body_positions; // local memory array allocation
  // loop over all bodies
  for (uint32_t i = 0, tile = 0, count = global_size.x; i < count; i += NBODY_TILE_SIZE, ++tile) {
    // move resp. body position/mass from global to local memory
    local_body_positions[local_id.x] = in_positions[tile * NBODY_TILE_SIZE + local_id.x];
    local_barrier(); // barrier across all work-items in this work-group
    // loop over bodies in this work-group
    for (uint32_t j = 0; j < NBODY_TILE_SIZE; ++j) {
      const auto r = local_body_positions[j].xyz - position.xyz;
      const auto dist_sq = r.dot(r) + (NBODY_SOFTENING * NBODY_SOFTENING);
      const auto inv_dist = rsqrt(dist_sq);
      const auto s = local_body_positions[j].w * (inv_dist * inv_dist * inv_dist); // .w is mass
      acceleration += r * s;
    }
    local_barrier();
  }
  velocity = (velocity + acceleration * time_delta) * NBODY_DAMPING;
  out_positions[global_id.x].xyz += velocity * time_delta; // update XYZ position
  inout_velocities[global_id.x] = velocity; // update velocity
}

click to unfold the output for each backend

CUDA / PTX

You can download the PTX file here and the CUBIN file here (note that building CUBINs is optional and requires ptxas).

//
// Generated by LLVM NVPTX Back-End
//

.version 7.2
.target sm_86
.address_size 64

// _ZZ16simplified_nbodyE20local_body_positions has been demoted
.const .align 4 .f32 _ZL13NBODY_DAMPING = 0f3F7FBE77;
                                        // -- Begin function simplified_nbody
                                        // @simplified_nbody
.visible .entry simplified_nbody(
	.param .u64 simplified_nbody_param_0,
	.param .u64 simplified_nbody_param_1,
	.param .u64 simplified_nbody_param_2,
	.param .f32 simplified_nbody_param_3
)
.reqntid 256, 1, 1
{
	.reg .pred 	%p<3>;
	.reg .f32 	%f<72>;
	.reg .b32 	%r<25>;
	.reg .b64 	%rd<18>;
	// demoted variable
	.shared .align 4 .b8 _ZZ16simplified_nbodyE20local_body_positions[4096];
// %bb.0:
	mov.u32 	%r1, %tid.x;
	mov.u32 	%r11, %ctaid.x;
	mov.u32 	%r12, %ntid.x;
	mad.lo.s32 	%r13, %r12, %r11, %r1;
	cvt.u64.u32 	%rd3, %r13;
	mul.wide.u32 	%rd7, %r13, 12;
	ld.param.u64 	%rd8, [simplified_nbody_param_2];
	cvta.to.global.u64 	%rd9, %rd8;
	add.s64 	%rd4, %rd9, %rd7;
	ld.global.f32 	%f6, [%rd4+8];
	add.s64 	%rd6, %rd4, 8;
	ld.global.f32 	%f5, [%rd4+4];
	add.s64 	%rd5, %rd4, 4;
	ld.global.f32 	%f4, [%rd4];
	mul.wide.u32 	%rd10, %r13, 16;
	ld.param.u64 	%rd11, [simplified_nbody_param_0];
	cvta.to.global.u64 	%rd2, %rd11;
	add.s64 	%rd12, %rd2, %rd10;
	ld.global.nc.f32 	%f3, [%rd12+8];
	ld.global.nc.f32 	%f2, [%rd12+4];
	ld.global.nc.f32 	%f1, [%rd12];
	mov.u32 	%r14, %nctaid.x;
	mul.lo.s32 	%r2, %r14, %r12;
	shl.b32 	%r15, %r1, 4;
	mov.u32 	%r16, _ZZ16simplified_nbodyE20local_body_positions;
	add.s32 	%r3, %r16, %r15;
	ld.param.u64 	%rd13, [simplified_nbody_param_1];
	cvta.to.global.u64 	%rd1, %rd13;
	mov.f32 	%f69, 0f00000000;
	mov.u32 	%r10, 0;
	ld.param.f32 	%f16, [simplified_nbody_param_3];
	mov.u32 	%r22, %r10;
	mov.u32 	%r23, %r10;
	mov.f32 	%f70, %f69;
	mov.f32 	%f71, %f69;
LBB0_1:                                 // =>This Loop Header: Depth=1
                                        //     Child Loop BB0_2 Depth 2
	shl.b32 	%r18, %r23, 8;
	add.s32 	%r19, %r18, %r1;
	mul.wide.u32 	%rd14, %r19, 16;
	add.s64 	%rd15, %rd2, %rd14;
	ld.global.nc.f32 	%f18, [%rd15];
	st.shared.f32 	[%r3], %f18;
	ld.global.nc.f32 	%f19, [%rd15+4];
	st.shared.f32 	[%r3+4], %f19;
	ld.global.nc.f32 	%f20, [%rd15+8];
	st.shared.f32 	[%r3+8], %f20;
	ld.global.nc.f32 	%f21, [%rd15+12];
	st.shared.f32 	[%r3+12], %f21;
	bar.sync 	0;
	mov.u32 	%r24, %r10;
LBB0_2:                                 //   Parent Loop BB0_1 Depth=1
                                        // =>  This Inner Loop Header: Depth=2
	add.s32 	%r21, %r16, %r24;
	ld.shared.f32 	%f22, [%r21+4];
	sub.f32 	%f23, %f22, %f2;
	ld.shared.f32 	%f24, [%r21];
	sub.f32 	%f25, %f24, %f1;
	fma.rn.f32 	%f26, %f25, %f25, 0f38D1B717;
	fma.rn.f32 	%f27, %f23, %f23, %f26;
	ld.shared.f32 	%f28, [%r21+8];
	sub.f32 	%f29, %f28, %f3;
	fma.rn.f32 	%f30, %f29, %f29, %f27;
	rsqrt.approx.ftz.f32 	%f31, %f30;
	mul.f32 	%f32, %f31, %f31;
	mul.f32 	%f33, %f32, %f31;
	ld.shared.f32 	%f34, [%r21+12];
	mul.f32 	%f35, %f33, %f34;
	fma.rn.f32 	%f36, %f35, %f29, %f69;
	ld.shared.f32 	%f37, [%r21+20];
	sub.f32 	%f38, %f37, %f2;
	ld.shared.f32 	%f39, [%r21+16];
	sub.f32 	%f40, %f39, %f1;
	fma.rn.f32 	%f41, %f40, %f40, 0f38D1B717;
	fma.rn.f32 	%f42, %f38, %f38, %f41;
	ld.shared.f32 	%f43, [%r21+24];
	sub.f32 	%f44, %f43, %f3;
	fma.rn.f32 	%f45, %f44, %f44, %f42;
	rsqrt.approx.ftz.f32 	%f46, %f45;
	mul.f32 	%f47, %f46, %f46;
	mul.f32 	%f48, %f47, %f46;
	ld.shared.f32 	%f49, [%r21+28];
	mul.f32 	%f50, %f48, %f49;
	fma.rn.f32 	%f69, %f50, %f44, %f36;
	fma.rn.f32 	%f51, %f35, %f23, %f70;
	fma.rn.f32 	%f70, %f50, %f38, %f51;
	fma.rn.f32 	%f52, %f35, %f25, %f71;
	fma.rn.f32 	%f71, %f50, %f40, %f52;
	add.s32 	%r24, %r24, 32;
	setp.eq.s32 	%p1, %r24, 4096;
	@%p1 bra 	LBB0_3;
	bra.uni 	LBB0_2;
LBB0_3:                                 //   in Loop: Header=BB0_1 Depth=1
	add.s32 	%r22, %r22, 256;
	setp.lt.u32 	%p2, %r22, %r2;
	bar.sync 	0;
	add.s32 	%r23, %r23, 1;
	@%p2 bra 	LBB0_1;
// %bb.4:
	fma.rn.f32 	%f53, %f71, %f16, %f4;
	ld.const.f32 	%f54, [_ZL13NBODY_DAMPING];
	mul.f32 	%f55, %f54, %f53;
	shl.b64 	%rd16, %rd3, 4;
	add.s64 	%rd17, %rd1, %rd16;
	ld.global.f32 	%f56, [%rd17];
	fma.rn.f32 	%f57, %f55, %f16, %f56;
	st.global.f32 	[%rd17], %f57;
	fma.rn.f32 	%f58, %f70, %f16, %f5;
	mul.f32 	%f59, %f54, %f58;
	ld.global.f32 	%f60, [%rd17+4];
	fma.rn.f32 	%f61, %f59, %f16, %f60;
	st.global.f32 	[%rd17+4], %f61;
	fma.rn.f32 	%f62, %f69, %f16, %f6;
	mul.f32 	%f63, %f54, %f62;
	ld.global.f32 	%f64, [%rd17+8];
	fma.rn.f32 	%f65, %f63, %f16, %f64;
	st.global.f32 	[%rd17+8], %f65;
	st.global.f32 	[%rd4], %f55;
	st.global.f32 	[%rd5], %f59;
	st.global.f32 	[%rd6], %f63;
	ret;
                                        // -- End function
}

Host-Compute (x86 CPU)

Note that the compiler would usually directly output a .bin file (ELF format). The output below comes from disassembling it with objdump -d. Also note that this has been compiled for the x86-4 target (AVX-512).

nbody.bin:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <simplified_nbody>:
   0:	55                   	push   %rbp
   1:	48 89 e5             	mov    %rsp,%rbp
   4:	41 57                	push   %r15
   6:	41 56                	push   %r14
   8:	41 55                	push   %r13
   a:	41 54                	push   %r12
   c:	53                   	push   %rbx
   d:	48 83 e4 c0          	and    $0xffffffffffffffc0,%rsp
  11:	48 81 ec 40 04 00 00 	sub    $0x440,%rsp
  18:	48 89 4c 24 50       	mov    %rcx,0x50(%rsp)
  1d:	48 89 74 24 70       	mov    %rsi,0x70(%rsp)
  22:	48 89 fb             	mov    %rdi,%rbx
  25:	48 8d 05 f9 ff ff ff 	lea    -0x7(%rip),%rax        # 25 <simplified_nbody+0x25>
  2c:	48 bf 00 00 00 00 00 	movabs $0x0,%rdi
  33:	00 00 00
  36:	48 01 c7             	add    %rax,%rdi
  39:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
  40:	00 00 00
  43:	48 8b 04 07          	mov    (%rdi,%rax,1),%rax
  47:	8b 00                	mov    (%rax),%eax
  49:	48 8d 0c 40          	lea    (%rax,%rax,2),%rcx
  4d:	48 89 c6             	mov    %rax,%rsi
  50:	48 c1 e6 04          	shl    $0x4,%rsi
  54:	48 8d 04 8a          	lea    (%rdx,%rcx,4),%rax
  58:	48 89 44 24 68       	mov    %rax,0x68(%rsp)
  5d:	c5 fa 10 04 8a       	vmovss (%rdx,%rcx,4),%xmm0
  62:	c5 fa 11 44 24 10    	vmovss %xmm0,0x10(%rsp)
  68:	c5 fa 10 44 8a 04    	vmovss 0x4(%rdx,%rcx,4),%xmm0
  6e:	c5 fa 11 44 24 14    	vmovss %xmm0,0x14(%rsp)
  74:	c5 fa 10 44 8a 08    	vmovss 0x8(%rdx,%rcx,4),%xmm0
  7a:	c5 fa 11 44 24 18    	vmovss %xmm0,0x18(%rsp)
  80:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
  87:	00 00 00
  8a:	48 8b 04 07          	mov    (%rdi,%rax,1),%rax
  8e:	8b 00                	mov    (%rax),%eax
  90:	89 44 24 1c          	mov    %eax,0x1c(%rsp)
  94:	85 c0                	test   %eax,%eax
  96:	48 89 74 24 60       	mov    %rsi,0x60(%rsp)
  9b:	48 89 7c 24 58       	mov    %rdi,0x58(%rsp)
  a0:	0f 84 44 05 00 00    	je     5ea <simplified_nbody+0x5ea>
  a6:	c5 fa 10 04 33       	vmovss (%rbx,%rsi,1),%xmm0
  ab:	c5 fa 10 4c 33 04    	vmovss 0x4(%rbx,%rsi,1),%xmm1
  b1:	c5 fa 10 54 33 08    	vmovss 0x8(%rbx,%rsi,1),%xmm2
  b7:	62 f2 7d 48 18 c0    	vbroadcastss %xmm0,%zmm0
  bd:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x3c0(%rsp)
  c4:	0f
  c5:	62 f2 7d 48 18 c1    	vbroadcastss %xmm1,%zmm0
  cb:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x380(%rsp)
  d2:	0e
  d3:	62 f2 7d 48 18 c2    	vbroadcastss %xmm2,%zmm0
  d9:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x340(%rsp)
  e0:	0d
  e1:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
  e5:	c5 f8 29 44 24 30    	vmovaps %xmm0,0x30(%rsp)
  eb:	45 31 ff             	xor    %r15d,%r15d
  ee:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
  f5:	00 00 00
  f8:	48 8b 04 07          	mov    (%rdi,%rax,1),%rax
  fc:	48 89 44 24 78       	mov    %rax,0x78(%rsp)
 101:	49 be 00 00 00 00 00 	movabs $0x0,%r14
 108:	00 00 00
 10b:	49 01 fe             	add    %rdi,%r14
 10e:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 115:	00 00 00
 118:	4c 8b 2c 07          	mov    (%rdi,%rax,1),%r13
 11c:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 123:	00 00 00
 126:	62 f1 7c 48 28 04 07 	vmovaps (%rdi,%rax,1),%zmm0
 12d:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x300(%rsp)
 134:	0c
 135:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 13c:	00 00 00
 13f:	62 f1 7c 48 28 04 07 	vmovaps (%rdi,%rax,1),%zmm0
 146:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x2c0(%rsp)
 14d:	0b
 14e:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 155:	00 00 00
 158:	62 f1 7c 48 28 04 07 	vmovaps (%rdi,%rax,1),%zmm0
 15f:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x280(%rsp)
 166:	0a
 167:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 16e:	00 00 00
 171:	62 f1 7c 48 28 04 07 	vmovaps (%rdi,%rax,1),%zmm0
 178:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x240(%rsp)
 17f:	09
 180:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 187:	00 00 00
 18a:	62 f1 7c 48 28 04 07 	vmovaps (%rdi,%rax,1),%zmm0
 191:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x200(%rsp)
 198:	08
 199:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 1a0:	00 00 00
 1a3:	62 f1 7c 48 28 04 07 	vmovaps (%rdi,%rax,1),%zmm0
 1aa:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x1c0(%rsp)
 1b1:	07
 1b2:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 1b9:	00 00 00
 1bc:	62 f1 7c 48 28 04 07 	vmovaps (%rdi,%rax,1),%zmm0
 1c3:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x180(%rsp)
 1ca:	06
 1cb:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 1d2:	00 00 00
 1d5:	62 f1 7c 48 28 04 07 	vmovaps (%rdi,%rax,1),%zmm0
 1dc:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x140(%rsp)
 1e3:	05
 1e4:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 1eb:	00 00 00
 1ee:	62 f2 7d 48 18 04 07 	vbroadcastss (%rdi,%rax,1),%zmm0
 1f5:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x100(%rsp)
 1fc:	04
 1fd:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 204:	00 00 00
 207:	62 f2 7d 48 18 04 07 	vbroadcastss (%rdi,%rax,1),%zmm0
 20e:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0xc0(%rsp)
 215:	03
 216:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 21d:	00 00 00
 220:	62 f2 7d 48 18 04 07 	vbroadcastss (%rdi,%rax,1),%zmm0
 227:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x80(%rsp)
 22e:	02
 22f:	45 31 e4             	xor    %r12d,%r12d
 232:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
 236:	c5 f8 29 44 24 20    	vmovaps %xmm0,0x20(%rsp)
 23c:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
 240:	c5 f8 29 44 24 40    	vmovaps %xmm0,0x40(%rsp)
 246:	66 2e 0f 1f 84 00 00 	cs nopw 0x0(%rax,%rax,1)
 24d:	00 00 00
 250:	44 89 e0             	mov    %r12d,%eax
 253:	c1 e0 08             	shl    $0x8,%eax
 256:	48 8b 4c 24 78       	mov    0x78(%rsp),%rcx
 25b:	8b 09                	mov    (%rcx),%ecx
 25d:	01 c8                	add    %ecx,%eax
 25f:	48 c1 e0 04          	shl    $0x4,%rax
 263:	48 c1 e1 04          	shl    $0x4,%rcx
 267:	c5 f8 10 04 03       	vmovups (%rbx,%rax,1),%xmm0
 26c:	c4 a1 78 29 04 31    	vmovaps %xmm0,(%rcx,%r14,1)
 272:	c5 f8 77             	vzeroupper
 275:	41 ff d5             	call   *%r13
 278:	c5 e0 57 db          	vxorps %xmm3,%xmm3,%xmm3
 27c:	c4 e3 61 0c 44 24 30 	vblendps $0x1,0x30(%rsp),%xmm3,%xmm0
 283:	01
 284:	c5 f0 57 c9          	vxorps %xmm1,%xmm1,%xmm1
 288:	c4 e3 61 0c 54 24 20 	vblendps $0x1,0x20(%rsp),%xmm3,%xmm2
 28f:	01
 290:	c4 e3 61 0c 64 24 40 	vblendps $0x1,0x40(%rsp),%xmm3,%xmm4
 297:	01
 298:	b8 00 00 00 00       	mov    $0x0,%eax
 29d:	c5 e0 57 db          	vxorps %xmm3,%xmm3,%xmm3
 2a1:	c5 d0 57 ed          	vxorps %xmm5,%xmm5,%xmm5
 2a5:	62 61 7c 48 28 44 24 	vmovaps 0x300(%rsp),%zmm24
 2ac:	0c
 2ad:	62 61 7c 48 28 4c 24 	vmovaps 0x2c0(%rsp),%zmm25
 2b4:	0b
 2b5:	62 61 7c 48 28 54 24 	vmovaps 0x280(%rsp),%zmm26
 2bc:	0a
 2bd:	62 61 7c 48 28 5c 24 	vmovaps 0x240(%rsp),%zmm27
 2c4:	09
 2c5:	62 61 7c 48 28 64 24 	vmovaps 0x200(%rsp),%zmm28
 2cc:	08
 2cd:	62 61 7c 48 28 6c 24 	vmovaps 0x1c0(%rsp),%zmm29
 2d4:	07
 2d5:	62 61 7c 48 28 74 24 	vmovaps 0x180(%rsp),%zmm30
 2dc:	06
 2dd:	62 61 7c 48 28 7c 24 	vmovaps 0x140(%rsp),%zmm31
 2e4:	05
 2e5:	62 e1 7c 48 28 6c 24 	vmovaps 0x100(%rsp),%zmm21
 2ec:	04
 2ed:	62 e1 7c 48 28 74 24 	vmovaps 0xc0(%rsp),%zmm22
 2f4:	03
 2f5:	62 e1 7c 48 28 7c 24 	vmovaps 0x80(%rsp),%zmm23
 2fc:	02
 2fd:	0f 1f 00             	nopl   (%rax)
 300:	62 b1 7c 48 28 3c 30 	vmovaps (%rax,%r14,1),%zmm7
 307:	62 31 7c 48 28 44 30 	vmovaps 0x40(%rax,%r14,1),%zmm8
 30e:	01
 30f:	62 31 7c 48 28 4c 30 	vmovaps 0x80(%rax,%r14,1),%zmm9
 316:	02
 317:	62 31 7c 48 28 54 30 	vmovaps 0xc0(%rax,%r14,1),%zmm10
 31e:	03
 31f:	62 31 7c 48 28 5c 30 	vmovaps 0x140(%rax,%r14,1),%zmm11
 326:	05
 327:	62 31 7c 48 28 64 30 	vmovaps 0x100(%rax,%r14,1),%zmm12
 32e:	04
 32f:	62 31 7c 48 28 6c 30 	vmovaps 0x1c0(%rax,%r14,1),%zmm13
 336:	07
 337:	62 31 7c 48 28 74 30 	vmovaps 0x180(%rax,%r14,1),%zmm14
 33e:	06
 33f:	62 d1 7c 48 28 f1    	vmovaps %zmm9,%zmm6
 345:	62 d2 3d 40 7f f2    	vpermt2ps %zmm10,%zmm24,%zmm6
 34b:	62 71 7c 48 28 ff    	vmovaps %zmm7,%zmm15
 351:	62 52 35 40 7f f8    	vpermt2ps %zmm8,%zmm25,%zmm15
 357:	62 c1 7c 48 28 c6    	vmovaps %zmm14,%zmm16
 35d:	62 c2 3d 40 7f c5    	vpermt2ps %zmm13,%zmm24,%zmm16
 363:	62 c1 7c 48 28 cc    	vmovaps %zmm12,%zmm17
 369:	62 73 85 48 23 fe e4 	vshuff64x2 $0xe4,%zmm6,%zmm15,%zmm15
 370:	62 c2 35 40 7f cb    	vpermt2ps %zmm11,%zmm25,%zmm17
 376:	62 b3 f5 40 23 f0 e4 	vshuff64x2 $0xe4,%zmm16,%zmm17,%zmm6
 37d:	62 c1 7c 48 28 c1    	vmovaps %zmm9,%zmm16
 383:	62 c2 2d 40 7f c2    	vpermt2ps %zmm10,%zmm26,%zmm16
 389:	62 e1 7c 48 28 cf    	vmovaps %zmm7,%zmm17
 38f:	62 c2 25 40 7f c8    	vpermt2ps %zmm8,%zmm27,%zmm17
 395:	62 c1 7c 48 28 d6    	vmovaps %zmm14,%zmm18
 39b:	62 c2 2d 40 7f d5    	vpermt2ps %zmm13,%zmm26,%zmm18
 3a1:	62 c1 7c 48 28 dc    	vmovaps %zmm12,%zmm19
 3a7:	62 a3 f5 40 23 c0 e4 	vshuff64x2 $0xe4,%zmm16,%zmm17,%zmm16
 3ae:	62 c2 25 40 7f db    	vpermt2ps %zmm11,%zmm27,%zmm19
 3b4:	62 a3 e5 40 23 ca e4 	vshuff64x2 $0xe4,%zmm18,%zmm19,%zmm17
 3bb:	62 c1 7c 48 28 d1    	vmovaps %zmm9,%zmm18
 3c1:	62 c2 1d 40 7f d2    	vpermt2ps %zmm10,%zmm28,%zmm18
 3c7:	62 e1 7c 48 28 df    	vmovaps %zmm7,%zmm19
 3cd:	62 c2 15 40 7f d8    	vpermt2ps %zmm8,%zmm29,%zmm19
 3d3:	62 c1 7c 48 28 e6    	vmovaps %zmm14,%zmm20
 3d9:	62 c2 1d 40 7f e5    	vpermt2ps %zmm13,%zmm28,%zmm20
 3df:	62 a3 e5 40 23 d2 e4 	vshuff64x2 $0xe4,%zmm18,%zmm19,%zmm18
 3e6:	62 c1 7c 48 28 dc    	vmovaps %zmm12,%zmm19
 3ec:	62 c2 15 40 7f db    	vpermt2ps %zmm11,%zmm29,%zmm19
 3f2:	62 a3 e5 40 23 dc e4 	vshuff64x2 $0xe4,%zmm20,%zmm19,%zmm19
 3f9:	62 52 0d 40 7f ca    	vpermt2ps %zmm10,%zmm30,%zmm9
 3ff:	62 d2 05 40 7f f8    	vpermt2ps %zmm8,%zmm31,%zmm7
 405:	62 52 0d 40 7f f5    	vpermt2ps %zmm13,%zmm30,%zmm14
 40b:	62 d3 c5 48 23 f9 e4 	vshuff64x2 $0xe4,%zmm9,%zmm7,%zmm7
 412:	62 52 05 40 7f e3    	vpermt2ps %zmm11,%zmm31,%zmm12
 418:	62 53 9d 48 23 c6 e4 	vshuff64x2 $0xe4,%zmm14,%zmm12,%zmm8
 41f:	62 71 7c 48 28 74 24 	vmovaps 0x3c0(%rsp),%zmm14
 426:	0f
 427:	62 51 04 48 5c ce    	vsubps %zmm14,%zmm15,%zmm9
 42d:	62 71 7c 48 28 7c 24 	vmovaps 0x380(%rsp),%zmm15
 434:	0e
 435:	62 51 7c 40 5c d7    	vsubps %zmm15,%zmm16,%zmm10
 43b:	62 e1 7c 48 28 44 24 	vmovaps 0x340(%rsp),%zmm16
 442:	0d
 443:	62 31 6c 40 5c d8    	vsubps %zmm16,%zmm18,%zmm11
 449:	62 51 7c 48 28 e1    	vmovaps %zmm9,%zmm12
 44f:	62 32 35 48 a8 e5    	vfmadd213ps %zmm21,%zmm9,%zmm12
 455:	62 52 2d 48 b8 e2    	vfmadd231ps %zmm10,%zmm10,%zmm12
 45b:	62 52 25 48 b8 e3    	vfmadd231ps %zmm11,%zmm11,%zmm12
 461:	62 52 7d 48 4e ec    	vrsqrt14ps %zmm12,%zmm13
 467:	62 51 1c 48 59 e5    	vmulps %zmm13,%zmm12,%zmm12
 46d:	62 32 15 48 a8 e6    	vfmadd213ps %zmm22,%zmm13,%zmm12
 473:	62 31 14 48 59 ef    	vmulps %zmm23,%zmm13,%zmm13
 479:	62 51 14 48 59 e4    	vmulps %zmm12,%zmm13,%zmm12
 47f:	62 d1 44 48 59 fc    	vmulps %zmm12,%zmm7,%zmm7
 485:	62 51 1c 48 59 e4    	vmulps %zmm12,%zmm12,%zmm12
 48b:	62 f1 1c 48 59 ff    	vmulps %zmm7,%zmm12,%zmm7
 491:	62 d2 45 48 b8 e1    	vfmadd231ps %zmm9,%zmm7,%zmm4
 497:	62 d2 45 48 b8 d2    	vfmadd231ps %zmm10,%zmm7,%zmm2
 49d:	62 d1 4c 48 5c f6    	vsubps %zmm14,%zmm6,%zmm6
 4a3:	62 51 74 40 5c cf    	vsubps %zmm15,%zmm17,%zmm9
 4a9:	62 31 64 40 5c d0    	vsubps %zmm16,%zmm19,%zmm10
 4af:	62 d2 45 48 b8 c3    	vfmadd231ps %zmm11,%zmm7,%zmm0
 4b5:	62 f1 7c 48 28 fe    	vmovaps %zmm6,%zmm7
 4bb:	62 b2 4d 48 a8 fd    	vfmadd213ps %zmm21,%zmm6,%zmm7
 4c1:	62 d2 35 48 b8 f9    	vfmadd231ps %zmm9,%zmm9,%zmm7
 4c7:	62 d2 2d 48 b8 fa    	vfmadd231ps %zmm10,%zmm10,%zmm7
 4cd:	62 72 7d 48 4e df    	vrsqrt14ps %zmm7,%zmm11
 4d3:	62 d1 44 48 59 fb    	vmulps %zmm11,%zmm7,%zmm7
 4d9:	62 b2 25 48 a8 fe    	vfmadd213ps %zmm22,%zmm11,%zmm7
 4df:	62 31 24 48 59 df    	vmulps %zmm23,%zmm11,%zmm11
 4e5:	62 f1 24 48 59 ff    	vmulps %zmm7,%zmm11,%zmm7
 4eb:	62 71 3c 48 59 c7    	vmulps %zmm7,%zmm8,%zmm8
 4f1:	62 f1 44 48 59 ff    	vmulps %zmm7,%zmm7,%zmm7
 4f7:	62 d1 44 48 59 f8    	vmulps %zmm8,%zmm7,%zmm7
 4fd:	62 f2 45 48 b8 ee    	vfmadd231ps %zmm6,%zmm7,%zmm5
 503:	62 d2 45 48 b8 d9    	vfmadd231ps %zmm9,%zmm7,%zmm3
 509:	62 d2 45 48 b8 ca    	vfmadd231ps %zmm10,%zmm7,%zmm1
 50f:	48 05 00 02 00 00    	add    $0x200,%rax
 515:	48 3d 00 10 00 00    	cmp    $0x1000,%rax
 51b:	0f 85 df fd ff ff    	jne    300 <simplified_nbody+0x300>
 521:	62 f1 54 48 58 e4    	vaddps %zmm4,%zmm5,%zmm4
 527:	62 f3 fd 48 1b e5 01 	vextractf64x4 $0x1,%zmm4,%ymm5
 52e:	62 f1 5c 48 58 e5    	vaddps %zmm5,%zmm4,%zmm4
 534:	c4 e3 7d 19 e5 01    	vextractf128 $0x1,%ymm4,%xmm5
 53a:	62 f1 5c 48 58 e5    	vaddps %zmm5,%zmm4,%zmm4
 540:	c4 e3 79 05 ec 01    	vpermilpd $0x1,%xmm4,%xmm5
 546:	62 f1 5c 48 58 e5    	vaddps %zmm5,%zmm4,%zmm4
 54c:	c5 fa 16 ec          	vmovshdup %xmm4,%xmm5
 550:	c5 d8 58 e5          	vaddps %xmm5,%xmm4,%xmm4
 554:	c5 f8 29 64 24 40    	vmovaps %xmm4,0x40(%rsp)
 55a:	62 f1 64 48 58 d2    	vaddps %zmm2,%zmm3,%zmm2
 560:	62 f3 fd 48 1b d3 01 	vextractf64x4 $0x1,%zmm2,%ymm3
 567:	62 f1 6c 48 58 d3    	vaddps %zmm3,%zmm2,%zmm2
 56d:	c4 e3 7d 19 d3 01    	vextractf128 $0x1,%ymm2,%xmm3
 573:	62 f1 6c 48 58 d3    	vaddps %zmm3,%zmm2,%zmm2
 579:	c4 e3 79 05 da 01    	vpermilpd $0x1,%xmm2,%xmm3
 57f:	62 f1 6c 48 58 d3    	vaddps %zmm3,%zmm2,%zmm2
 585:	c5 fa 16 da          	vmovshdup %xmm2,%xmm3
 589:	c5 e8 58 d3          	vaddps %xmm3,%xmm2,%xmm2
 58d:	c5 f8 29 54 24 20    	vmovaps %xmm2,0x20(%rsp)
 593:	62 f1 74 48 58 c0    	vaddps %zmm0,%zmm1,%zmm0
 599:	62 f3 fd 48 1b c1 01 	vextractf64x4 $0x1,%zmm0,%ymm1
 5a0:	62 f1 7c 48 58 c1    	vaddps %zmm1,%zmm0,%zmm0
 5a6:	c4 e3 7d 19 c1 01    	vextractf128 $0x1,%ymm0,%xmm1
 5ac:	62 f1 7c 48 58 c1    	vaddps %zmm1,%zmm0,%zmm0
 5b2:	c4 e3 79 05 c8 01    	vpermilpd $0x1,%xmm0,%xmm1
 5b8:	62 f1 7c 48 58 c1    	vaddps %zmm1,%zmm0,%zmm0
 5be:	c5 fa 16 c8          	vmovshdup %xmm0,%xmm1
 5c2:	c5 f8 58 c1          	vaddps %xmm1,%xmm0,%xmm0
 5c6:	c5 f8 29 44 24 30    	vmovaps %xmm0,0x30(%rsp)
 5cc:	c5 f8 77             	vzeroupper
 5cf:	41 ff d5             	call   *%r13
 5d2:	41 81 c7 00 01 00 00 	add    $0x100,%r15d
 5d9:	41 83 c4 01          	add    $0x1,%r12d
 5dd:	44 3b 7c 24 1c       	cmp    0x1c(%rsp),%r15d
 5e2:	0f 82 68 fc ff ff    	jb     250 <simplified_nbody+0x250>
 5e8:	eb 1e                	jmp    608 <simplified_nbody+0x608>
 5ea:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
 5ee:	c5 f8 29 44 24 40    	vmovaps %xmm0,0x40(%rsp)
 5f4:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
 5f8:	c5 f8 29 44 24 20    	vmovaps %xmm0,0x20(%rsp)
 5fe:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
 602:	c5 f8 29 44 24 30    	vmovaps %xmm0,0x30(%rsp)
 608:	48 8b 44 24 50       	mov    0x50(%rsp),%rax
 60d:	c5 fa 10 00          	vmovss (%rax),%xmm0
 611:	c5 f8 28 54 24 40    	vmovaps 0x40(%rsp),%xmm2
 617:	c4 e2 79 a9 54 24 10 	vfmadd213ss 0x10(%rsp),%xmm0,%xmm2
 61e:	c5 f8 28 5c 24 20    	vmovaps 0x20(%rsp),%xmm3
 624:	c4 e2 79 a9 5c 24 14 	vfmadd213ss 0x14(%rsp),%xmm0,%xmm3
 62b:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
 632:	00 00 00
 635:	48 8b 4c 24 58       	mov    0x58(%rsp),%rcx
 63a:	c5 fa 10 0c 01       	vmovss (%rcx,%rax,1),%xmm1
 63f:	c5 f8 28 64 24 30    	vmovaps 0x30(%rsp),%xmm4
 645:	c4 e2 79 a9 64 24 18 	vfmadd213ss 0x18(%rsp),%xmm0,%xmm4
 64c:	c5 ea 59 d1          	vmulss %xmm1,%xmm2,%xmm2
 650:	c5 e2 59 d9          	vmulss %xmm1,%xmm3,%xmm3
 654:	c5 da 59 c9          	vmulss %xmm1,%xmm4,%xmm1
 658:	48 8b 44 24 70       	mov    0x70(%rsp),%rax
 65d:	48 8b 4c 24 60       	mov    0x60(%rsp),%rcx
 662:	c5 fa 10 24 08       	vmovss (%rax,%rcx,1),%xmm4
 667:	c4 e2 69 b9 e0       	vfmadd231ss %xmm0,%xmm2,%xmm4
 66c:	c5 fa 11 24 08       	vmovss %xmm4,(%rax,%rcx,1)
 671:	c5 fa 10 64 08 04    	vmovss 0x4(%rax,%rcx,1),%xmm4
 677:	c4 e2 61 b9 e0       	vfmadd231ss %xmm0,%xmm3,%xmm4
 67c:	c5 fa 11 64 08 04    	vmovss %xmm4,0x4(%rax,%rcx,1)
 682:	c4 e2 71 a9 44 08 08 	vfmadd213ss 0x8(%rax,%rcx,1),%xmm1,%xmm0
 689:	c5 fa 11 44 08 08    	vmovss %xmm0,0x8(%rax,%rcx,1)
 68f:	48 8b 44 24 68       	mov    0x68(%rsp),%rax
 694:	c5 fa 11 10          	vmovss %xmm2,(%rax)
 698:	c5 fa 11 58 04       	vmovss %xmm3,0x4(%rax)
 69d:	c5 fa 11 48 08       	vmovss %xmm1,0x8(%rax)
 6a2:	48 8d 65 d8          	lea    -0x28(%rbp),%rsp
 6a6:	5b                   	pop    %rbx
 6a7:	41 5c                	pop    %r12
 6a9:	41 5d                	pop    %r13
 6ab:	41 5e                	pop    %r14
 6ad:	41 5f                	pop    %r15
 6af:	5d                   	pop    %rbp
 6b0:	c3                   	ret

Host-Compute (ARM CPU)

nbody_aarch64.bin:	file format elf64-littleaarch64


Disassembly of section .text:

0000000000000000 <simplified_nbody>:
       0: ff c3 04 d1  	sub	sp, sp, #304
       4: 08 00 00 90  	adrp	x8, 0x0 <simplified_nbody+0x4>
       8: ed 33 0a 6d  	stp	d13, d12, [sp, #160]
       c: eb 2b 0b 6d  	stp	d11, d10, [sp, #176]
      10: e9 23 0c 6d  	stp	d9, d8, [sp, #192]
      14: fd 7b 0d a9  	stp	x29, x30, [sp, #208]
      18: fd 43 03 91  	add	x29, sp, #208
      1c: fc 6f 0e a9  	stp	x28, x27, [sp, #224]
      20: fa 67 0f a9  	stp	x26, x25, [sp, #240]
      24: f8 5f 10 a9  	stp	x24, x23, [sp, #256]
      28: f6 57 11 a9  	stp	x22, x21, [sp, #272]
      2c: f4 4f 12 a9  	stp	x20, x19, [sp, #288]
      30: 08 01 40 f9  	ldr	x8, [x8]
      34: 17 01 40 b9  	ldr	w23, [x8]
      38: 88 01 80 52  	mov	w8, #12
      3c: f6 0a 08 9b  	madd	x22, x23, x8, x2
      40: 08 00 00 90  	adrp	x8, 0x0 <simplified_nbody+0x40>
      44: f8 03 16 aa  	mov	x24, x22
      48: 08 01 40 f9  	ldr	x8, [x8]
      4c: c8 02 40 fd  	ldr	d8, [x22]
      50: 09 8f 40 bc  	ldr	s9, [x24, #8]!
      54: 19 01 40 b9  	ldr	w25, [x8]
      58: 79 0d 00 34  	cbz	w25, 0x204 <simplified_nbody+0x204>
      5c: 03 e4 00 2f  	movi	d3, #0000000000000000
      60: 08 10 17 8b  	add	x8, x0, x23, lsl #4
      64: 1c 00 00 90  	adrp	x28, 0x0 <simplified_nbody+0x64>
      68: e3 07 00 a9  	stp	x3, x1, [sp]
      6c: 13 00 00 90  	adrp	x19, 0x0 <simplified_nbody+0x6c>
      70: 14 00 00 90  	adrp	x20, 0x0 <simplified_nbody+0x70>
      74: f5 03 00 aa  	mov	x21, x0
      78: fa 03 1f 2a  	mov	w26, wzr
      7c: 9c 03 40 f9  	ldr	x28, [x28]
      80: a3 03 9a 3c  	stur	q3, [x29, #-96]
      84: 00 05 40 2d  	ldp	s0, s1, [x8]
      88: 02 09 40 bd  	ldr	s2, [x8, #8]
      8c: e8 e2 96 52  	mov	w8, #46871
      90: 03 f6 03 4f  	fmov	v3.4s, #1.00000000
      94: 28 1a a7 72  	movk	w8, #14545, lsl #16
      98: fb 03 1f 2a  	mov	w27, wzr
      9c: e3 17 80 3d  	str	q3, [sp, #80]
      a0: 03 04 04 4e  	dup	v3.4s, v0.s[0]
      a4: 00 0d 04 4e  	dup	v0.4s, w8
      a8: 73 02 40 f9  	ldr	x19, [x19]
      ac: e0 8f 01 ad  	stp	q0, q3, [sp, #48]
      b0: 00 e4 00 2f  	movi	d0, #0000000000000000
      b4: 23 04 04 4e  	dup	v3.4s, v1.s[0]
      b8: e0 1b 80 3d  	str	q0, [sp, #96]
      bc: 00 e4 00 2f  	movi	d0, #0000000000000000
      c0: a0 03 9b 3c  	stur	q0, [x29, #-80]
      c4: 40 04 04 4e  	dup	v0.4s, v2.s[0]
      c8: 94 02 40 f9  	ldr	x20, [x20]
      cc: e0 8f 00 ad  	stp	q0, q3, [sp, #16]
      d0: 88 03 40 b9  	ldr	w8, [x28]
      d4: 09 21 1b 0b  	add	w9, w8, w27, lsl #8
      d8: a0 5a e9 3c  	ldr	q0, [x21, w9, uxtw #4]
      dc: 60 7a a8 3c  	str	q0, [x19, x8, lsl #4]
      e0: 80 02 3f d6  	blr	x20
      e4: 00 e4 00 6f  	movi	v0.2d, #0000000000000000
      e8: a4 03 da 3c  	ldur	q4, [x29, #-96]
      ec: 02 e4 00 6f  	movi	v2.2d, #0000000000000000
      f0: e8 03 1f aa  	mov	x8, xzr
      f4: 03 e4 00 6f  	movi	v3.2d, #0000000000000000
      f8: eb ab 41 ad  	ldp	q11, q10, [sp, #48]
      fc: 80 04 04 6e  	mov	v0.s[0], v4.s[0]
     100: 01 e4 00 6f  	movi	v1.2d, #0000000000000000
     104: 05 e4 00 6f  	movi	v5.2d, #0000000000000000
     108: ff 93 42 ad  	ldp	q31, q4, [sp, #80]
     10c: 82 04 04 6e  	mov	v2.s[0], v4.s[0]
     110: a4 03 db 3c  	ldur	q4, [x29, #-80]
     114: ed b3 40 ad  	ldp	q13, q12, [sp, #16]
     118: 83 04 04 6e  	mov	v3.s[0], v4.s[0]
     11c: 04 e4 00 6f  	movi	v4.2d, #0000000000000000
     120: 69 02 08 8b  	add	x9, x19, x8
     124: 08 01 02 91  	add	x8, x8, #128
     128: 67 1d ab 4e  	mov	v7.16b, v11.16b
     12c: 1f 05 40 f1  	cmp	x8, #1, lsl #12
     130: 7b 1d ab 4e  	mov	v27.16b, v11.16b
     134: 30 09 df 4c  	ld4	{ v16.4s, v17.4s, v18.4s, v19.4s }, [x9], #64
     138: 06 d6 aa 4e  	fsub	v6.4s, v16.4s, v10.4s
     13c: 38 d6 ac 4e  	fsub	v24.4s, v17.4s, v12.4s
     140: 59 d6 ad 4e  	fsub	v25.4s, v18.4s, v13.4s
     144: c7 cc 26 4e  	fmla	v7.4s, v6.4s, v6.4s
     148: 07 cf 38 4e  	fmla	v7.4s, v24.4s, v24.4s
     14c: 34 09 40 4c  	ld4	{ v20.4s, v21.4s, v22.4s, v23.4s }, [x9]
     150: 27 cf 39 4e  	fmla	v7.4s, v25.4s, v25.4s
     154: e7 f8 a1 6e  	fsqrt	v7.4s, v7.4s
     158: 9a d6 aa 4e  	fsub	v26.4s, v20.4s, v10.4s
     15c: bc d6 ac 4e  	fsub	v28.4s, v21.4s, v12.4s
     160: dd d6 ad 4e  	fsub	v29.4s, v22.4s, v13.4s
     164: 5b cf 3a 4e  	fmla	v27.4s, v26.4s, v26.4s
     168: e7 ff 27 6e  	fdiv	v7.4s, v31.4s, v7.4s
     16c: 9b cf 3c 4e  	fmla	v27.4s, v28.4s, v28.4s
     170: bb cf 3d 4e  	fmla	v27.4s, v29.4s, v29.4s
     174: 7b fb a1 6e  	fsqrt	v27.4s, v27.4s
     178: fe dc 27 6e  	fmul	v30.4s, v7.4s, v7.4s
     17c: 67 de 27 6e  	fmul	v7.4s, v19.4s, v7.4s
     180: c7 df 27 6e  	fmul	v7.4s, v30.4s, v7.4s
     184: fb ff 3b 6e  	fdiv	v27.4s, v31.4s, v27.4s
     188: e3 cc 26 4e  	fmla	v3.4s, v7.4s, v6.4s
     18c: e2 cc 38 4e  	fmla	v2.4s, v7.4s, v24.4s
     190: e0 cc 39 4e  	fmla	v0.4s, v7.4s, v25.4s
     194: 70 df 3b 6e  	fmul	v16.4s, v27.4s, v27.4s
     198: f1 de 3b 6e  	fmul	v17.4s, v23.4s, v27.4s
     19c: 10 de 31 6e  	fmul	v16.4s, v16.4s, v17.4s
     1a0: 05 ce 3a 4e  	fmla	v5.4s, v16.4s, v26.4s
     1a4: 04 ce 3c 4e  	fmla	v4.4s, v16.4s, v28.4s
     1a8: 01 ce 3d 4e  	fmla	v1.4s, v16.4s, v29.4s
     1ac: a1 fb ff 54  	b.ne	0x120 <simplified_nbody+0x120>
     1b0: a3 d4 23 4e  	fadd	v3.4s, v5.4s, v3.4s
     1b4: 20 d4 20 4e  	fadd	v0.4s, v1.4s, v0.4s
     1b8: 82 d4 22 4e  	fadd	v2.4s, v4.4s, v2.4s
     1bc: 61 d4 20 6e  	faddp	v1.4s, v3.4s, v0.4s
     1c0: 42 d4 20 6e  	faddp	v2.4s, v2.4s, v0.4s
     1c4: 00 d4 20 6e  	faddp	v0.4s, v0.4s, v0.4s
     1c8: 21 d8 30 7e  	faddp	s1, v1.2s
     1cc: 00 d8 30 7e  	faddp	s0, v0.2s
     1d0: a0 07 3d ad  	stp	q0, q1, [x29, #-96]
     1d4: 41 d8 30 7e  	faddp	s1, v2.2s
     1d8: e1 1b 80 3d  	str	q1, [sp, #96]
     1dc: 80 02 3f d6  	blr	x20
     1e0: 5a 03 04 11  	add	w26, w26, #256
     1e4: 7b 07 00 11  	add	w27, w27, #1
     1e8: 5f 03 19 6b  	cmp	w26, w25
     1ec: 23 f7 ff 54  	b.lo	0xd0 <simplified_nbody+0xd0>
     1f0: a2 07 7d ad  	ldp	q2, q1, [x29, #-96]
     1f4: e0 1b c0 3d  	ldr	q0, [sp, #96]
     1f8: e3 07 40 a9  	ldp	x3, x1, [sp]
     1fc: 01 04 0c 6e  	mov	v1.s[1], v0.s[0]
     200: 03 00 00 14  	b	0x20c <simplified_nbody+0x20c>
     204: 01 e4 00 2f  	movi	d1, #0000000000000000
     208: 02 e4 00 2f  	movi	d2, #0000000000000000
     20c: e8 ce 97 52  	mov	w8, #48759
     210: 60 00 40 bd  	ldr	s0, [x3]
     214: e8 ef a7 72  	movk	w8, #16255, lsl #16
     218: 29 10 17 8b  	add	x9, x1, x23, lsl #4
     21c: f4 4f 52 a9  	ldp	x20, x19, [sp, #288]
     220: 28 10 80 0f  	fmla	v8.2s, v1.2s, v0.s[0]
     224: 02 24 02 1f  	fmadd	s2, s0, s2, s9
     228: 01 0d 04 0e  	dup	v1.2s, w8
     22c: 03 01 27 1e  	fmov	s3, w8
     230: 24 01 40 fd  	ldr	d4, [x9]
     234: fa 67 4f a9  	ldp	x26, x25, [sp, #240]
     238: 42 08 23 1e  	fmul	s2, s2, s3
     23c: 01 dd 21 2e  	fmul	v1.2s, v8.2s, v1.2s
     240: 23 09 40 bd  	ldr	s3, [x9, #8]
     244: fc 6f 4e a9  	ldp	x28, x27, [sp, #224]
     248: 02 03 00 bd  	str	s2, [x24]
     24c: 24 10 80 0f  	fmla	v4.2s, v1.2s, v0.s[0]
     250: 40 0c 00 1f  	fmadd	s0, s2, s0, s3
     254: c1 02 00 fd  	str	d1, [x22]
     258: f6 57 51 a9  	ldp	x22, x21, [sp, #272]
     25c: f8 5f 50 a9  	ldp	x24, x23, [sp, #256]
     260: 24 01 00 fd  	str	d4, [x9]
     264: fd 7b 4d a9  	ldp	x29, x30, [sp, #208]
     268: 20 09 00 bd  	str	s0, [x9, #8]
     26c: e9 23 4c 6d  	ldp	d9, d8, [sp, #192]
     270: eb 2b 4b 6d  	ldp	d11, d10, [sp, #176]
     274: ed 33 4a 6d  	ldp	d13, d12, [sp, #160]
     278: ff c3 04 91  	add	sp, sp, #304
     27c: c0 03 5f d6  	ret

Metal / AIR

Note that the compiler would usually directly output a .metallib file. The output below comes from disassembling it with metallib-dis (provided by the toolchain).

; ModuleID = 'bc_module'
source_filename = "simplified_nbody.cpp"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
target triple = "air64-apple-macosx11.0.0"

%class.vector4 = type { %union.anon }
%union.anon = type { %struct.anon }
%struct.anon = type { float, float, float, float }
%class.vector3 = type { %union.anon.8 }
%union.anon.8 = type { %struct.anon.9 }
%struct.anon.9 = type { float, float, float }

@_ZZ16simplified_nbodyE20local_body_positions = internal addrspace(3) unnamed_addr global [256 x %class.vector4] undef, align 16

; Function Attrs: convergent nounwind
define void @simplified_nbody(%class.vector4 addrspace(1)* noalias nocapture readonly, %class.vector4 addrspace(1)* noalias nocapture, %class.vector3 addrspace(1)* noalias nocapture, float addrspace(2)* noalias nocapture readonly dereferenceable(4), <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, i32, i32, i32, i32) local_unnamed_addr #0 !reqd_work_group_size !33 {
  %15 = extractelement <3 x i32> %4, i32 0
  %16 = zext i32 %15 to i64
  %17 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %16, i32 0, i32 0, i32 0
  %18 = load float, float addrspace(1)* %17, align 4
  %19 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %16, i32 0, i32 0, i32 1
  %20 = load float, float addrspace(1)* %19, align 4
  %21 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %16, i32 0, i32 0, i32 2
  %22 = load float, float addrspace(1)* %21, align 4
  %23 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %16, i32 0, i32 0, i32 0
  %24 = load float, float addrspace(1)* %23, align 4
  %25 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %16, i32 0, i32 0, i32 1
  %26 = load float, float addrspace(1)* %25, align 4
  %27 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %16, i32 0, i32 0, i32 2
  %28 = load float, float addrspace(1)* %27, align 4
  %29 = extractelement <3 x i32> %5, i32 0
  %30 = extractelement <3 x i32> %6, i32 0
  %31 = zext i32 %30 to i64
  %32 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %31, i32 0, i32 0, i32 0
  %33 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %31, i32 0, i32 0, i32 1
  %34 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %31, i32 0, i32 0, i32 2
  %35 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %31, i32 0, i32 0, i32 3
  br label %59

; <label>:36:                                     ; preds = %76
  %37 = load float, float addrspace(2)* %3, align 4
  %38 = fmul fast float %37, %110
  %39 = fmul fast float %37, %111
  %40 = fmul fast float %37, %112
  %41 = fadd fast float %38, %24
  %42 = fadd fast float %39, %26
  %43 = fadd fast float %40, %28
  %44 = fmul fast float %41, 0x3FEFF7CEE0000000
  %45 = fmul fast float %42, 0x3FEFF7CEE0000000
  %46 = fmul fast float %43, 0x3FEFF7CEE0000000
  %47 = fmul fast float %44, %37
  %48 = fmul fast float %45, %37
  %49 = fmul fast float %46, %37
  %50 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %16, i32 0, i32 0, i32 0
  %51 = load float, float addrspace(1)* %50, align 4, !tbaa !34
  %52 = fadd fast float %47, %51
  store float %52, float addrspace(1)* %50, align 4, !tbaa !34
  %53 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %16, i32 0, i32 0, i32 1
  %54 = load float, float addrspace(1)* %53, align 4, !tbaa !34
  %55 = fadd fast float %48, %54
  store float %55, float addrspace(1)* %53, align 4, !tbaa !34
  %56 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %16, i32 0, i32 0, i32 2
  %57 = load float, float addrspace(1)* %56, align 4, !tbaa !34
  %58 = fadd fast float %57, %49
  store float %58, float addrspace(1)* %56, align 4, !tbaa !34
  store float %44, float addrspace(1)* %23, align 4, !tbaa !34
  store float %45, float addrspace(1)* %25, align 4, !tbaa !34
  store float %46, float addrspace(1)* %27, align 4, !tbaa !34
  ret void

; <label>:59:                                     ; preds = %76, %14
  %60 = phi i32 [ 0, %14 ], [ %77, %76 ]
  %61 = phi i32 [ 0, %14 ], [ %78, %76 ]
  %62 = phi float [ 0.000000e+00, %14 ], [ %112, %76 ]
  %63 = phi float [ 0.000000e+00, %14 ], [ %111, %76 ]
  %64 = phi float [ 0.000000e+00, %14 ], [ %110, %76 ]
  %65 = shl i32 %61, 8
  %66 = add i32 %30, %65
  %67 = zext i32 %66 to i64
  %68 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %67, i32 0, i32 0, i32 0
  %69 = load float, float addrspace(1)* %68, align 4
  %70 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %67, i32 0, i32 0, i32 1
  %71 = load float, float addrspace(1)* %70, align 4
  %72 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %67, i32 0, i32 0, i32 2
  %73 = load float, float addrspace(1)* %72, align 4
  %74 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %67, i32 0, i32 0, i32 3
  %75 = load float, float addrspace(1)* %74, align 4
  store float %69, float addrspace(3)* %32, align 4, !tbaa !34
  store float %71, float addrspace(3)* %33, align 4, !tbaa !34
  store float %73, float addrspace(3)* %34, align 4, !tbaa !34
  store float %75, float addrspace(3)* %35, align 4, !tbaa !34
  tail call void @air.wg.barrier(i32 2, i32 1) #3
  br label %80

; <label>:76:                                     ; preds = %80
  tail call void @air.wg.barrier(i32 2, i32 1) #3
  %77 = add i32 %60, 256
  %78 = add i32 %61, 1
  %79 = icmp ult i32 %77, %29
  br i1 %79, label %59, label %36

; <label>:80:                                     ; preds = %80, %59
  %81 = phi i32 [ 0, %59 ], [ %113, %80 ]
  %82 = phi float [ %62, %59 ], [ %112, %80 ]
  %83 = phi float [ %63, %59 ], [ %111, %80 ]
  %84 = phi float [ %64, %59 ], [ %110, %80 ]
  %85 = zext i32 %81 to i64
  %86 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %85, i32 0, i32 0, i32 0
  %87 = load float, float addrspace(3)* %86, align 4
  %88 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %85, i32 0, i32 0, i32 1
  %89 = load float, float addrspace(3)* %88, align 4
  %90 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %85, i32 0, i32 0, i32 2
  %91 = load float, float addrspace(3)* %90, align 4
  %92 = fsub fast float %87, %18
  %93 = fsub fast float %89, %20
  %94 = fsub fast float %91, %22
  %95 = fmul fast float %92, %92
  %96 = fmul fast float %93, %93
  %97 = fmul fast float %94, %94
  %98 = fadd fast float %95, 0x3F1A36E2E0000000
  %99 = fadd fast float %98, %96
  %100 = fadd fast float %99, %97
  %101 = tail call fast float @air.fast_rsqrt.f32(float %100) #4
  %102 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %85, i32 0, i32 0, i32 3
  %103 = load float, float addrspace(3)* %102, align 4, !tbaa !34
  %104 = fmul fast float %101, %101
  %105 = fmul fast float %104, %101
  %106 = fmul fast float %105, %103
  %107 = fmul fast float %106, %92
  %108 = fmul fast float %106, %93
  %109 = fmul fast float %106, %94
  %110 = fadd fast float %107, %84
  %111 = fadd fast float %108, %83
  %112 = fadd fast float %109, %82
  %113 = add nuw nsw i32 %81, 1
  %114 = icmp eq i32 %113, 256
  br i1 %114, label %76, label %80
}

; Function Attrs: convergent nounwind readnone
declare float @air.fast_rsqrt.f32(float) local_unnamed_addr #1

; Function Attrs: convergent noduplicate
declare void @air.wg.barrier(i32, i32) local_unnamed_addr #2

attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="true" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #1 = { convergent nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="true" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #2 = { convergent noduplicate "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="true" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #3 = { convergent noduplicate nounwind }
attributes #4 = { convergent nounwind readnone }

!air.kernel = !{!0}
!air.version = !{!18}
!air.language_version = !{!19}
!air.compile_options = !{!20, !21, !22}
!air.source_file_name = !{!23}
!llvm.module.flags = !{!24, !25, !26, !27, !28, !29, !30, !31}
!llvm.ident = !{!32}

!0 = !{void (%class.vector4 addrspace(1)*, %class.vector4 addrspace(1)*, %class.vector3 addrspace(1)*, float addrspace(2)*, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, i32, i32, i32, i32)* @simplified_nbody, !1, !2, !17}
!1 = !{}
!2 = !{!3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16}
!3 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.read", !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 16, !"air.arg_type_name", !"const vector4<float>", !"air.arg_name", !"in_positions"}
!4 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1, !"air.read_write", !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 16, !"air.arg_type_name", !"vector4<float>", !"air.arg_name", !"out_positions"}
!5 = !{i32 2, !"air.buffer", !"air.location_index", i32 2, i32 1, !"air.read_write", !"air.arg_type_size", i32 12, !"air.arg_type_align_size", i32 12, !"air.arg_type_name", !"vector3<float>", !"air.arg_name", !"inout_velocities"}
!6 = !{i32 3, !"air.buffer", !"air.buffer_size", i32 4, !"air.location_index", i32 3, i32 1, !"air.read", !"air.arg_type_size", i32 4, !"air.arg_type_align_size", i32 4, !"air.arg_type_name", !"float", !"air.arg_name", !"time_delta"}
!7 = !{i32 4, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__global_id__"}
!8 = !{i32 5, !"air.threads_per_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__global_size__"}
!9 = !{i32 6, !"air.thread_position_in_threadgroup", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__local_id__"}
!10 = !{i32 7, !"air.threads_per_threadgroup", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__local_size__"}
!11 = !{i32 8, !"air.threadgroup_position_in_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__group_id__"}
!12 = !{i32 9, !"air.threadgroups_per_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__group_size__"}
!13 = !{i32 10, !"air.simdgroup_index_in_threadgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__sub_group_id__"}
!14 = !{i32 11, !"air.thread_index_in_simdgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__sub_group_local_id__"}
!15 = !{i32 12, !"air.threads_per_simdgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__sub_group_size__"}
!16 = !{i32 13, !"air.simdgroups_per_threadgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__num_sub_groups__"}
!17 = !{!"air.max_work_group_size", i32 256}
!18 = !{i32 2, i32 3, i32 0}
!19 = !{!"Metal", i32 2, i32 3, i32 0}
!20 = !{!"air.compile.denorms_disable"}
!21 = !{!"air.compile.fast_math_enable"}
!22 = !{!"air.compile.framebuffer_fetch_enable"}
!23 = !{!"simplified_nbody.cpp"}
!24 = !{i32 7, !"air.max_device_buffers", i32 31}
!25 = !{i32 7, !"air.max_constant_buffers", i32 31}
!26 = !{i32 7, !"air.max_threadgroup_buffers", i32 31}
!27 = !{i32 7, !"air.max_textures", i32 128}
!28 = !{i32 7, !"air.max_read_write_textures", i32 8}
!29 = !{i32 7, !"air.max_samplers", i32 16}
!30 = !{i32 1, !"wchar_size", i32 4}
!31 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 0]}
!32 = !{!"Apple LLVM version 31001.143 (metalfe-31001.143)"}
!33 = !{i32 256, i32 1, i32 1}
!34 = !{!35, !35, i64 0}
!35 = !{!"omnipotent char", !36, i64 0}
!36 = !{!"Simple C++ TBAA"}

OpenCL / SPIR

Note that the compiler would usually directly output a .bc file. The output below comes from disassembling it with llvm-dis (provided by the toolchain). Also note that the bitcode file is exported in a LLVM 3.2 / SPIR 1.2 compatible format, but the output below uses LLVM 8.0 syntax.

; ModuleID = 'spir.bc'
source_filename = "spir.bc"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
target triple = "spir64-unknown-unknown"

%class.vector4 = type { %union.anon }
%union.anon = type { %struct.anon }
%struct.anon = type { float, float, float, float }
%class.vector3 = type { %union.anon.8 }
%union.anon.8 = type { %struct.anon.9 }
%struct.anon.9 = type { float, float, float }

@simplified_nbody.local_body_positions = internal unnamed_addr addrspace(3) global [256 x %class.vector4] zeroinitializer, align 4

define floor_kernel void @simplified_nbody(%class.vector4 addrspace(1)*, %class.vector4 addrspace(1)*, %class.vector3 addrspace(1)*, float) {
  %5 = tail call floor_func i64 @_Z13get_global_idj(i32 0), !range !13
  %6 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %5, i32 0, i32 0, i32 0
  %7 = load float, float addrspace(1)* %6, align 4
  %8 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %5, i32 0, i32 0, i32 1
  %9 = load float, float addrspace(1)* %8, align 4
  %10 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %5, i32 0, i32 0, i32 2
  %11 = load float, float addrspace(1)* %10, align 4
  %12 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %5, i32 0, i32 0, i32 0
  %13 = load float, float addrspace(1)* %12, align 4
  %14 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %5, i32 0, i32 0, i32 1
  %15 = load float, float addrspace(1)* %14, align 4
  %16 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %5, i32 0, i32 0, i32 2
  %17 = load float, float addrspace(1)* %16, align 4
  %18 = tail call floor_func i64 @_Z15get_global_sizej(i32 0), !range !14
  %19 = trunc i64 %18 to i32, !range !15
  %20 = tail call floor_func i64 @_Z12get_local_idj(i32 0), !range !16
  %21 = trunc i64 %20 to i32, !range !17
  %22 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 0
  %23 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 1
  %24 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 2
  %25 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 3
  br label %48

; <label>:26:                                     ; preds = %65
  %27 = fmul float %98, %3
  %28 = fmul float %99, %3
  %29 = fmul float %100, %3
  %30 = fadd float %27, %13
  %31 = fadd float %28, %15
  %32 = fadd float %29, %17
  %33 = fmul float %30, 0x3FEFF7CEE0000000
  %34 = fmul float %31, 0x3FEFF7CEE0000000
  %35 = fmul float %32, 0x3FEFF7CEE0000000
  %36 = fmul float %33, %3
  %37 = fmul float %34, %3
  %38 = fmul float %35, %3
  %39 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %5, i32 0, i32 0, i32 0
  %40 = load float, float addrspace(1)* %39, align 4, !tbaa !18
  %41 = fadd float %40, %36
  store float %41, float addrspace(1)* %39, align 4, !tbaa !18
  %42 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %5, i32 0, i32 0, i32 1
  %43 = load float, float addrspace(1)* %42, align 4, !tbaa !18
  %44 = fadd float %43, %37
  store float %44, float addrspace(1)* %42, align 4, !tbaa !18
  %45 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %5, i32 0, i32 0, i32 2
  %46 = load float, float addrspace(1)* %45, align 4, !tbaa !18
  %47 = fadd float %46, %38
  store float %47, float addrspace(1)* %45, align 4, !tbaa !18
  store float %33, float addrspace(1)* %12, align 4, !tbaa !18
  store float %34, float addrspace(1)* %14, align 4, !tbaa !18
  store float %35, float addrspace(1)* %16, align 4, !tbaa !18
  ret void

; <label>:48:                                     ; preds = %65, %4
  %49 = phi i32 [ 0, %4 ], [ %66, %65 ]
  %50 = phi i32 [ 0, %4 ], [ %67, %65 ]
  %51 = phi float [ 0.000000e+00, %4 ], [ %100, %65 ]
  %52 = phi float [ 0.000000e+00, %4 ], [ %99, %65 ]
  %53 = phi float [ 0.000000e+00, %4 ], [ %98, %65 ]
  %54 = shl i32 %50, 8
  %55 = add i32 %54, %21
  %56 = zext i32 %55 to i64
  %57 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 0
  %58 = load float, float addrspace(1)* %57, align 4
  %59 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 1
  %60 = load float, float addrspace(1)* %59, align 4
  %61 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 2
  %62 = load float, float addrspace(1)* %61, align 4
  %63 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 3
  %64 = load float, float addrspace(1)* %63, align 4
  store float %58, float addrspace(3)* %22, align 4, !tbaa !18
  store float %60, float addrspace(3)* %23, align 4, !tbaa !18
  store float %62, float addrspace(3)* %24, align 4, !tbaa !18
  store float %64, float addrspace(3)* %25, align 4, !tbaa !18
  tail call floor_func void @_Z7barrierj(i32 1)
  br label %69

; <label>:65:                                     ; preds = %69
  tail call floor_func void @_Z7barrierj(i32 1)
  %66 = add i32 %49, 256
  %67 = add i32 %50, 1
  %68 = icmp ult i32 %66, %19
  br i1 %68, label %48, label %26

; <label>:69:                                     ; preds = %69, %48
  %70 = phi i64 [ 0, %48 ], [ %101, %69 ]
  %71 = phi float [ %51, %48 ], [ %100, %69 ]
  %72 = phi float [ %52, %48 ], [ %99, %69 ]
  %73 = phi float [ %53, %48 ], [ %98, %69 ]
  %74 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 0
  %75 = load float, float addrspace(3)* %74, align 4
  %76 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 1
  %77 = load float, float addrspace(3)* %76, align 4
  %78 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 2
  %79 = load float, float addrspace(3)* %78, align 4
  %80 = fsub float %75, %7
  %81 = fsub float %77, %9
  %82 = fsub float %79, %11
  %83 = fmul float %80, %80
  %84 = fmul float %81, %81
  %85 = fmul float %82, %82
  %86 = fadd float %83, 0x3F1A36E2E0000000
  %87 = fadd float %86, %84
  %88 = fadd float %87, %85
  %89 = tail call floor_func float @_Z5rsqrtf(float %88)
  %90 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 3
  %91 = load float, float addrspace(3)* %90, align 4, !tbaa !18
  %92 = fmul float %89, %89
  %93 = fmul float %92, %89
  %94 = fmul float %93, %91
  %95 = fmul float %94, %80
  %96 = fmul float %94, %81
  %97 = fmul float %94, %82
  %98 = fadd float %95, %73
  %99 = fadd float %96, %72
  %100 = fadd float %97, %71
  %101 = add nuw nsw i64 %70, 1
  %102 = icmp eq i64 %101, 256
  br i1 %102, label %65, label %69
}

declare floor_func i64 @_Z13get_global_idj(i32)

declare floor_func i64 @_Z15get_global_sizej(i32)

declare floor_func i64 @_Z12get_local_idj(i32)

declare floor_func float @_Z5rsqrtf(float)

declare floor_func void @_Z7barrierj(i32)

!opencl.kernels = !{!0}
!llvm.module.flags = !{!7}
!opencl.ocl.version = !{!8}
!opencl.spir.version = !{!8}
!opencl.enable.FP_CONTRACT = !{}
!opencl.used.extensions = !{!9}
!opencl.used.optional.core.features = !{!10}
!opencl.compiler.options = !{!11}
!llvm.ident = !{!12}

!0 = !{void (%class.vector4 addrspace(1)*, %class.vector4 addrspace(1)*, %class.vector3 addrspace(1)*, float)* @simplified_nbody, !1, !2, !3, !4, !5, !6}
!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 0}
!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"}
!3 = !{!"kernel_arg_type", !"compute_global_buffer<const float4>", !"compute_global_buffer<float4>", !"compute_global_buffer<float3>", !"param<float>"}
!4 = !{!"kernel_arg_base_type", !"struct __class vector4<float>*", !"struct __class vector4<float>*", !"struct __class vector3<float>*", !"float"}
!5 = !{!"kernel_arg_type_qual", !"restrict const", !"restrict", !"restrict", !"const"}
!6 = !{!"kernel_arg_name", !"in_positions", !"out_positions", !"inout_velocities", !"time_delta"}
!7 = !{i32 1, !"wchar_size", i32 4}
!8 = !{i32 1, i32 2}
!9 = !{!"cl_khr_byte_addressable_store", !"cl_khr_fp16", !"cl_khr_fp64", !"cl_khr_global_int32_base_atomics", !"cl_khr_global_int32_extended_atomics", !"cl_khr_local_int32_base_atomics", !"cl_khr_local_int32_extended_atomics", !"cl_khr_gl_msaa_sharing", !"cl_khr_mipmap_image", !"cl_khr_mipmap_image_writes"}
!10 = !{!"cl_doubles"}
!11 = !{!"-cl-kernel-arg-info", !"-cl-mad-enable", !"-cl-denorms-are-zero", !"-cl-unsafe-math-optimizations"}
!12 = !{!"clang version 8.0.0 (ssh://a2git/clang_bleeding_edge.git c39607838f2b421540b8e9ddf71e03101218afc2) (ssh://a2git/llvm_bleeding_edge.git 27830df56091d37ab3a605462417856d2d382d6d)"}
!13 = !{i64 0, i64 4294967295}
!14 = !{i64 1, i64 4294967295}
!15 = !{i32 1, i32 -1}
!16 = !{i64 0, i64 2048}
!17 = !{i32 0, i32 2048}
!18 = !{!19, !19, i64 0}
!19 = !{!"omnipotent char", !20, i64 0}
!20 = !{!"Simple C++ TBAA"}

OpenCL / SPIR-V

Note that the compiler would usually directly output a .spvc file (a simple container format for multiple SPIR-V binaries). The output below comes from disassembling it with spirv-dis (provided by the toolchain). Also note that the output below has been generated with extended readability (--debug-asm).

; SPIR-V
; Version: 1.0
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 158
; Schema: 0
                                         Capability Addresses
                                         Capability Linkage
                                         Capability Kernel
                                         Capability Float16
                                         Capability Int64
                                         Capability ImageBasic
                                         Capability ImageMipmap
                                         Extension "cl_khr_3d_image_writes"
                                         Extension "cl_khr_byte_addressable_store"
                                         Extension "cl_khr_fp16"
                                         Extension "cl_khr_fp64"
                                         Extension "cl_khr_gl_msaa_sharing"
                                         Extension "cl_khr_global_int32_base_atomics"
                                         Extension "cl_khr_global_int32_extended_atomics"
                                         Extension "cl_khr_local_int32_base_atomics"
                                         Extension "cl_khr_local_int32_extended_atomics"
                                         Extension "cl_khr_mipmap_image"
                                         Extension "cl_khr_mipmap_image_writes"
                                    %1 = ExtInstImport "OpenCL.std"
                                         MemoryModel Physical64 OpenCL
                                         EntryPoint Kernel %simplified_nbody "simplified_nbody" %simplified_nbody.local_body_positions
                                         Source OpenCL_CPP 202000
                                         Decorate %in_positions FuncParamAttr NoAlias
                                         Decorate %out_positions FuncParamAttr NoAlias
                                         Decorate %inout_velocities FuncParamAttr NoAlias
                                         Decorate %in_positions FuncParamAttr NoCapture
                                         Decorate %out_positions FuncParamAttr NoCapture
                                         Decorate %inout_velocities FuncParamAttr NoCapture
                                         Decorate %in_positions FuncParamAttr NoWrite
                                         Decorate %time_delta FuncParamAttr NoWrite
                                         Decorate %_Z7barrierj LinkageAttributes "_Z7barrierj" Import
                                         Decorate %_Z5rsqrtf LinkageAttributes "_Z5rsqrtf" Import
                                         Decorate %_Z12get_local_idj LinkageAttributes "_Z12get_local_idj" Import
                                         Decorate %_Z13get_global_idj LinkageAttributes "_Z13get_global_idj" Import
                                         Decorate %_Z15get_global_sizej LinkageAttributes "_Z15get_global_sizej" Import
                                         Decorate %simplified_nbody.local_body_positions Alignment 4
                                %ulong = TypeInt 64 0
                                 %uint = TypeInt 32 0
                                %256ul = Constant %ulong 256
                                   %0u = Constant %uint 0
                                   %1u = Constant %uint 1
                                   %2u = Constant %uint 2
                                  %0ul = Constant %ulong 0
                                   %3u = Constant %uint 3
                                   %8u = Constant %uint 8
                                  %1ul = Constant %ulong 1
                                 %256u = Constant %uint 256
                                %float = TypeFloat 32
                          %struct.anon = TypeStruct %float %float %float %float
                           %union.anon = TypeStruct %struct.anon
                        %class.vector4 = TypeStruct %union.anon
                 %class.vector4[256ul] = TypeArray %class.vector4 %256ul
     %(Workgroup)class.vector4[256ul]* = TypePointer Workgroup %class.vector4[256ul]
                            %ulong(#1) = TypeFunction %ulong %uint
                            %float(#1) = TypeFunction %float %float
                                 %void = TypeVoid
                             %void(#1) = TypeFunction %void %uint
       %(CrossWorkgroup)class.vector4* = TypePointer CrossWorkgroup %class.vector4
                        %struct.anon.9 = TypeStruct %float %float %float
                         %union.anon.8 = TypeStruct %struct.anon.9
                        %class.vector3 = TypeStruct %union.anon.8
       %(CrossWorkgroup)class.vector3* = TypePointer CrossWorkgroup %class.vector3
                             %void(#4) = TypeFunction %void %(CrossWorkgroup)class.vector4* %(CrossWorkgroup)class.vector4* %(CrossWorkgroup)class.vector3* %float
               %(CrossWorkgroup)float* = TypePointer CrossWorkgroup %float
                    %(Workgroup)float* = TypePointer Workgroup %float
                                 %bool = TypeBool
                                 %0.0f = Constant %float 0
                      %9.99999975e-05f = Constant %float 9.99999975e-05
                         %0.999000013f = Constant %float 0.999000013
%simplified_nbody.local_body_positions = Variable %(Workgroup)class.vector4[256ul]* Workgroup

function ulong _Z13get_global_idj ( %ulong(#1) ) Pure {
                                   %14 = FunctionParameter %uint
}

function ulong _Z15get_global_sizej ( %ulong(#1) ) Pure {
                                   %16 = FunctionParameter %uint
}

function ulong _Z12get_local_idj ( %ulong(#1) ) Pure {
                                   %18 = FunctionParameter %uint
}

function float _Z5rsqrtf ( %float(#1) ) Pure {
                                   %21 = FunctionParameter %float
}

function void _Z7barrierj ( %void(#1) ) {
                                   %25 = FunctionParameter %uint
}

function void simplified_nbody ( %void(#4) ) {
                         %in_positions = FunctionParameter %(CrossWorkgroup)class.vector4*
                        %out_positions = FunctionParameter %(CrossWorkgroup)class.vector4*
                     %inout_velocities = FunctionParameter %(CrossWorkgroup)class.vector3*
                           %time_delta = FunctionParameter %float
37:
                                   %39 = FunctionCall %ulong %_Z13get_global_idj %0u
                                   %41 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %39 %0u %0u %0u
                                   %42 = Load %float %41 Aligned 4
                                   %44 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %39 %0u %0u %1u
                                   %45 = Load %float %44 Aligned 4
                                   %47 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %39 %0u %0u %2u
                                   %48 = Load %float %47 Aligned 4
                                   %49 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %inout_velocities %39 %0u %0u %0u
                                   %50 = Load %float %49 Aligned 4
                                   %51 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %inout_velocities %39 %0u %0u %1u
                                   %52 = Load %float %51 Aligned 4
                                   %53 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %inout_velocities %39 %0u %0u %2u
                                   %54 = Load %float %53 Aligned 4
                                   %55 = FunctionCall %ulong %_Z15get_global_sizej %0u
                                   %56 = UConvert %uint %55
                                   %57 = FunctionCall %ulong %_Z12get_local_idj %0u
                                   %58 = UConvert %uint %57
                                   %61 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %57 %0u %0u %0u
                                   %62 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %57 %0u %0u %1u
                                   %63 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %57 %0u %0u %2u
                                   %65 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %57 %0u %0u %3u
                                         Branch %66

66:
                                   %69 = Phi %uint ( %0u <- %37, %67 <- %68 )
                                   %71 = Phi %uint ( %0u <- %37, %70 <- %68 )
                                   %74 = Phi %float ( %0.0f <- %37, %73 <- %68 )
                                   %76 = Phi %float ( %0.0f <- %37, %75 <- %68 )
                                   %78 = Phi %float ( %0.0f <- %37, %77 <- %68 )
                                   %80 = ShiftLeftLogical %uint %71 %8u
                                   %81 = IAdd %uint %80 %58
                                   %82 = UConvert %ulong %81
                                   %83 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %82 %0u %0u %0u
                                   %84 = Load %float %83 Aligned 4
                                   %85 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %82 %0u %0u %1u
                                   %86 = Load %float %85 Aligned 4
                                   %87 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %82 %0u %0u %2u
                                   %88 = Load %float %87 Aligned 4
                                   %89 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %in_positions %82 %0u %0u %3u
                                   %90 = Load %float %89 Aligned 4
                                         Store %61 %84 Aligned 4
                                         Store %62 %86 Aligned 4
                                         Store %63 %88 Aligned 4
                                         Store %65 %90 Aligned 4
                                   %91 = FunctionCall %void %_Z7barrierj %1u
                                         Branch %92

68:
                                  %130 = FunctionCall %void %_Z7barrierj %1u
                                   %67 = IAdd %uint %69 %256u
                                   %70 = IAdd %uint %71 %1u
                                  %134 = ULessThan %bool %67 %56
                                         BranchConditional %134 %66 %135

92:
                                   %94 = Phi %ulong ( %0ul <- %66, %93 <- %92 )
                                   %95 = Phi %float ( %74 <- %66, %73 <- %92 )
                                   %96 = Phi %float ( %76 <- %66, %75 <- %92 )
                                   %97 = Phi %float ( %78 <- %66, %77 <- %92 )
                                   %98 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %94 %0u %0u %0u
                                   %99 = Load %float %98 Aligned 4
                                  %100 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %94 %0u %0u %1u
                                  %101 = Load %float %100 Aligned 4
                                  %102 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %94 %0u %0u %2u
                                  %103 = Load %float %102 Aligned 4
                                  %104 = FSub %float %99 %42
                                  %105 = FSub %float %101 %45
                                  %106 = FSub %float %103 %48
                                  %107 = FMul %float %104 %104
                                  %108 = FMul %float %105 %105
                                  %109 = FMul %float %106 %106
                                  %111 = FAdd %float %107 %9.99999975e-05f
                                  %112 = FAdd %float %111 %108
                                  %113 = FAdd %float %112 %109
                                  %114 = FunctionCall %float %_Z5rsqrtf %113
                                  %115 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %94 %0u %0u %3u
                                  %116 = Load %float %115 Aligned 4
                                  %117 = FMul %float %114 %114
                                  %118 = FMul %float %117 %114
                                  %119 = FMul %float %118 %116
                                  %120 = FMul %float %119 %104
                                  %121 = FMul %float %119 %105
                                  %122 = FMul %float %119 %106
                                   %77 = FAdd %float %120 %97
                                   %75 = FAdd %float %121 %96
                                   %73 = FAdd %float %122 %95
                                   %93 = IAdd %ulong %94 %1ul
                                  %129 = IEqual %bool %93 %256ul
                                         BranchConditional %129 %68 %92

135:
                                  %136 = FMul %float %77 %time_delta
                                  %137 = FMul %float %75 %time_delta
                                  %138 = FMul %float %73 %time_delta
                                  %139 = FAdd %float %136 %50
                                  %140 = FAdd %float %137 %52
                                  %141 = FAdd %float %138 %54
                                  %143 = FMul %float %139 %0.999000013f
                                  %144 = FMul %float %140 %0.999000013f
                                  %145 = FMul %float %141 %0.999000013f
                                  %146 = FMul %float %143 %time_delta
                                  %147 = FMul %float %144 %time_delta
                                  %148 = FMul %float %145 %time_delta
                                  %149 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %out_positions %39 %0u %0u %0u
                                  %150 = Load %float %149 Aligned 4
                                  %151 = FAdd %float %150 %146
                                         Store %149 %151 Aligned 4
                                  %152 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %out_positions %39 %0u %0u %1u
                                  %153 = Load %float %152 Aligned 4
                                  %154 = FAdd %float %153 %147
                                         Store %152 %154 Aligned 4
                                  %155 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %out_positions %39 %0u %0u %2u
                                  %156 = Load %float %155 Aligned 4
                                  %157 = FAdd %float %156 %148
                                         Store %155 %157 Aligned 4
                                         Store %49 %143 Aligned 4
                                         Store %51 %144 Aligned 4
                                         Store %53 %145 Aligned 4
                                         Return

Vulkan / SPIR-V

; SPIR-V
; Version: 1.5
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 169
; Schema: 0
                                                   Capability Matrix
                                                   Capability Shader
                                                   Capability Int64
                                                   Capability UniformBufferArrayDynamicIndexing
                                                   Capability SampledImageArrayDynamicIndexing
                                                   Capability StorageBufferArrayDynamicIndexing
                                                   Capability StorageImageArrayDynamicIndexing
                                                   Capability MultiView
                                                   Capability VariablePointersStorageBuffer
                                                   Capability VariablePointers
                                              %1 = ExtInstImport "GLSL.std.450"
                                                   MemoryModel Logical GLSL450
                                                   EntryPoint GLCompute %simplified_nbody "simplified_nbody" %simplified_nbody.vulkan_uniform. %simplified_nbody.vulkan_uniform..1 %simplified_nbody.vulkan_uniform..2 %simplified_nbody.vulkan_uniform..3 %simplified_nbody.vulkan_builtin_input. %simplified_nbody.vulkan_builtin_input..4 %simplified_nbody.vulkan_builtin_input..5 %simplified_nbody.vulkan_builtin_input..6 %vulkan.immutable_samplers %_ZZ16simplified_nbodyE20local_body_positions
                                                   ExecutionMode %simplified_nbody LocalSize 128 1 1
                                                   Source GLSL 450
                                                   Decorate %52 SpecId 1
                                                   Decorate %53 SpecId 2
                                                   Decorate %54 SpecId 3
                                                   Decorate %enclose.class.vector4 Block
                                                   Decorate %enclose.class.vector4_0 Block
                                                   Decorate %enclose.class.vector3 Block
                                                   Decorate %enclose. Block
                                                   Decorate %class.vector3[] ArrayStride 12
                                                   Decorate %class.vector4[256l] ArrayStride 16
                                                   Decorate %class.vector4[] ArrayStride 16
                                                   Decorate %class.vector4[]_0 ArrayStride 16
                                                   Decorate %simplified_nbody.vulkan_builtin_input..6 BuiltIn NumWorkgroups
                                                   Decorate %simplified_nbody.vulkan_constant.workgroup_size BuiltIn WorkgroupSize
                                                   Decorate %simplified_nbody.vulkan_builtin_input..5 BuiltIn WorkgroupId
                                                   Decorate %simplified_nbody.vulkan_builtin_input..4 BuiltIn LocalInvocationId
                                                   Decorate %simplified_nbody.vulkan_builtin_input. BuiltIn GlobalInvocationId
                                                   Decorate %simplified_nbody.vulkan_uniform. NonWritable
                                                   Decorate %simplified_nbody.vulkan_uniform..3 NonWritable
                                                   Decorate %vulkan.immutable_samplers Binding 0
                                                   Decorate %simplified_nbody.vulkan_uniform. Binding 0
                                                   Decorate %simplified_nbody.vulkan_uniform..1 Binding 1
                                                   Decorate %simplified_nbody.vulkan_uniform..2 Binding 2
                                                   Decorate %simplified_nbody.vulkan_uniform..3 Binding 3
                                                   Decorate %vulkan.immutable_samplers DescriptorSet 0
                                                   Decorate %simplified_nbody.vulkan_uniform. DescriptorSet 1
                                                   Decorate %simplified_nbody.vulkan_uniform..1 DescriptorSet 1
                                                   Decorate %simplified_nbody.vulkan_uniform..2 DescriptorSet 1
                                                   Decorate %simplified_nbody.vulkan_uniform..3 DescriptorSet 1
                                                   MemberDecorate %class.vector4 0 Offset 0
                                                   MemberDecorate %union.anon 0 Offset 0
                                                   MemberDecorate %struct.anon 0 Offset 0
                                                   MemberDecorate %enclose.class.vector4 0 Offset 0
                                                   MemberDecorate %enclose.class.vector4_0 0 Offset 0
                                                   MemberDecorate %enclose.class.vector3 0 Offset 0
                                                   MemberDecorate %class.vector3 0 Offset 0
                                                   MemberDecorate %union.anon.8 0 Offset 0
                                                   MemberDecorate %struct.anon.9 0 Offset 0
                                                   MemberDecorate %enclose. 0 Offset 0
                                                   MemberDecorate %struct.anon 1 Offset 4
                                                   MemberDecorate %struct.anon.9 1 Offset 4
                                                   MemberDecorate %struct.anon 2 Offset 8
                                                   MemberDecorate %struct.anon.9 2 Offset 8
                                                   MemberDecorate %struct.anon 3 Offset 12
                                           %uint = TypeInt 32 0
                                          %ilong = TypeInt 64 1
                                           %iint = TypeInt 32 1
                                            %32u = Constant %uint 32
                                           %256l = Constant %ilong 256
                                             %52 = SpecConstant %uint 128
                                             %53 = SpecConstant %uint 1
                                             %54 = SpecConstant %uint 1
                                             %0i = Constant %iint 0
                                             %1i = Constant %iint 1
                                             %2i = Constant %iint 2
                                             %0l = Constant %ilong 0
                                             %3i = Constant %iint 3
                                             %8i = Constant %iint 8
                                           %256i = Constant %iint 256
                                            %32l = Constant %ilong 32
                                             %1l = Constant %ilong 1
                                        %Sampler = TypeSampler
                                   %Sampler[32u] = TypeArray %Sampler %32u
                 %(UniformConstant)Sampler[32u]* = TypePointer UniformConstant %Sampler[32u]
                                          %float = TypeFloat 32
                                    %struct.anon = TypeStruct %float %float %float %float
                                     %union.anon = TypeStruct %struct.anon
                                  %class.vector4 = TypeStruct %union.anon
                            %class.vector4[256l] = TypeArray %class.vector4 %256l
                %(Workgroup)class.vector4[256l]* = TypePointer Workgroup %class.vector4[256l]
                                           %void = TypeVoid
                                         %void() = TypeFunction %void
                                %class.vector4[] = TypeRuntimeArray %class.vector4
                          %enclose.class.vector4 = TypeStruct %class.vector4[]
          %(StorageBuffer)enclose.class.vector4* = TypePointer StorageBuffer %enclose.class.vector4
                              %class.vector4[]_0 = TypeRuntimeArray %class.vector4
                        %enclose.class.vector4_0 = TypeStruct %class.vector4[]_0
        %(StorageBuffer)enclose.class.vector4_0* = TypePointer StorageBuffer %enclose.class.vector4_0
                                  %struct.anon.9 = TypeStruct %float %float %float
                                   %union.anon.8 = TypeStruct %struct.anon.9
                                  %class.vector3 = TypeStruct %union.anon.8
                                %class.vector3[] = TypeRuntimeArray %class.vector3
                          %enclose.class.vector3 = TypeStruct %class.vector3[]
          %(StorageBuffer)enclose.class.vector3* = TypePointer StorageBuffer %enclose.class.vector3
                                       %enclose. = TypeStruct %float
                       %(StorageBuffer)enclose.* = TypePointer StorageBuffer %enclose.
                                       %<3xiint> = TypeVector %iint 3
                               %(Input)<3xiint>* = TypePointer Input %<3xiint>
                                       %<3xuint> = TypeVector %uint 3
                          %(StorageBuffer)float* = TypePointer StorageBuffer %float
                              %(Workgroup)float* = TypePointer Workgroup %float
                                           %bool = TypeBool
%simplified_nbody.vulkan_constant.workgroup_size = SpecConstantComposite %<3xuint> %52 %53 %54
                                           %0.0f = Constant %float 0
                                %9.99999975e-05f = Constant %float 9.99999975e-05
                                   %0.999000013f = Constant %float 0.999000013
                      %vulkan.immutable_samplers = Variable %(UniformConstant)Sampler[32u]* UniformConstant
   %_ZZ16simplified_nbodyE20local_body_positions = Variable %(Workgroup)class.vector4[256l]* Workgroup
               %simplified_nbody.vulkan_uniform. = Variable %(StorageBuffer)enclose.class.vector4* StorageBuffer
             %simplified_nbody.vulkan_uniform..1 = Variable %(StorageBuffer)enclose.class.vector4_0* StorageBuffer
             %simplified_nbody.vulkan_uniform..2 = Variable %(StorageBuffer)enclose.class.vector3* StorageBuffer
             %simplified_nbody.vulkan_uniform..3 = Variable %(StorageBuffer)enclose.* StorageBuffer
         %simplified_nbody.vulkan_builtin_input. = Variable %(Input)<3xiint>* Input
       %simplified_nbody.vulkan_builtin_input..4 = Variable %(Input)<3xiint>* Input
       %simplified_nbody.vulkan_builtin_input..5 = Variable %(Input)<3xiint>* Input
       %simplified_nbody.vulkan_builtin_input..6 = Variable %(Input)<3xiint>* Input

function void simplified_nbody ( %void() ) {
45:
                                             %56 = Bitcast %<3xiint> %simplified_nbody.vulkan_constant.workgroup_size
                                             %57 = Load %<3xiint> %simplified_nbody.vulkan_builtin_input. Aligned 16
                                             %58 = CompositeExtract %iint %57 0
                                             %59 = SConvert %ilong %58
                                             %62 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %59 %0i %0i %0i
                                             %63 = Load %float %62 Aligned 4
                                             %65 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %59 %0i %0i %1i
                                             %66 = Load %float %65 Aligned 4
                                             %68 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %59 %0i %0i %2i
                                             %69 = Load %float %68 Aligned 4
                                             %70 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..2 %0i %0i %59 %0i %0i %0i
                                             %71 = Load %float %70 Aligned 4
                                             %72 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..2 %0i %0i %59 %0i %0i %1i
                                             %73 = Load %float %72 Aligned 4
                                             %74 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..2 %0i %0i %59 %0i %0i %2i
                                             %75 = Load %float %74 Aligned 4
                                             %76 = CompositeExtract %iint %56 0
                                             %77 = Load %<3xiint> %simplified_nbody.vulkan_builtin_input..6 Aligned 16
                                             %78 = CompositeExtract %iint %77 0
                                             %79 = IMul %iint %76 %78
                                             %80 = Load %<3xiint> %simplified_nbody.vulkan_builtin_input..4 Aligned 16
                                             %81 = CompositeExtract %iint %80 0
                                             %82 = SConvert %ilong %81
                                             %85 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %82 %0i %0i %0i
                                             %86 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %82 %0i %0i %1i
                                             %87 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %82 %0i %0i %2i
                                             %89 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %82 %0i %0i %3i
                                                   Branch %46

46:
                                             %91 = Phi %iint ( %0i <- %45, %90 <- %49 )
                                             %93 = Phi %iint ( %0i <- %45, %92 <- %49 )
                                             %96 = Phi %float ( %0.0f <- %45, %95 <- %49 )
                                             %98 = Phi %float ( %0.0f <- %45, %97 <- %49 )
                                            %100 = Phi %float ( %0.0f <- %45, %99 <- %49 )
                                            %102 = ShiftLeftLogical %iint %93 %8i
                                            %103 = IAdd %iint %81 %102
                                            %104 = SConvert %ilong %103
                                            %105 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %104 %0i %0i %0i
                                            %106 = Load %float %105 Aligned 4
                                            %107 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %104 %0i %0i %1i
                                            %108 = Load %float %107 Aligned 4
                                            %109 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %104 %0i %0i %2i
                                            %110 = Load %float %109 Aligned 4
                                            %111 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %104 %0i %0i %3i
                                            %112 = Load %float %111 Aligned 4
                                                   Store %85 %106 Aligned 4
                                                   Store %86 %108 Aligned 4
                                                   Store %87 %110 Aligned 4
                                                   Store %89 %112 Aligned 4
                                                   ControlBarrier %2i %2i %256i
                                                   LoopMerge %50 %49 None
                                                   Branch %47

47:
                                            %115 = Phi %ilong ( %0l <- %46, %114 <- %48 )
                                            %116 = Phi %float ( %96 <- %46, %95 <- %48 )
                                            %117 = Phi %float ( %98 <- %46, %97 <- %48 )
                                            %118 = Phi %float ( %100 <- %46, %99 <- %48 )
                                                   LoopMerge %49 %48 None
                                                   Branch %48

48:
                                            %120 = ShiftLeftLogical %ilong %115 %32l
                                            %121 = ShiftRightArithmetic %ilong %120 %32l
                                            %122 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %121 %0i %0i %0i
                                            %123 = Load %float %122 Aligned 4
                                            %124 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %121 %0i %0i %1i
                                            %125 = Load %float %124 Aligned 4
                                            %126 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %121 %0i %0i %2i
                                            %127 = Load %float %126 Aligned 4
                                            %128 = FSub %float %123 %63
                                            %129 = FSub %float %125 %66
                                            %130 = FSub %float %127 %69
                                            %132 = ExtInst %float %1 Fma %128 %128 %9.99999975e-05f
                                            %133 = ExtInst %float %1 Fma %129 %129 %132
                                            %134 = ExtInst %float %1 Fma %130 %130 %133
                                            %135 = ExtInst %float %1 InverseSqrt %134
                                            %136 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0l %121 %0i %0i %3i
                                            %137 = Load %float %136 Aligned 4
                                            %138 = FMul %float %135 %135
                                            %139 = FMul %float %138 %135
                                            %140 = FMul %float %139 %137
                                             %99 = ExtInst %float %1 Fma %140 %128 %118
                                             %97 = ExtInst %float %1 Fma %140 %129 %117
                                             %95 = ExtInst %float %1 Fma %140 %130 %116
                                            %114 = IAdd %ilong %115 %1l
                                            %147 = IEqual %bool %114 %256l
                                                   BranchConditional %147 %49 %47

49:
                                                   ControlBarrier %2i %2i %256i
                                             %90 = IAdd %iint %91 %256i
                                             %92 = IAdd %iint %93 %1i
                                            %150 = ULessThan %bool %90 %79
                                                   BranchConditional %150 %46 %50

50:
                                            %151 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..3 %0l %0i
                                            %152 = Load %float %151 Aligned 4
                                            %153 = ExtInst %float %1 Fma %152 %99 %71
                                            %154 = ExtInst %float %1 Fma %152 %97 %73
                                            %155 = ExtInst %float %1 Fma %152 %95 %75
                                            %157 = FMul %float %153 %0.999000013f
                                            %158 = FMul %float %154 %0.999000013f
                                            %159 = FMul %float %155 %0.999000013f
                                            %160 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..1 %0i %0i %59 %0i %0i %0i
                                            %161 = Load %float %160 Aligned 4
                                            %162 = ExtInst %float %1 Fma %157 %152 %161
                                                   Store %160 %162 Aligned 4
                                            %163 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..1 %0i %0i %59 %0i %0i %1i
                                            %164 = Load %float %163 Aligned 4
                                            %165 = ExtInst %float %1 Fma %158 %152 %164
                                                   Store %163 %165 Aligned 4
                                            %166 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..1 %0i %0i %59 %0i %0i %2i
                                            %167 = Load %float %166 Aligned 4
                                            %168 = ExtInst %float %1 Fma %159 %152 %167
                                                   Store %166 %168 Aligned 4
                                                   Store %70 %157 Aligned 4
                                                   Store %72 %158 Aligned 4
                                                   Store %74 %159 Aligned 4
                                                   Return

Requirements

OS:
- only AMD64/Intel64/ARM64 are supported
- Windows: NT 6.1+
- macOS: 10.13+
- iOS: 12.0+
- Linux: any current x64 distribution
- other Unix: if other requirements are met
compiler/toolchain:
- Generic: Clang / LLVM / libc++ 10.0+
- macOS/iOS: Xcode 12.5+
- Windows (VS): VS2019 with provided clang/LLVM or Clang / LLVM for Windows
- Windows (MinGW): MSYS2 with Clang/LLVM/libc++ 10.0+
libraries and optional requirements:
- SDL2 2.0.4+
- OpenGL 4.1+ Core headers
- (opt) OpenCL: requires OpenCL 1.2+ SDK and CPU/GPU drivers (Intel, AMD)
- (opt) CUDA: requires sm_30+/Kepler+ GPU and CUDA 9.0+ drivers (CUDA SDK not required!)
- (opt) Metal: requires iOS 12.0+ and A7+ CPU/GPU, or macOS 10.13+ and appropriate GPU
- (opt) Host-Compute: requires just the compiler/toolchain that is stated above
- (opt) Vulkan: requires 1.2.142+ ICD loader / headers / SDK
- (opt) networking: requires Asio headers and OpenSSL 1.0.1+
- (opt) audio/OpenAL: requires OpenAL Soft
- (opt) VR: requires OpenVR

Build Instructions

Build Instructions (General / CLI)

run ./build.sh (use ./build.sh help to get a list of all options)
configuration of optional parts:
- to disable OpenCL: define FLOOR_NO_OPENCL or ./build.sh no-opencl
- to disable CUDA: define FLOOR_NO_CUDA or ./build.sh no-cuda
- to disable Metal (only affects macOS/iOS builds): define FLOOR_NO_METAL or ./build.sh no-metal
- to disable Host Compute: define FLOOR_NO_HOST_COMPUTE or ./build.sh no-host-compute
- to disable Vulkan: define FLOOR_NO_VULKAN or ./build.sh no-vulkan
- to disable network support (ssl/crypto/asio): define FLOOR_NO_NET or ./build.sh no-net
- to disable OpenAL: define FLOOR_NO_OPENAL or ./build.sh no-openal
- to disable VR: define FLOOR_NO_VR or ./build.sh no-vr
- to build with libstdc++ (GCC 10.0+) instead of libc++: ./build.sh libstdc++

Build Instructions (Xcode / macOS / iOS)

open floor.xcodeproj and build
some notes:
- almost all optional parts of floor are enabled here and you’ll have to install all dependencies or disable them manually
- Homebrew is the recommended way to install additional dependencies:
  /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
- (opt) download OpenVR and manually install it:
  - mkdir -p {/usr/local/include/openvr,/usr/local/lib}
  - cp openvr/headers/* /usr/local/include/openvr/
  - cp openvr/bin/osx32/libopenvr_api.dylib /usr/local/lib/
- command line tools might be necessary, install them with: xcode-select --install
- on iOS, either copy dependencies into your iPhoneOS and iPhoneSimulator SDK, or floor/ios/deps/{include,lib}
- iOS linker flags for a depending project: -lSDL2 -lfloor -lcrypto -lssl

Build Instructions (Visual Studio / CMake / vcpkg / Windows)

install Visual Studio 2019 16.9+
install Clang / LLVM for Windows or select clang in the VS installer
install Vulkan SDK
install vcpkg (somewhere, not within libfloor):
- git clone https://github.com/Microsoft/vcpkg.git
- cd vcpkg
- .\bootstrap-vcpkg.bat -disableMetrics
- .\vcpkg integrate install
install vcpkg packages:
- vcpkg --triplet x64-windows install sdl2 opengl opengl-registry OpenCL vulkan openssl-windows asio openal-soft openvr
in Visual Studio: open folder floor (wait a little until build files are generated)
select Debug or Release configuration and build
- NOTE: all dependencies (optional parts) are enabled here

Installation

Installation (Unix / macOS)

sudo mkdir -p /opt/floor/include
sudo ln -sf /path/to/floor /opt/floor/include/floor
sudo ln -sf /path/to/floor/bin /opt/floor/lib
alternatively: copy these files/folders there

Installation (Windows)

create a %%ProgramFiles%%/floor folder (C:/Program Files/floor)
inside this folder:
- create a lib folder
- VS2019:
  - copy everything from bin/ in there (dlls/lib/exp)
- MinGW/MSYS2:
  - copy libfloor_static.a/libfloord_static.a there
- create an include folder and copy the original floor folder in there (containing all floor source code)

Compute/Graphics Toolchain

automated builds for Linux, macOS and Windows can be found at: https://libfloor.org/builds/toolchain
NOTE: this requires a Unix environment with all LLVM build dependencies installed - use MSYS2 on Windows
NOTE: the absolute build path must not contain spaces
compile the toolchain:
- cd floor/etc/llvm140/ && ./build.sh
- if successful, package it (in addition to a .zip file, this also creates a folder with all necessary binaries and include files): ./pkg.sh
install the toolchain:
- Unix:
  - automatic:
    
    development: run ./deploy_dev.sh from the floor/etc/llvm140/ folder (this will create symlinks to everything in floor and floor/etc/llvm140)
    
    release: run ./deploy_pkg.sh from inside the toolchain package folder (floor/etc/llvm140/toolchain_140000_*; this will copy everything)
  - manual:
    
    copy the toolchain folder as toolchain to /opt/floor/ (should then be /opt/floor/toolchain/{bin,clang,libcxx})
    
    inside /opt/floor/toolchain, add a symlink to the floor include folder: sudo ln -sf ../include floor
- Windows:
  - copy the toolchain folder as toolchain to %%ProgramFiles%%/floor (should then be %%ProgramFiles%%/floor/toolchain/{bin,clang,libcxx})
  - inside %%ProgramFiles%%/floor/toolchain, copy the floor folder from the include folder above it into this folder
NOTE: this is the expected default setup - paths can be changed inside config.json (toolchain.generic.paths)

Misc Hints

when using X11 forwarding, set these env variables:
- export LIBGL_ALWAYS_INDIRECT=yes
- export SDL_VIDEO_X11_NODIRECTCOLOR=yes
depending on how your Linux distribution handles OpenCL headers and library, you might need to manually install OpenCL 1.2+ compatible ones

Projects and Examples using libfloor

floor_examples (dnn, nbody, warp, hlbvh, path tracer, other)
libwarp (image-space warping library)
oclraster (Flexible Rasterizer in OpenCL)
a2elight (Albion 2 Engine)
unibot (IRC bot)

Michoumichmich / floor