Flo’s Open libRary

Table of Contents

What is it?
- Example
Requirements
Build Instructions
Installation
- Installation (Unix / macOS)
- Installation (Windows)
Compute/Graphics Toolchain
Misc Hints
Projects and Examples using libfloor

What is it?

This project provides a unified compute & graphics host API, as well as a unified compute & graphics C++ device language and library to enable same-source CUDA/Host/Metal/OpenCL/Vulkan programming and execution.

The unified host API is implemented in the compute and graphics directories. All backends (CUDA/Host/Metal/OpenCL/Vulkan) currently provide compute support, while graphics support is limited to Metal and Vulkan.

To provide a unified device language, a clang/LLVM/libc++ 14.0 toolchain has been modified.

Certain parts of libfloor are used by both host and device code (the math and constexpr directories). Additional device library code is located in the device directory.

Advanced examples can be found in the floor_examples repository.

Example

Let’s take the fairly simple C++ kernel below, which computes the body/body interactions in an N-body simulation, and compile it for each backend. Note that loop unrolling is omitted for conciseness.

// Global constants, placed in the "constant" address space:
// - NBODY_TILE_SIZE: number of bodies staged in local memory per tile; also the required work-group size
// - NBODY_DAMPING:   factor the velocity is scaled by on every step
// - NBODY_SOFTENING: its square is added to every squared distance (see inner loop)
static constexpr constant const uint32_t NBODY_TILE_SIZE { 256u };
static constexpr constant const float NBODY_DAMPING { 0.999f };
static constexpr constant const float NBODY_SOFTENING { 0.01f };
// Performs one N-body integration step; each work-item handles the body at index global_id.x.
// Declared as a 1D kernel with a required local size of (NBODY_TILE_SIZE = 256, 1, 1).
// NOTE(review): the tile loads below index in_positions up to the next multiple of NBODY_TILE_SIZE,
// so this presumably expects the global size to be a multiple of NBODY_TILE_SIZE - confirm with the launch code.
kernel_1d(NBODY_TILE_SIZE)
void simplified_nbody(buffer<const float4> in_positions, // read-only global memory buffer (.xyz = position, .w = mass)
                      buffer<float4> out_positions, // read-write global memory buffer (.xyz is incremented in place)
                      buffer<float3> inout_velocities, // read-write global memory buffer
                      param<float> time_delta) { // read-only parameter: time step used for integration
  // each work-item represents/computes one body
  const auto position = in_positions[global_id.x];
  auto velocity = inout_velocities[global_id.x];
  float3 acceleration; // vectors are automatically zero-initialized
  local_buffer<float4, NBODY_TILE_SIZE> local_body_positions; // local memory array allocation (one tile of bodies)
  // loop over all bodies, one tile of NBODY_TILE_SIZE bodies per iteration
  // (i counts the bodies covered so far, tile is the index of the current tile)
  for (uint32_t i = 0, tile = 0, count = global_size.x; i < count; i += NBODY_TILE_SIZE, ++tile) {
    // each work-item copies one body (position + mass) of the current tile from global to local memory
    local_body_positions[local_id.x] = in_positions[tile * NBODY_TILE_SIZE + local_id.x];
    local_barrier(); // barrier across all work-items in this work-group: the tile must be complete before it is read
    // accumulate the contribution of every body in the current tile
    for (uint32_t j = 0; j < NBODY_TILE_SIZE; ++j) {
      const auto r = local_body_positions[j].xyz - position.xyz; // vector from this body to body j
      // squared distance plus squared softening: the added constant keeps dist_sq > 0 even when r is zero
      const auto dist_sq = r.dot(r) + (NBODY_SOFTENING * NBODY_SOFTENING);
      const auto inv_dist = rsqrt(dist_sq);
      const auto s = local_body_positions[j].w * (inv_dist * inv_dist * inv_dist); // .w is mass
      acceleration += r * s;
    }
    local_barrier(); // all work-items must finish reading this tile before the next iteration overwrites it
  }
  // integrate: apply the acceleration to the velocity, then damp it
  velocity = (velocity + acceleration * time_delta) * NBODY_DAMPING;
  // update XYZ position; .w of out_positions is left untouched
  // NOTE(review): the displacement is added to the value already stored in out_positions, not to
  // "position" read from in_positions - presumably the caller keeps both buffers in sync; confirm
  out_positions[global_id.x].xyz += velocity * time_delta;
  inout_velocities[global_id.x] = velocity; // update velocity
}

click to unfold the output for each backend

CUDA / PTX

You can download the PTX file here and the CUBIN file here (note that building CUBINs is optional and requires ptxas).

//
// Generated by LLVM NVPTX Back-End
//

.version 8.4
.target sm_86
.address_size 64

	// .globl	simplified_nbody
// _ZZ16simplified_nbodyE20local_body_positions has been demoted

.visible .entry simplified_nbody(
	.param .u64 simplified_nbody_param_0,
	.param .u64 simplified_nbody_param_1,
	.param .u64 simplified_nbody_param_2,
	.param .f32 simplified_nbody_param_3
)
.reqntid 256, 1, 1
{
	.reg .pred 	%p<3>;
	.reg .b32 	%r<25>;
	.reg .f32 	%f<71>;
	.reg .b64 	%rd<18>;
	// demoted variable
	.shared .align 4 .b8 _ZZ16simplified_nbodyE20local_body_positions[4096];
	mov.u32 	%r1, %tid.x;
	mov.u32 	%r11, %ntid.x;
	mov.u32 	%r12, %ctaid.x;
	mad.lo.s32 	%r13, %r12, %r11, %r1;
	cvt.u64.u32 	%rd3, %r13;
	mul.wide.u32 	%rd7, %r13, 12;
	ld.param.u64 	%rd8, [simplified_nbody_param_2];
	cvta.to.global.u64 	%rd9, %rd8;
	add.s64 	%rd4, %rd9, %rd7;
	ld.global.f32 	%f6, [%rd4+8];
	add.s64 	%rd6, %rd4, 8;
	ld.global.f32 	%f5, [%rd4+4];
	add.s64 	%rd5, %rd4, 4;
	ld.global.f32 	%f4, [%rd4];
	mul.wide.u32 	%rd10, %r13, 16;
	ld.param.u64 	%rd11, [simplified_nbody_param_0];
	cvta.to.global.u64 	%rd2, %rd11;
	add.s64 	%rd12, %rd2, %rd10;
	ld.global.nc.f32 	%f3, [%rd12+8];
	ld.global.nc.f32 	%f2, [%rd12+4];
	ld.global.nc.f32 	%f1, [%rd12];
	mov.u32 	%r14, %nctaid.x;
	mul.lo.s32 	%r2, %r14, %r11;
	shl.b32 	%r15, %r1, 4;
	mov.u32 	%r16, _ZZ16simplified_nbodyE20local_body_positions;
	add.s32 	%r3, %r16, %r15;
	ld.param.u64 	%rd13, [simplified_nbody_param_1];
	cvta.to.global.u64 	%rd1, %rd13;
	mov.f32 	%f68, 0f00000000;
	mov.u32 	%r10, 0;
	ld.param.f32 	%f16, [simplified_nbody_param_3];
	mov.u32 	%r22, %r10;
	mov.u32 	%r23, %r10;
	mov.f32 	%f69, %f68;
	mov.f32 	%f70, %f68;
LBB0_1:
	shl.b32 	%r18, %r23, 8;
	add.s32 	%r19, %r18, %r1;
	mul.wide.u32 	%rd14, %r19, 16;
	add.s64 	%rd15, %rd2, %rd14;
	ld.global.nc.f32 	%f18, [%rd15];
	st.shared.f32 	[%r3], %f18;
	ld.global.nc.f32 	%f19, [%rd15+4];
	st.shared.f32 	[%r3+4], %f19;
	ld.global.nc.f32 	%f20, [%rd15+8];
	st.shared.f32 	[%r3+8], %f20;
	ld.global.nc.f32 	%f21, [%rd15+12];
	st.shared.f32 	[%r3+12], %f21;
	barrier.sync 	0;
	mov.u32 	%r24, %r10;
LBB0_2:
	add.s32 	%r21, %r16, %r24;
	ld.shared.f32 	%f22, [%r21+4];
	sub.f32 	%f23, %f22, %f2;
	ld.shared.f32 	%f24, [%r21];
	sub.f32 	%f25, %f24, %f1;
	fma.rn.f32 	%f26, %f25, %f25, 0f38D1B717;
	fma.rn.f32 	%f27, %f23, %f23, %f26;
	ld.shared.f32 	%f28, [%r21+8];
	sub.f32 	%f29, %f28, %f3;
	fma.rn.f32 	%f30, %f29, %f29, %f27;
	rsqrt.approx.ftz.f32 	%f31, %f30;
	mul.f32 	%f32, %f31, %f31;
	mul.f32 	%f33, %f32, %f31;
	ld.shared.f32 	%f34, [%r21+12];
	mul.f32 	%f35, %f33, %f34;
	fma.rn.f32 	%f36, %f35, %f29, %f68;
	ld.shared.f32 	%f37, [%r21+20];
	sub.f32 	%f38, %f37, %f2;
	ld.shared.f32 	%f39, [%r21+16];
	sub.f32 	%f40, %f39, %f1;
	fma.rn.f32 	%f41, %f40, %f40, 0f38D1B717;
	fma.rn.f32 	%f42, %f38, %f38, %f41;
	ld.shared.f32 	%f43, [%r21+24];
	sub.f32 	%f44, %f43, %f3;
	fma.rn.f32 	%f45, %f44, %f44, %f42;
	rsqrt.approx.ftz.f32 	%f46, %f45;
	mul.f32 	%f47, %f46, %f46;
	mul.f32 	%f48, %f47, %f46;
	ld.shared.f32 	%f49, [%r21+28];
	mul.f32 	%f50, %f48, %f49;
	fma.rn.f32 	%f68, %f50, %f44, %f36;
	fma.rn.f32 	%f51, %f35, %f23, %f69;
	fma.rn.f32 	%f69, %f50, %f38, %f51;
	fma.rn.f32 	%f52, %f35, %f25, %f70;
	fma.rn.f32 	%f70, %f50, %f40, %f52;
	add.s32 	%r24, %r24, 32;
	setp.eq.s32 	%p1, %r24, 4096;
	@%p1 bra 	LBB0_3;
	bra.uni 	LBB0_2;
LBB0_3:
	add.s32 	%r22, %r22, 256;
	setp.lt.u32 	%p2, %r22, %r2;
	barrier.sync 	0;
	add.s32 	%r23, %r23, 1;
	@%p2 bra 	LBB0_1;
	fma.rn.f32 	%f53, %f70, %f16, %f4;
	mul.f32 	%f54, %f53, 0f3F7FBE77;
	shl.b64 	%rd16, %rd3, 4;
	add.s64 	%rd17, %rd1, %rd16;
	ld.global.f32 	%f55, [%rd17];
	fma.rn.f32 	%f56, %f54, %f16, %f55;
	st.global.f32 	[%rd17], %f56;
	fma.rn.f32 	%f57, %f69, %f16, %f5;
	mul.f32 	%f58, %f57, 0f3F7FBE77;
	ld.global.f32 	%f59, [%rd17+4];
	fma.rn.f32 	%f60, %f58, %f16, %f59;
	st.global.f32 	[%rd17+4], %f60;
	fma.rn.f32 	%f61, %f68, %f16, %f6;
	mul.f32 	%f62, %f61, 0f3F7FBE77;
	ld.global.f32 	%f63, [%rd17+8];
	fma.rn.f32 	%f64, %f62, %f16, %f63;
	st.global.f32 	[%rd17+8], %f64;
	st.global.f32 	[%rd4], %f54;
	st.global.f32 	[%rd5], %f58;
	st.global.f32 	[%rd6], %f62;
	ret;

}

Host-Compute (x86 CPU)

Note that the compiler would usually directly output a .bin file (ELF format). The output below comes from disassembling it with objdump -d. Also note that this has been compiled for the x86-5 target (AVX-512+).

nbody.bin:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <simplified_nbody>:
       0:	55                   	push   %rbp
       1:	48 89 e5             	mov    %rsp,%rbp
       4:	41 57                	push   %r15
       6:	41 56                	push   %r14
       8:	41 55                	push   %r13
       a:	41 54                	push   %r12
       c:	53                   	push   %rbx
       d:	48 83 e4 c0          	and    $0xffffffffffffffc0,%rsp
      11:	48 81 ec 40 09 00 00 	sub    $0x940,%rsp
      18:	48 8d 05 f9 ff ff ff 	lea    -0x7(%rip),%rax        # 18 <simplified_nbody+0x18>
      1f:	49 be 00 00 00 00 00 	movabs $0x0,%r14
      26:	00 00 00
      29:	48 89 4c 24 50       	mov    %rcx,0x50(%rsp)
      2e:	48 89 74 24 68       	mov    %rsi,0x68(%rsp)
      33:	48 89 7c 24 48       	mov    %rdi,0x48(%rsp)
      38:	49 01 c6             	add    %rax,%r14
      3b:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
      42:	00 00 00
      45:	49 8b 04 06          	mov    (%r14,%rax,1),%rax
      49:	8b 00                	mov    (%rax),%eax
      4b:	48 8d 0c 40          	lea    (%rax,%rax,2),%rcx
      4f:	48 89 c6             	mov    %rax,%rsi
      52:	48 c1 e6 04          	shl    $0x4,%rsi
      56:	48 89 74 24 58       	mov    %rsi,0x58(%rsp)
      5b:	48 8d 04 8a          	lea    (%rdx,%rcx,4),%rax
      5f:	c5 fa 10 04 8a       	vmovss (%rdx,%rcx,4),%xmm0
      64:	c5 f9 6e 54 8a 04    	vmovd  0x4(%rdx,%rcx,4),%xmm2
      6a:	c5 fa 10 4c 8a 08    	vmovss 0x8(%rdx,%rcx,4),%xmm1
      70:	48 89 44 24 60       	mov    %rax,0x60(%rsp)
      75:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
      7c:	00 00 00
      7f:	49 8b 04 06          	mov    (%r14,%rax,1),%rax
      83:	8b 18                	mov    (%rax),%ebx
      85:	c5 fa 11 44 24 3c    	vmovss %xmm0,0x3c(%rsp)
      8b:	c5 f9 7e 54 24 40    	vmovd  %xmm2,0x40(%rsp)
      91:	c5 fa 11 4c 24 44    	vmovss %xmm1,0x44(%rsp)
      97:	85 db                	test   %ebx,%ebx
      99:	0f 84 f9 16 00 00    	je     1798 <simplified_nbody+0x1798>
      9f:	48 8b 44 24 48       	mov    0x48(%rsp),%rax
      a4:	49 bd 00 00 00 00 00 	movabs $0x0,%r13
      ab:	00 00 00
      ae:	45 31 ff             	xor    %r15d,%r15d
      b1:	c5 fa 10 04 30       	vmovss (%rax,%rsi,1),%xmm0
      b6:	c5 fa 10 4c 30 04    	vmovss 0x4(%rax,%rsi,1),%xmm1
      bc:	c5 fa 10 54 30 08    	vmovss 0x8(%rax,%rsi,1),%xmm2
      c2:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
      c9:	00 00 00
      cc:	49 8b 04 06          	mov    (%r14,%rax,1),%rax
      d0:	48 89 44 24 78       	mov    %rax,0x78(%rsp)
      d5:	4b 8d 04 2e          	lea    (%r14,%r13,1),%rax
      d9:	48 89 44 24 70       	mov    %rax,0x70(%rsp)
      de:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
      e5:	00 00 00
      e8:	62 f2 7d 48 18 c0    	vbroadcastss %xmm0,%zmm0
      ee:	4d 8b 24 06          	mov    (%r14,%rax,1),%r12
      f2:	62 f2 7d 48 18 c9    	vbroadcastss %xmm1,%zmm1
      f8:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
      ff:	00 00 00
     102:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x700(%rsp)
     109:	1c
     10a:	62 f2 7d 48 18 c2    	vbroadcastss %xmm2,%zmm0
     110:	62 d2 fd 48 5b 14 06 	vbroadcasti64x4 (%r14,%rax,1),%zmm2
     117:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
     11e:	00 00 00
     121:	62 f1 7c 48 29 4c 24 	vmovaps %zmm1,0x6c0(%rsp)
     128:	1b
     129:	62 d2 fd 48 5b 0c 06 	vbroadcasti64x4 (%r14,%rax,1),%zmm1
     130:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
     137:	00 00 00
     13a:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x680(%rsp)
     141:	1a
     142:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
     146:	c5 f8 29 84 24 80 00 	vmovaps %xmm0,0x80(%rsp)
     14d:	00 00
     14f:	62 f1 fd 48 7f 54 24 	vmovdqa64 %zmm2,0x640(%rsp)
     156:	19
     157:	62 d2 fd 48 5b 14 06 	vbroadcasti64x4 (%r14,%rax,1),%zmm2
     15e:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
     165:	00 00 00
     168:	62 f1 fd 48 7f 4c 24 	vmovdqa64 %zmm1,0x840(%rsp)
     16f:	21
     170:	62 d2 7d 48 18 0c 06 	vbroadcastss (%r14,%rax,1),%zmm1
     177:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
     17e:	00 00 00
     181:	62 f1 fd 48 7f 54 24 	vmovdqa64 %zmm2,0x800(%rsp)
     188:	20
     189:	62 d2 fd 48 5b 14 06 	vbroadcasti64x4 (%r14,%rax,1),%zmm2
     190:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
     197:	00 00 00
     19a:	62 f1 7c 48 29 4c 24 	vmovaps %zmm1,0x600(%rsp)
     1a1:	18
     1a2:	62 d2 7d 48 18 0c 06 	vbroadcastss (%r14,%rax,1),%zmm1
     1a9:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
     1b0:	00 00 00
     1b3:	62 d2 7d 48 18 04 06 	vbroadcastss (%r14,%rax,1),%zmm0
     1ba:	62 f1 fd 48 7f 54 24 	vmovdqa64 %zmm2,0x7c0(%rsp)
     1c1:	1f
     1c2:	62 f1 7c 48 29 4c 24 	vmovaps %zmm1,0x780(%rsp)
     1c9:	1e
     1ca:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x740(%rsp)
     1d1:	1d
     1d2:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
     1d6:	c5 f8 29 84 24 c0 00 	vmovaps %xmm0,0xc0(%rsp)
     1dd:	00 00
     1df:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
     1e3:	c5 f8 29 84 24 00 01 	vmovaps %xmm0,0x100(%rsp)
     1ea:	00 00
     1ec:	0f 1f 40 00          	nopl   0x0(%rax)
     1f0:	48 8b 44 24 78       	mov    0x78(%rsp),%rax
     1f5:	48 8b 54 24 48       	mov    0x48(%rsp),%rdx
     1fa:	8b 00                	mov    (%rax),%eax
     1fc:	42 8d 0c 38          	lea    (%rax,%r15,1),%ecx
     200:	48 c1 e0 04          	shl    $0x4,%rax
     204:	48 c1 e1 04          	shl    $0x4,%rcx
     208:	c5 f8 10 04 0a       	vmovups (%rdx,%rcx,1),%xmm0
     20d:	48 8b 4c 24 70       	mov    0x70(%rsp),%rcx
     212:	c5 f8 29 04 08       	vmovaps %xmm0,(%rax,%rcx,1)
     217:	c5 f8 77             	vzeroupper
     21a:	41 ff d4             	call   *%r12
     21d:	62 91 7c 48 28 5c 2e 	vmovaps 0x80(%r14,%r13,1),%zmm3
     224:	02
     225:	62 f1 7c 48 28 64 24 	vmovaps 0x640(%rsp),%zmm4
     22c:	19
     22d:	62 81 7c 48 28 5c 2e 	vmovaps 0xc0(%r14,%r13,1),%zmm19
     234:	03
     235:	62 91 7c 48 28 54 2e 	vmovaps 0x180(%r14,%r13,1),%zmm2
     23c:	06
     23d:	62 11 7c 48 28 4c 2e 	vmovaps 0x100(%r14,%r13,1),%zmm9
     244:	04
     245:	62 11 7c 48 28 6c 2e 	vmovaps 0x140(%r14,%r13,1),%zmm13
     24c:	05
     24d:	62 81 7c 48 28 4c 2e 	vmovaps 0x1c0(%r14,%r13,1),%zmm17
     254:	07
     255:	62 71 7c 48 28 74 24 	vmovaps 0x800(%rsp),%zmm14
     25c:	20
     25d:	62 91 7c 48 28 04 2e 	vmovaps (%r14,%r13,1),%zmm0
     264:	62 81 7c 48 28 54 2e 	vmovaps 0x40(%r14,%r13,1),%zmm18
     26b:	01
     26c:	62 f1 7c 48 28 74 24 	vmovaps 0x7c0(%rsp),%zmm6
     273:	1f
     274:	62 01 7c 48 28 44 2e 	vmovaps 0x280(%r14,%r13,1),%zmm24
     27b:	0a
     27c:	62 81 7c 48 28 74 2e 	vmovaps 0x200(%r14,%r13,1),%zmm22
     283:	08
     284:	62 81 7c 48 28 6c 2e 	vmovaps 0x240(%r14,%r13,1),%zmm21
     28b:	09
     28c:	62 81 7c 48 28 7c 2e 	vmovaps 0x2c0(%r14,%r13,1),%zmm23
     293:	0b
     294:	62 01 7c 48 28 64 2e 	vmovaps 0x380(%r14,%r13,1),%zmm28
     29b:	0e
     29c:	62 01 7c 48 28 54 2e 	vmovaps 0x300(%r14,%r13,1),%zmm26
     2a3:	0c
     2a4:	62 01 7c 48 28 5c 2e 	vmovaps 0x3c0(%r14,%r13,1),%zmm27
     2ab:	0f
     2ac:	62 f1 7c 48 28 cb    	vmovaps %zmm3,%zmm1
     2b2:	62 e1 7c 48 28 e2    	vmovaps %zmm2,%zmm20
     2b8:	62 d1 7c 48 28 e9    	vmovaps %zmm9,%zmm5
     2be:	62 61 7c 48 28 ca    	vmovaps %zmm2,%zmm25
     2c4:	62 f1 7c 48 28 f8    	vmovaps %zmm0,%zmm7
     2ca:	62 71 7c 48 28 fb    	vmovaps %zmm3,%zmm15
     2d0:	62 e1 7c 48 28 c0    	vmovaps %zmm0,%zmm16
     2d6:	62 71 7c 48 28 c3    	vmovaps %zmm3,%zmm8
     2dc:	62 71 7c 48 28 e0    	vmovaps %zmm0,%zmm12
     2e2:	62 71 7c 48 28 d2    	vmovaps %zmm2,%zmm10
     2e8:	62 b2 4d 48 7f db    	vpermt2ps %zmm19,%zmm6,%zmm3
     2ee:	62 b2 4d 48 7f c2    	vpermt2ps %zmm18,%zmm6,%zmm0
     2f4:	62 61 7c 48 28 f4    	vmovaps %zmm4,%zmm30
     2fa:	62 b2 4d 48 7f d1    	vpermt2ps %zmm17,%zmm6,%zmm2
     300:	62 51 7c 48 28 d9    	vmovaps %zmm9,%zmm11
     306:	62 01 7c 48 28 e8    	vmovaps %zmm24,%zmm29
     30c:	62 01 7c 48 28 fc    	vmovaps %zmm28,%zmm31
     312:	62 b2 5d 48 7f cb    	vpermt2ps %zmm19,%zmm4,%zmm1
     318:	62 a2 5d 48 7f e1    	vpermt2ps %zmm17,%zmm4,%zmm20
     31e:	62 d2 5d 48 7f ed    	vpermt2ps %zmm13,%zmm4,%zmm5
     324:	62 22 0d 48 7f c9    	vpermt2ps %zmm17,%zmm14,%zmm25
     32a:	62 b2 5d 48 7f fa    	vpermt2ps %zmm18,%zmm4,%zmm7
     330:	62 d1 7c 48 28 e1    	vmovaps %zmm9,%zmm4
     336:	62 32 0d 48 7f fb    	vpermt2ps %zmm19,%zmm14,%zmm15
     33c:	62 a2 0d 48 7f c2    	vpermt2ps %zmm18,%zmm14,%zmm16
     342:	62 52 4d 48 7f cd    	vpermt2ps %zmm13,%zmm6,%zmm9
     348:	62 52 0d 48 7f dd    	vpermt2ps %zmm13,%zmm14,%zmm11
     34e:	62 91 7c 48 28 f2    	vmovaps %zmm26,%zmm6
     354:	62 22 0d 40 7f ef    	vpermt2ps %zmm23,%zmm30,%zmm29
     35a:	62 f3 fd 48 23 c3 e4 	vshuff64x2 $0xe4,%zmm3,%zmm0,%zmm0
     361:	62 91 7c 48 28 dc    	vmovaps %zmm28,%zmm3
     367:	62 f1 7c 48 29 4c 24 	vmovaps %zmm1,0x140(%rsp)
     36e:	05
     36f:	62 f1 7c 48 28 4c 24 	vmovaps 0x840(%rsp),%zmm1
     376:	21
     377:	62 b3 d5 48 23 ec e4 	vshuff64x2 $0xe4,%zmm20,%zmm5,%zmm5
     37e:	62 61 7c 48 29 4c 24 	vmovaps %zmm25,0x280(%rsp)
     385:	0a
     386:	62 01 7c 48 28 4c 2e 	vmovaps 0x340(%r14,%r13,1),%zmm25
     38d:	0d
     38e:	62 a1 7c 48 28 e6    	vmovaps %zmm22,%zmm20
     394:	62 f3 b5 48 23 d2 e4 	vshuff64x2 $0xe4,%zmm2,%zmm9,%zmm2
     39b:	62 71 7c 48 28 4c 24 	vmovaps 0x640(%rsp),%zmm9
     3a2:	19
     3a3:	62 92 0d 48 7f db    	vpermt2ps %zmm27,%zmm14,%zmm3
     3a9:	62 f3 c5 48 23 7c 24 	vshuff64x2 $0xe4,0x140(%rsp),%zmm7,%zmm7
     3b0:	05 e4
     3b2:	62 a2 0d 48 7f e5    	vpermt2ps %zmm21,%zmm14,%zmm20
     3b8:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x140(%rsp)
     3bf:	05
     3c0:	62 f1 fd 48 29 6c 24 	vmovapd %zmm5,0x4c0(%rsp)
     3c7:	13
     3c8:	62 f1 7c 48 28 6c 24 	vmovaps 0x7c0(%rsp),%zmm5
     3cf:	1f
     3d0:	62 f1 fd 48 29 54 24 	vmovapd %zmm2,0x500(%rsp)
     3d7:	14
     3d8:	62 32 75 48 7f c3    	vpermt2ps %zmm19,%zmm1,%zmm8
     3de:	62 32 75 48 7f e2    	vpermt2ps %zmm18,%zmm1,%zmm12
     3e4:	62 a1 7c 48 28 de    	vmovaps %zmm22,%zmm19
     3ea:	62 81 7c 48 28 d0    	vmovaps %zmm24,%zmm18
     3f0:	62 32 75 48 7f d1    	vpermt2ps %zmm17,%zmm1,%zmm10
     3f6:	62 81 7c 48 28 c8    	vmovaps %zmm24,%zmm17
     3fc:	62 d2 75 48 7f e5    	vpermt2ps %zmm13,%zmm1,%zmm4
     402:	62 11 7c 48 28 ee    	vmovaps %zmm30,%zmm13
     408:	62 21 7c 48 28 f6    	vmovaps %zmm22,%zmm30
     40e:	62 a2 0d 48 7f d7    	vpermt2ps %zmm23,%zmm14,%zmm18
     414:	62 a2 75 48 7f cf    	vpermt2ps %zmm23,%zmm1,%zmm17
     41a:	62 a2 75 48 7f dd    	vpermt2ps %zmm21,%zmm1,%zmm19
     420:	62 02 15 48 7f fb    	vpermt2ps %zmm27,%zmm13,%zmm31
     426:	62 92 15 48 7f f1    	vpermt2ps %zmm25,%zmm13,%zmm6
     42c:	62 22 15 48 7f f5    	vpermt2ps %zmm21,%zmm13,%zmm30
     432:	62 11 7c 48 28 ec    	vmovaps %zmm28,%zmm13
     438:	62 f1 fd 48 29 7c 24 	vmovapd %zmm7,0x240(%rsp)
     43f:	09
     440:	62 f3 a5 48 23 7c 24 	vshuff64x2 $0xe4,0x280(%rsp),%zmm11,%zmm7
     447:	0a e4
     449:	62 02 55 48 7f e3    	vpermt2ps %zmm27,%zmm5,%zmm28
     44f:	62 22 55 48 7f c7    	vpermt2ps %zmm23,%zmm5,%zmm24
     455:	62 a2 55 48 7f f5    	vpermt2ps %zmm21,%zmm5,%zmm22
     45b:	62 12 75 48 7f eb    	vpermt2ps %zmm27,%zmm1,%zmm13
     461:	62 81 7c 48 28 7c 2e 	vmovaps 0x4c0(%r14,%r13,1),%zmm23
     468:	13
     469:	62 e1 7c 48 28 6c 24 	vmovaps 0x6c0(%rsp),%zmm21
     470:	1b
     471:	62 d3 dd 48 23 c2 e4 	vshuff64x2 $0xe4,%zmm10,%zmm4,%zmm0
     478:	62 53 fd 40 23 d7 e4 	vshuff64x2 $0xe4,%zmm15,%zmm16,%zmm10
     47f:	62 11 7c 48 28 fa    	vmovaps %zmm26,%zmm15
     485:	62 53 9d 48 23 c0 e4 	vshuff64x2 $0xe4,%zmm8,%zmm12,%zmm8
     48c:	62 11 7c 48 28 e2    	vmovaps %zmm26,%zmm12
     492:	62 02 55 48 7f d1    	vpermt2ps %zmm25,%zmm5,%zmm26
     498:	62 81 7c 48 28 44 2e 	vmovaps 0x540(%r14,%r13,1),%zmm16
     49f:	15
     4a0:	62 33 e5 40 23 d9 e4 	vshuff64x2 $0xe4,%zmm17,%zmm19,%zmm11
     4a7:	62 a3 dd 40 23 d2 e4 	vshuff64x2 $0xe4,%zmm18,%zmm20,%zmm18
     4ae:	62 81 7c 48 28 64 2e 	vmovaps 0x580(%r14,%r13,1),%zmm20
     4b5:	16
     4b6:	62 81 7c 48 28 4c 2e 	vmovaps 0x500(%r14,%r13,1),%zmm17
     4bd:	14
     4be:	62 12 0d 48 7f f9    	vpermt2ps %zmm25,%zmm14,%zmm15
     4c4:	62 12 75 48 7f e1    	vpermt2ps %zmm25,%zmm1,%zmm12
     4ca:	62 01 7c 48 28 4c 2e 	vmovaps 0x5c0(%r14,%r13,1),%zmm25
     4d1:	17
     4d2:	62 93 8d 40 23 d5 e4 	vshuff64x2 $0xe4,%zmm29,%zmm30,%zmm2
     4d9:	62 e1 7c 48 28 5c 24 	vmovaps 0x780(%rsp),%zmm19
     4e0:	1e
     4e1:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x440(%rsp)
     4e8:	11
     4e9:	62 93 cd 48 23 c7 e4 	vshuff64x2 $0xe4,%zmm31,%zmm6,%zmm0
     4f0:	62 f1 fd 48 29 54 24 	vmovapd %zmm2,0x200(%rsp)
     4f7:	08
     4f8:	62 f1 7c 48 28 d5    	vmovaps %zmm5,%zmm2
     4fe:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x400(%rsp)
     505:	10
     506:	62 93 cd 40 23 c0 e4 	vshuff64x2 $0xe4,%zmm24,%zmm22,%zmm0
     50d:	62 81 7c 48 28 74 2e 	vmovaps 0x400(%r14,%r13,1),%zmm22
     514:	10
     515:	62 01 7c 48 28 44 2e 	vmovaps 0x480(%r14,%r13,1),%zmm24
     51c:	12
     51d:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x480(%rsp)
     524:	12
     525:	62 93 ad 40 23 e4 e4 	vshuff64x2 $0xe4,%zmm28,%zmm26,%zmm4
     52c:	62 d3 9d 48 23 ed e4 	vshuff64x2 $0xe4,%zmm13,%zmm12,%zmm5
     533:	62 f3 85 48 23 db e4 	vshuff64x2 $0xe4,%zmm3,%zmm15,%zmm3
     53a:	62 21 7c 48 28 dc    	vmovaps %zmm20,%zmm27
     540:	62 21 7c 48 28 e1    	vmovaps %zmm17,%zmm28
     546:	62 f1 fd 48 29 64 24 	vmovapd %zmm4,0x280(%rsp)
     54d:	0a
     54e:	62 91 7c 48 28 64 2e 	vmovaps 0x440(%r14,%r13,1),%zmm4
     555:	11
     556:	62 21 7c 48 28 f4    	vmovaps %zmm20,%zmm30
     55c:	62 21 7c 48 28 f9    	vmovaps %zmm17,%zmm31
     562:	62 02 35 48 7f d9    	vpermt2ps %zmm25,%zmm9,%zmm27
     568:	62 22 35 48 7f e0    	vpermt2ps %zmm16,%zmm9,%zmm28
     56e:	62 02 0d 48 7f f1    	vpermt2ps %zmm25,%zmm14,%zmm30
     574:	62 22 0d 48 7f f8    	vpermt2ps %zmm16,%zmm14,%zmm31
     57a:	62 01 7c 48 28 d0    	vmovaps %zmm24,%zmm26
     580:	62 31 7c 48 28 ee    	vmovaps %zmm22,%zmm13
     586:	62 11 7c 48 28 f8    	vmovaps %zmm24,%zmm15
     58c:	62 21 7c 48 28 ee    	vmovaps %zmm22,%zmm29
     592:	62 22 35 48 7f d7    	vpermt2ps %zmm23,%zmm9,%zmm26
     598:	62 32 75 48 7f ff    	vpermt2ps %zmm23,%zmm1,%zmm15
     59e:	62 93 9d 40 23 f3 e4 	vshuff64x2 $0xe4,%zmm27,%zmm28,%zmm6
     5a5:	62 72 35 48 7f ec    	vpermt2ps %zmm4,%zmm9,%zmm13
     5ab:	62 21 7c 48 28 e4    	vmovaps %zmm20,%zmm28
     5b1:	62 62 0d 48 7f ec    	vpermt2ps %zmm4,%zmm14,%zmm29
     5b7:	62 02 75 48 7f e1    	vpermt2ps %zmm25,%zmm1,%zmm28
     5bd:	62 f1 fd 48 29 74 24 	vmovapd %zmm6,0x1c0(%rsp)
     5c4:	07
     5c5:	62 b1 7c 48 28 f6    	vmovaps %zmm22,%zmm6
     5cb:	62 f2 75 48 7f f4    	vpermt2ps %zmm4,%zmm1,%zmm6
     5d1:	62 93 95 48 23 c2 e4 	vshuff64x2 $0xe4,%zmm26,%zmm13,%zmm0
     5d8:	62 71 7c 48 28 e9    	vmovaps %zmm1,%zmm13
     5de:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x180(%rsp)
     5e5:	06
     5e6:	62 d3 cd 48 23 c7 e4 	vshuff64x2 $0xe4,%zmm15,%zmm6,%zmm0
     5ed:	62 f1 7c 48 28 74 24 	vmovaps 0x600(%rsp),%zmm6
     5f4:	18
     5f5:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x300(%rsp)
     5fc:	0c
     5fd:	62 b1 7c 48 28 c1    	vmovaps %zmm17,%zmm0
     603:	62 b2 75 48 7f c0    	vpermt2ps %zmm16,%zmm1,%zmm0
     609:	62 f1 7c 48 28 4c 24 	vmovaps 0x240(%rsp),%zmm1
     610:	09
     611:	62 93 fd 48 23 c4 e4 	vshuff64x2 $0xe4,%zmm28,%zmm0,%zmm0
     618:	62 61 7c 48 28 e2    	vmovaps %zmm2,%zmm28
     61e:	62 e2 1d 40 7f f4    	vpermt2ps %zmm4,%zmm28,%zmm22
     624:	62 f1 7c 48 28 64 24 	vmovaps 0x4c0(%rsp),%zmm4
     62b:	13
     62c:	62 a2 1d 40 7f c8    	vpermt2ps %zmm16,%zmm28,%zmm17
     632:	62 82 1d 40 7f e1    	vpermt2ps %zmm25,%zmm28,%zmm20
     638:	62 e1 7c 48 28 44 24 	vmovaps 0x1c0(%rsp),%zmm16
     63f:	07
     640:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x2c0(%rsp)
     647:	0b
     648:	62 91 7c 48 28 c0    	vmovaps %zmm24,%zmm0
     64e:	62 22 6d 48 7f c7    	vpermt2ps %zmm23,%zmm2,%zmm24
     654:	62 f1 7c 48 28 54 24 	vmovaps 0x680(%rsp),%zmm2
     65b:	1a
     65c:	62 b2 0d 48 7f c7    	vpermt2ps %zmm23,%zmm14,%zmm0
     662:	62 e1 7c 48 28 7c 24 	vmovaps 0x740(%rsp),%zmm23
     669:	1d
     66a:	62 a3 f5 40 23 e4 e4 	vshuff64x2 $0xe4,%zmm20,%zmm17,%zmm20
     671:	62 83 cd 40 23 f0 e4 	vshuff64x2 $0xe4,%zmm24,%zmm22,%zmm22
     678:	62 f3 95 40 23 c0 e4 	vshuff64x2 $0xe4,%zmm0,%zmm29,%zmm0
     67f:	62 03 85 40 23 ee e4 	vshuff64x2 $0xe4,%zmm30,%zmm31,%zmm29
     686:	62 21 3c 48 5c f5    	vsubps %zmm21,%zmm8,%zmm30
     68c:	62 71 7c 48 28 44 24 	vmovaps 0x440(%rsp),%zmm8
     693:	11
     694:	62 61 2c 48 5c fa    	vsubps %zmm2,%zmm10,%zmm31
     69a:	62 61 44 48 5c da    	vsubps %zmm2,%zmm7,%zmm27
     6a0:	62 b1 7c 48 28 fb    	vmovaps %zmm19,%zmm7
     6a6:	62 f1 64 48 5c da    	vsubps %zmm2,%zmm3,%zmm3
     6ac:	62 f1 7c 48 29 5c 24 	vmovaps %zmm3,0x240(%rsp)
     6b3:	09
     6b4:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x5c0(%rsp)
     6bb:	17
     6bc:	62 f1 7c 48 28 44 24 	vmovaps 0x700(%rsp),%zmm0
     6c3:	1c
     6c4:	62 21 3c 48 5c d5    	vsubps %zmm21,%zmm8,%zmm26
     6ca:	62 71 74 48 5c e0    	vsubps %zmm0,%zmm1,%zmm12
     6d0:	62 f1 5c 48 5c e0    	vsubps %zmm0,%zmm4,%zmm4
     6d6:	62 e1 7c 40 5c c0    	vsubps %zmm0,%zmm16,%zmm16
     6dc:	62 51 7c 48 28 cc    	vmovaps %zmm12,%zmm9
     6e2:	62 71 7c 48 28 c4    	vmovaps %zmm4,%zmm8
     6e8:	62 e1 7c 48 29 44 24 	vmovaps %zmm16,0x340(%rsp)
     6ef:	0d
     6f0:	62 72 1d 48 a8 ce    	vfmadd213ps %zmm6,%zmm12,%zmm9
     6f6:	62 72 5d 48 a8 c6    	vfmadd213ps %zmm6,%zmm4,%zmm8
     6fc:	62 12 0d 40 b8 ce    	vfmadd231ps %zmm30,%zmm30,%zmm9
     702:	62 12 2d 40 b8 c2    	vfmadd231ps %zmm26,%zmm26,%zmm8
     708:	62 12 05 40 b8 cf    	vfmadd231ps %zmm31,%zmm31,%zmm9
     70e:	62 12 25 40 b8 c3    	vfmadd231ps %zmm27,%zmm27,%zmm8
     714:	62 52 7d 48 4e d1    	vrsqrt14ps %zmm9,%zmm10
     71a:	62 52 7d 48 4e f8    	vrsqrt14ps %zmm8,%zmm15
     720:	62 51 34 48 59 ca    	vmulps %zmm10,%zmm9,%zmm9
     726:	62 51 3c 48 59 c7    	vmulps %zmm15,%zmm8,%zmm8
     72c:	62 32 2d 48 a8 cb    	vfmadd213ps %zmm19,%zmm10,%zmm9
     732:	62 31 2c 48 59 d7    	vmulps %zmm23,%zmm10,%zmm10
     738:	62 32 05 48 a8 c3    	vfmadd213ps %zmm19,%zmm15,%zmm8
     73e:	62 51 2c 48 59 d1    	vmulps %zmm9,%zmm10,%zmm10
     744:	62 31 04 48 59 cf    	vmulps %zmm23,%zmm15,%zmm9
     74a:	62 71 7c 48 28 7c 24 	vmovaps 0x200(%rsp),%zmm15
     751:	08
     752:	62 d1 34 48 59 c8    	vmulps %zmm8,%zmm9,%zmm1
     758:	62 31 24 48 5c cd    	vsubps %zmm21,%zmm11,%zmm9
     75e:	62 71 6c 40 5c c2    	vsubps %zmm2,%zmm18,%zmm8
     764:	62 71 7c 48 29 4c 24 	vmovaps %zmm9,0x200(%rsp)
     76b:	08
     76c:	62 71 7c 48 29 44 24 	vmovaps %zmm8,0x3c0(%rsp)
     773:	0f
     774:	62 e1 04 48 5c d8    	vsubps %zmm0,%zmm15,%zmm19
     77a:	62 31 7c 48 28 db    	vmovaps %zmm19,%zmm11
     780:	62 72 65 40 a8 de    	vfmadd213ps %zmm6,%zmm19,%zmm11
     786:	62 52 35 48 b8 d9    	vfmadd231ps %zmm9,%zmm9,%zmm11
     78c:	62 71 7c 48 28 4c 24 	vmovaps 0x400(%rsp),%zmm9
     793:	10
     794:	62 52 3d 48 b8 d8    	vfmadd231ps %zmm8,%zmm8,%zmm11
     79a:	62 31 54 48 5c c5    	vsubps %zmm21,%zmm5,%zmm8
     7a0:	62 c2 7d 48 4e d3    	vrsqrt14ps %zmm11,%zmm18
     7a6:	62 71 7c 48 29 44 24 	vmovaps %zmm8,0x380(%rsp)
     7ad:	0e
     7ae:	62 31 24 48 59 da    	vmulps %zmm18,%zmm11,%zmm11
     7b4:	62 72 6d 40 a8 df    	vfmadd213ps %zmm7,%zmm18,%zmm11
     7ba:	62 a1 6c 40 59 d7    	vmulps %zmm23,%zmm18,%zmm18
     7c0:	62 c1 6c 40 59 d3    	vmulps %zmm11,%zmm18,%zmm18
     7c6:	62 61 6c 40 59 44 24 	vmulps 0x480(%rsp),%zmm18,%zmm24
     7cd:	12
     7ce:	62 71 34 48 5c f8    	vsubps %zmm0,%zmm9,%zmm15
     7d4:	62 d1 7c 48 28 ef    	vmovaps %zmm15,%zmm5
     7da:	62 f2 05 48 a8 ee    	vfmadd213ps %zmm6,%zmm15,%zmm5
     7e0:	62 d2 3d 48 b8 e8    	vfmadd231ps %zmm8,%zmm8,%zmm5
     7e6:	62 71 74 48 59 44 24 	vmulps 0x500(%rsp),%zmm1,%zmm8
     7ed:	14
     7ee:	62 f1 74 48 59 c9    	vmulps %zmm1,%zmm1,%zmm1
     7f4:	62 f2 65 48 b8 eb    	vfmadd231ps %zmm3,%zmm3,%zmm5
     7fa:	62 f1 2c 48 59 5c 24 	vmulps 0x140(%rsp),%zmm10,%zmm3
     801:	05
     802:	62 51 2c 48 59 d2    	vmulps %zmm10,%zmm10,%zmm10
     808:	62 72 7d 48 4e dd    	vrsqrt14ps %zmm5,%zmm11
     80e:	62 d1 54 48 59 eb    	vmulps %zmm11,%zmm5,%zmm5
     814:	62 f2 25 48 a8 ef    	vfmadd213ps %zmm7,%zmm11,%zmm5
     81a:	62 31 24 48 59 df    	vmulps %zmm23,%zmm11,%zmm11
     820:	62 51 74 48 59 c0    	vmulps %zmm8,%zmm1,%zmm8
     826:	62 91 7c 48 28 4c 2e 	vmovaps 0x780(%r14,%r13,1),%zmm1
     82d:	1e
     82e:	62 61 2c 48 59 cb    	vmulps %zmm3,%zmm10,%zmm25
     834:	c4 41 28 57 d2       	vxorps %xmm10,%xmm10,%xmm10
     839:	c4 63 29 0c 8c 24 00 	vblendps $0x1,0x100(%rsp),%xmm10,%xmm9
     840:	01 00 00 01
     844:	62 f1 24 48 59 ed    	vmulps %zmm5,%zmm11,%zmm5
     84a:	c4 63 29 0c 9c 24 c0 	vblendps $0x1,0xc0(%rsp),%xmm10,%xmm11
     851:	00 00 00 01
     855:	c4 e3 29 0c 9c 24 80 	vblendps $0x1,0x80(%rsp),%xmm10,%xmm3
     85c:	00 00 00 01
     860:	62 71 7c 48 28 54 24 	vmovaps 0x180(%rsp),%zmm10
     867:	06
     868:	62 e1 2c 48 5c c8    	vsubps %zmm0,%zmm10,%zmm17
     86e:	62 f1 7c 48 28 44 24 	vmovaps 0x300(%rsp),%zmm0
     875:	0c
     876:	62 71 7c 48 28 54 24 	vmovaps 0x2c0(%rsp),%zmm10
     87d:	0b
     87e:	62 f1 7c 48 29 5c 24 	vmovaps %zmm3,0x100(%rsp)
     885:	04
     886:	62 b1 6c 40 59 da    	vmulps %zmm18,%zmm18,%zmm3
     88c:	62 e1 54 48 59 54 24 	vmulps 0x280(%rsp),%zmm5,%zmm18
     893:	0a
     894:	62 f1 54 48 59 ed    	vmulps %zmm5,%zmm5,%zmm5
     89a:	62 12 35 40 b8 de    	vfmadd231ps %zmm30,%zmm25,%zmm11
     8a0:	62 01 7c 48 28 74 2e 	vmovaps 0x600(%r14,%r13,1),%zmm30
     8a7:	18
     8a8:	62 52 35 40 b8 cc    	vfmadd231ps %zmm12,%zmm25,%zmm9
     8ae:	62 01 64 48 59 c0    	vmulps %zmm24,%zmm3,%zmm24
     8b4:	62 f1 14 40 5c da    	vsubps %zmm2,%zmm29,%zmm3
     8ba:	62 21 7c 48 28 e8    	vmovaps %zmm16,%zmm29
     8c0:	62 e1 7c 48 29 4c 24 	vmovaps %zmm17,0x80(%rsp)
     8c7:	02
     8c8:	62 e2 75 40 a8 ce    	vfmadd213ps %zmm6,%zmm17,%zmm17
     8ce:	62 62 15 40 a8 ee    	vfmadd213ps %zmm6,%zmm29,%zmm29
     8d4:	62 f1 3c 48 59 f4    	vmulps %zmm4,%zmm8,%zmm6
     8da:	62 32 3d 40 b8 cb    	vfmadd231ps %zmm19,%zmm24,%zmm9
     8e0:	62 81 7c 48 28 5c 2e 	vmovaps 0x700(%r14,%r13,1),%zmm19
     8e7:	1c
     8e8:	62 f1 7c 48 29 5c 24 	vmovaps %zmm3,0x1c0(%rsp)
     8ef:	07
     8f0:	62 a1 54 48 59 c2    	vmulps %zmm18,%zmm5,%zmm16
     8f6:	62 e1 7c 48 28 54 24 	vmovaps 0x640(%rsp),%zmm18
     8fd:	19
     8fe:	62 d2 7d 40 b8 f7    	vfmadd231ps %zmm15,%zmm16,%zmm6
     904:	62 11 7c 48 28 fc    	vmovaps %zmm28,%zmm15
     90a:	62 b1 7c 48 5c c5    	vsubps %zmm21,%zmm0,%zmm0
     910:	62 31 2c 48 5c d5    	vsubps %zmm21,%zmm10,%zmm10
     916:	62 e1 7c 48 28 6c 24 	vmovaps 0x5c0(%rsp),%zmm21
     91d:	17
     91e:	62 11 7c 48 28 e6    	vmovaps %zmm30,%zmm12
     924:	62 e2 7d 48 b8 c8    	vfmadd231ps %zmm0,%zmm0,%zmm17
     92a:	62 42 2d 48 b8 ea    	vfmadd231ps %zmm10,%zmm10,%zmm29
     930:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0xc0(%rsp)
     937:	03
     938:	62 71 7c 48 29 54 24 	vmovaps %zmm10,0x140(%rsp)
     93f:	05
     940:	62 51 7c 48 28 d3    	vmovaps %zmm11,%zmm10
     946:	62 71 7c 48 28 de    	vmovaps %zmm6,%zmm11
     94c:	62 62 65 48 b8 eb    	vfmadd231ps %zmm3,%zmm3,%zmm29
     952:	62 b1 7c 48 28 f3    	vmovaps %zmm19,%zmm6
     958:	62 92 7d 48 4e c5    	vrsqrt14ps %zmm29,%zmm0
     95e:	62 f1 14 40 59 e8    	vmulps %zmm0,%zmm29,%zmm5
     964:	62 f2 7d 48 a8 ef    	vfmadd213ps %zmm7,%zmm0,%zmm5
     96a:	62 e1 54 40 5c ea    	vsubps %zmm2,%zmm21,%zmm21
     970:	62 a2 55 40 b8 cd    	vfmadd231ps %zmm21,%zmm21,%zmm17
     976:	62 e1 7c 48 29 6c 24 	vmovaps %zmm21,0x180(%rsp)
     97d:	06
     97e:	62 81 3c 48 59 ea    	vmulps %zmm26,%zmm8,%zmm21
     984:	62 01 7c 48 28 54 2e 	vmovaps 0x940(%r14,%r13,1),%zmm26
     98b:	25
     98c:	62 b2 7d 48 4e d1    	vrsqrt14ps %zmm17,%zmm2
     992:	62 e2 7d 40 b8 6c 24 	vfmadd231ps 0x380(%rsp),%zmm16,%zmm21
     999:	0e
     99a:	62 f1 74 40 59 e2    	vmulps %zmm2,%zmm17,%zmm4
     9a0:	62 f2 6d 48 a8 e7    	vfmadd213ps %zmm7,%zmm2,%zmm4
     9a6:	62 b1 6c 48 59 d7    	vmulps %zmm23,%zmm2,%zmm2
     9ac:	62 f1 6c 48 59 d4    	vmulps %zmm4,%zmm2,%zmm2
     9b2:	62 b1 7c 48 59 e7    	vmulps %zmm23,%zmm0,%zmm4
     9b8:	62 81 3c 48 59 fb    	vmulps %zmm27,%zmm8,%zmm23
     9be:	62 51 7c 48 28 c1    	vmovaps %zmm9,%zmm8
     9c4:	62 01 7c 48 28 5c 2e 	vmovaps 0xb40(%r14,%r13,1),%zmm27
     9cb:	2d
     9cc:	62 61 5c 48 59 ed    	vmulps %zmm5,%zmm4,%zmm29
     9d2:	62 f1 7c 48 28 6c 24 	vmovaps 0x100(%rsp),%zmm5
     9d9:	04
     9da:	62 f1 6c 48 59 e2    	vmulps %zmm2,%zmm2,%zmm4
     9e0:	62 f1 4c 40 59 d2    	vmulps %zmm2,%zmm22,%zmm2
     9e6:	62 81 7c 48 28 74 2e 	vmovaps 0x640(%r14,%r13,1),%zmm22
     9ed:	19
     9ee:	62 e1 5c 48 59 ca    	vmulps %zmm2,%zmm4,%zmm17
     9f4:	62 91 7c 48 28 54 2e 	vmovaps 0x6c0(%r14,%r13,1),%zmm2
     9fb:	1b
     9fc:	62 91 7c 48 28 64 2e 	vmovaps 0x740(%r14,%r13,1),%zmm4
     a03:	1d
     a04:	62 72 75 40 b8 44 24 	vfmadd231ps 0x80(%rsp),%zmm17,%zmm8
     a0b:	02
     a0c:	62 71 7c 48 29 44 24 	vmovaps %zmm8,0x80(%rsp)
     a13:	02
     a14:	62 71 7c 48 28 44 24 	vmovaps 0x600(%rsp),%zmm8
     a1b:	18
     a1c:	62 92 35 40 b8 ef    	vfmadd231ps %zmm31,%zmm25,%zmm5
     a22:	62 01 7c 48 28 7c 2e 	vmovaps 0x680(%r14,%r13,1),%zmm31
     a29:	1a
     a2a:	62 01 7c 48 28 4c 2e 	vmovaps 0x7c0(%r14,%r13,1),%zmm25
     a31:	1f
     a32:	62 32 6d 40 7f e6    	vpermt2ps %zmm22,%zmm18,%zmm12
     a38:	62 f2 15 48 7f f4    	vpermt2ps %zmm4,%zmm13,%zmm6
     a3e:	62 f2 3d 40 b8 6c 24 	vfmadd231ps 0x3c0(%rsp),%zmm24,%zmm5
     a45:	0f
     a46:	62 f2 75 40 b8 6c 24 	vfmadd231ps 0x180(%rsp),%zmm17,%zmm5
     a4d:	06
     a4e:	62 91 7c 48 28 ff    	vmovaps %zmm31,%zmm7
     a54:	62 11 7c 48 28 cf    	vmovaps %zmm31,%zmm9
     a5a:	62 f2 6d 40 7f fa    	vpermt2ps %zmm2,%zmm18,%zmm7
     a60:	62 72 15 48 7f ca    	vpermt2ps %zmm2,%zmm13,%zmm9
     a66:	62 f1 7c 48 29 6c 24 	vmovaps %zmm5,0x100(%rsp)
     a6d:	04
     a6e:	62 f1 7c 48 28 6c 24 	vmovaps 0x700(%rsp),%zmm5
     a75:	1c
     a76:	62 f3 9d 48 23 c7 e4 	vshuff64x2 $0xe4,%zmm7,%zmm12,%zmm0
     a7d:	62 f1 7c 48 28 f9    	vmovaps %zmm1,%zmm7
     a83:	62 31 7c 48 28 e3    	vmovaps %zmm19,%zmm12
     a89:	62 92 6d 40 7f f9    	vpermt2ps %zmm25,%zmm18,%zmm7
     a8f:	62 72 6d 40 7f e4    	vpermt2ps %zmm4,%zmm18,%zmm12
     a95:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x4c0(%rsp)
     a9c:	13
     a9d:	62 f3 9d 48 23 df e4 	vshuff64x2 $0xe4,%zmm7,%zmm12,%zmm3
     aa4:	62 71 7c 48 28 e1    	vmovaps %zmm1,%zmm12
     aaa:	62 91 7c 48 28 fe    	vmovaps %zmm30,%zmm7
     ab0:	62 12 15 48 7f e1    	vpermt2ps %zmm25,%zmm13,%zmm12
     ab6:	62 b2 15 48 7f fe    	vpermt2ps %zmm22,%zmm13,%zmm7
     abc:	62 f1 fd 48 29 5c 24 	vmovapd %zmm3,0x500(%rsp)
     ac3:	14
     ac4:	62 d3 cd 48 23 f4 e4 	vshuff64x2 $0xe4,%zmm12,%zmm6,%zmm6
     acb:	62 53 c5 48 23 c9 e4 	vshuff64x2 $0xe4,%zmm9,%zmm7,%zmm9
     ad2:	62 91 7c 48 28 ff    	vmovaps %zmm31,%zmm7
     ad8:	62 62 1d 40 7f fa    	vpermt2ps %zmm2,%zmm28,%zmm31
     ade:	62 11 7c 48 28 64 2e 	vmovaps 0x980(%r14,%r13,1),%zmm12
     ae5:	26
     ae6:	62 f2 0d 48 7f fa    	vpermt2ps %zmm2,%zmm14,%zmm7
     aec:	62 f1 7c 48 28 d1    	vmovaps %zmm1,%zmm2
     af2:	62 92 1d 40 7f c9    	vpermt2ps %zmm25,%zmm28,%zmm1
     af8:	62 f1 fd 48 29 74 24 	vmovapd %zmm6,0x440(%rsp)
     aff:	11
     b00:	62 91 7c 48 28 f6    	vmovaps %zmm30,%zmm6
     b06:	62 92 0d 48 7f d1    	vpermt2ps %zmm25,%zmm14,%zmm2
     b0c:	62 22 1d 40 7f f6    	vpermt2ps %zmm22,%zmm28,%zmm30
     b12:	62 01 7c 48 28 4c 2e 	vmovaps 0x880(%r14,%r13,1),%zmm25
     b19:	22
     b1a:	62 b2 0d 48 7f f6    	vpermt2ps %zmm22,%zmm14,%zmm6
     b20:	62 f3 cd 48 23 df e4 	vshuff64x2 $0xe4,%zmm7,%zmm6,%zmm3
     b27:	62 91 5c 40 59 f5    	vmulps %zmm29,%zmm20,%zmm6
     b2d:	62 93 8d 40 23 ff e4 	vshuff64x2 $0xe4,%zmm31,%zmm30,%zmm7
     b34:	62 01 7c 48 28 74 2e 	vmovaps 0x9c0(%r14,%r13,1),%zmm30
     b3b:	27
     b3c:	62 c1 7c 48 28 e4    	vmovaps %zmm12,%zmm20
     b42:	62 01 7c 48 28 7c 2e 	vmovaps 0xa80(%r14,%r13,1),%zmm31
     b49:	2a
     b4a:	62 f1 fd 48 29 5c 24 	vmovapd %zmm3,0x400(%rsp)
     b51:	10
     b52:	62 d1 7c 48 28 da    	vmovaps %zmm10,%zmm3
     b58:	62 31 7c 48 28 d3    	vmovaps %zmm19,%zmm10
     b5e:	62 e2 1d 40 7f dc    	vpermt2ps %zmm4,%zmm28,%zmm19
     b64:	62 f1 fd 48 29 7c 24 	vmovapd %zmm7,0x280(%rsp)
     b6b:	0a
     b6c:	62 72 0d 48 7f d4    	vpermt2ps %zmm4,%zmm14,%zmm10
     b72:	62 91 7c 48 28 64 2e 	vmovaps 0x800(%r14,%r13,1),%zmm4
     b79:	20
     b7a:	62 f2 3d 40 b8 5c 24 	vfmadd231ps 0x200(%rsp),%zmm24,%zmm3
     b81:	08
     b82:	62 01 7c 48 28 44 2e 	vmovaps 0x8c0(%r14,%r13,1),%zmm24
     b89:	23
     b8a:	62 f2 75 40 b8 5c 24 	vfmadd231ps 0xc0(%rsp),%zmm17,%zmm3
     b91:	03
     b92:	62 82 15 48 7f e6    	vpermt2ps %zmm30,%zmm13,%zmm20
     b98:	62 81 7c 48 28 cf    	vmovaps %zmm31,%zmm17
     b9e:	62 f3 e5 40 23 c1 e4 	vshuff64x2 $0xe4,%zmm1,%zmm19,%zmm0
     ba5:	62 91 7c 48 28 4c 2e 	vmovaps 0x840(%r14,%r13,1),%zmm1
     bac:	21
     bad:	62 81 7c 48 28 d9    	vmovaps %zmm25,%zmm19
     bb3:	62 f3 ad 48 23 d2 e4 	vshuff64x2 $0xe4,%zmm2,%zmm10,%zmm2
     bba:	62 11 14 40 59 d5    	vmulps %zmm29,%zmm29,%zmm10
     bc0:	62 01 7c 48 28 6c 2e 	vmovaps 0x900(%r14,%r13,1),%zmm29
     bc7:	24
     bc8:	62 82 6d 40 7f d8    	vpermt2ps %zmm24,%zmm18,%zmm19
     bce:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x480(%rsp)
     bd5:	12
     bd6:	62 f1 7c 48 29 5c 24 	vmovaps %zmm3,0xc0(%rsp)
     bdd:	03
     bde:	62 f1 fd 48 29 54 24 	vmovapd %zmm2,0x200(%rsp)
     be5:	08
     be6:	62 b1 7c 48 28 d7    	vmovaps %zmm23,%zmm2
     bec:	62 e1 2c 48 59 fe    	vmulps %zmm6,%zmm10,%zmm23
     bf2:	62 f1 7c 48 28 f4    	vmovaps %zmm4,%zmm6
     bf8:	62 51 7c 48 28 d4    	vmovaps %zmm12,%zmm10
     bfe:	62 12 6d 40 7f d6    	vpermt2ps %zmm30,%zmm18,%zmm10
     c04:	62 72 45 40 b8 5c 24 	vfmadd231ps 0x340(%rsp),%zmm23,%zmm11
     c0b:	0d
     c0c:	62 f2 7d 40 b8 54 24 	vfmadd231ps 0x240(%rsp),%zmm16,%zmm2
     c13:	09
     c14:	62 e2 45 40 b8 6c 24 	vfmadd231ps 0x140(%rsp),%zmm23,%zmm21
     c1b:	05
     c1c:	62 f2 6d 40 7f f1    	vpermt2ps %zmm1,%zmm18,%zmm6
     c22:	62 f2 45 40 b8 54 24 	vfmadd231ps 0x1c0(%rsp),%zmm23,%zmm2
     c29:	07
     c2a:	62 e1 7c 48 29 6c 24 	vmovaps %zmm21,0x140(%rsp)
     c31:	05
     c32:	62 71 7c 48 29 5c 24 	vmovaps %zmm11,0x240(%rsp)
     c39:	09
     c3a:	62 81 7c 48 28 f5    	vmovaps %zmm29,%zmm22
     c40:	62 91 7c 48 28 fd    	vmovaps %zmm29,%zmm7
     c46:	62 82 6d 40 7f f2    	vpermt2ps %zmm26,%zmm18,%zmm22
     c4c:	62 92 15 48 7f fa    	vpermt2ps %zmm26,%zmm13,%zmm7
     c52:	62 f1 7c 48 29 54 24 	vmovaps %zmm2,0x1c0(%rsp)
     c59:	07
     c5a:	62 b3 cd 48 23 c3 e4 	vshuff64x2 $0xe4,%zmm19,%zmm6,%zmm0
     c61:	62 91 7c 48 28 f1    	vmovaps %zmm25,%zmm6
     c67:	62 92 15 48 7f f0    	vpermt2ps %zmm24,%zmm13,%zmm6
     c6d:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x380(%rsp)
     c74:	0e
     c75:	62 d3 cd 40 23 c2 e4 	vshuff64x2 $0xe4,%zmm10,%zmm22,%zmm0
     c7c:	62 71 7c 48 28 d4    	vmovaps %zmm4,%zmm10
     c82:	62 72 0d 48 7f d1    	vpermt2ps %zmm1,%zmm14,%zmm10
     c88:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x3c0(%rsp)
     c8f:	0f
     c90:	62 f1 7c 48 28 c4    	vmovaps %zmm4,%zmm0
     c96:	62 f2 1d 40 7f e1    	vpermt2ps %zmm1,%zmm28,%zmm4
     c9c:	62 f2 15 48 7f c1    	vpermt2ps %zmm1,%zmm13,%zmm0
     ca2:	62 f3 fd 48 23 c6 e4 	vshuff64x2 $0xe4,%zmm6,%zmm0,%zmm0
     ca9:	62 91 7c 48 28 74 2e 	vmovaps 0xb80(%r14,%r13,1),%zmm6
     cb0:	2e
     cb1:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x2c0(%rsp)
     cb8:	0b
     cb9:	62 b3 c5 48 23 c4 e4 	vshuff64x2 $0xe4,%zmm20,%zmm7,%zmm0
     cc0:	62 f1 7c 48 28 7c 24 	vmovaps 0x6c0(%rsp),%zmm7
     cc7:	1b
     cc8:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x340(%rsp)
     ccf:	0d
     cd0:	62 91 7c 48 28 c1    	vmovaps %zmm25,%zmm0
     cd6:	62 02 1d 40 7f c8    	vpermt2ps %zmm24,%zmm28,%zmm25
     cdc:	62 92 0d 48 7f c0    	vpermt2ps %zmm24,%zmm14,%zmm0
     ce2:	62 41 7c 48 28 c4    	vmovaps %zmm12,%zmm24
     ce8:	62 12 1d 40 7f e6    	vpermt2ps %zmm30,%zmm28,%zmm12
     cee:	62 02 0d 48 7f c6    	vpermt2ps %zmm30,%zmm14,%zmm24
     cf4:	62 01 7c 48 28 74 2e 	vmovaps 0xa40(%r14,%r13,1),%zmm30
     cfb:	29
     cfc:	62 93 dd 48 23 c9 e4 	vshuff64x2 $0xe4,%zmm25,%zmm4,%zmm1
     d03:	62 91 7c 48 28 64 2e 	vmovaps 0xbc0(%r14,%r13,1),%zmm4
     d0a:	2f
     d0b:	62 e3 ad 48 23 f0 e4 	vshuff64x2 $0xe4,%zmm0,%zmm10,%zmm22
     d12:	62 91 7c 48 28 c5    	vmovaps %zmm29,%zmm0
     d18:	62 02 1d 40 7f ea    	vpermt2ps %zmm26,%zmm28,%zmm29
     d1e:	62 01 7c 48 28 64 2e 	vmovaps 0xa00(%r14,%r13,1),%zmm28
     d25:	28
     d26:	62 11 7c 48 28 54 2e 	vmovaps 0xac0(%r14,%r13,1),%zmm10
     d2d:	2b
     d2e:	62 92 0d 48 7f c2    	vpermt2ps %zmm26,%zmm14,%zmm0
     d34:	62 f1 fd 48 29 4c 24 	vmovapd %zmm1,0x300(%rsp)
     d3b:	0c
     d3c:	62 91 7c 48 28 4c 2e 	vmovaps 0xb00(%r14,%r13,1),%zmm1
     d43:	2c
     d44:	62 61 7c 48 28 54 24 	vmovaps 0x780(%rsp),%zmm26
     d4b:	1e
     d4c:	62 d3 95 40 23 dc e4 	vshuff64x2 $0xe4,%zmm12,%zmm29,%zmm3
     d53:	62 71 34 48 5c e7    	vsubps %zmm7,%zmm9,%zmm12
     d59:	62 71 7c 48 28 4c 24 	vmovaps 0x680(%rsp),%zmm9
     d60:	1a
     d61:	62 81 7c 48 28 fc    	vmovaps %zmm28,%zmm23
     d67:	62 c2 6d 40 7f ca    	vpermt2ps %zmm10,%zmm18,%zmm17
     d6d:	62 83 fd 48 23 c0 e4 	vshuff64x2 $0xe4,%zmm24,%zmm0,%zmm16
     d74:	62 01 7c 48 28 c7    	vmovaps %zmm31,%zmm24
     d7a:	62 01 7c 48 28 cc    	vmovaps %zmm28,%zmm25
     d80:	62 82 6d 40 7f fe    	vpermt2ps %zmm30,%zmm18,%zmm23
     d86:	62 42 15 48 7f c2    	vpermt2ps %zmm10,%zmm13,%zmm24
     d8c:	62 02 15 48 7f ce    	vpermt2ps %zmm30,%zmm13,%zmm25
     d92:	62 f1 fd 48 29 5c 24 	vmovapd %zmm3,0x180(%rsp)
     d99:	06
     d9a:	62 f1 7c 48 28 5c 24 	vmovaps 0x4c0(%rsp),%zmm3
     da1:	13
     da2:	62 c1 7c 40 5c c1    	vsubps %zmm9,%zmm16,%zmm16
     da8:	62 b3 c5 40 23 c1 e4 	vshuff64x2 $0xe4,%zmm17,%zmm23,%zmm0
     daf:	62 e1 7c 48 28 ce    	vmovaps %zmm6,%zmm17
     db5:	62 e1 7c 48 28 f9    	vmovaps %zmm1,%zmm23
     dbb:	62 93 b5 40 23 d0 e4 	vshuff64x2 $0xe4,%zmm24,%zmm25,%zmm2
     dc2:	62 e2 6d 40 7f cc    	vpermt2ps %zmm4,%zmm18,%zmm17
     dc8:	62 82 15 48 7f fb    	vpermt2ps %zmm27,%zmm13,%zmm23
     dce:	62 71 64 48 5c dd    	vsubps %zmm5,%zmm3,%zmm11
     dd4:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x540(%rsp)
     ddb:	15
     ddc:	62 f1 7c 48 28 c1    	vmovaps %zmm1,%zmm0
     de2:	62 f1 fd 48 29 54 24 	vmovapd %zmm2,0x8c0(%rsp)
     de9:	23
     dea:	62 f1 7c 48 28 54 24 	vmovaps 0x400(%rsp),%zmm2
     df1:	10
     df2:	62 92 6d 40 7f c3    	vpermt2ps %zmm27,%zmm18,%zmm0
     df8:	62 b3 fd 48 23 c1 e4 	vshuff64x2 $0xe4,%zmm17,%zmm0,%zmm0
     dff:	62 e1 7c 48 28 ce    	vmovaps %zmm6,%zmm17
     e05:	62 c1 6c 48 5c e1    	vsubps %zmm9,%zmm2,%zmm20
     e0b:	62 f1 7c 48 28 54 24 	vmovaps 0x440(%rsp),%zmm2
     e12:	11
     e13:	62 e1 7c 48 29 44 24 	vmovaps %zmm16,0x440(%rsp)
     e1a:	11
     e1b:	62 e2 15 48 7f cc    	vpermt2ps %zmm4,%zmm13,%zmm17
     e21:	62 f1 fd 48 29 44 24 	vmovapd %zmm0,0x880(%rsp)
     e28:	22
     e29:	62 a3 c5 40 23 e9 e4 	vshuff64x2 $0xe4,%zmm17,%zmm23,%zmm21
     e30:	62 81 7c 48 28 cf    	vmovaps %zmm31,%zmm17
     e36:	62 42 05 48 7f fa    	vpermt2ps %zmm10,%zmm15,%zmm31
     e3c:	62 e1 7c 48 28 7c 24 	vmovaps 0x740(%rsp),%zmm23
     e43:	1d
     e44:	62 f1 6c 48 5c d7    	vsubps %zmm7,%zmm2,%zmm2
     e4a:	62 c2 0d 48 7f ca    	vpermt2ps %zmm10,%zmm14,%zmm17
     e50:	62 11 7c 48 28 d4    	vmovaps %zmm28,%zmm10
     e56:	62 f1 7c 48 29 54 24 	vmovaps %zmm2,0x580(%rsp)
     e5d:	16
     e5e:	62 02 05 48 7f e6    	vpermt2ps %zmm30,%zmm15,%zmm28
     e64:	62 12 0d 48 7f d6    	vpermt2ps %zmm30,%zmm14,%zmm10
     e6a:	62 03 9d 40 23 f7 e4 	vshuff64x2 $0xe4,%zmm31,%zmm28,%zmm30
     e71:	62 a3 ad 48 23 d9 e4 	vshuff64x2 $0xe4,%zmm17,%zmm10,%zmm19
     e78:	62 51 7c 48 28 d3    	vmovaps %zmm11,%zmm10
     e7e:	62 52 25 48 a8 d0    	vfmadd213ps %zmm8,%zmm11,%zmm10
     e84:	62 52 1d 48 b8 d4    	vfmadd231ps %zmm12,%zmm12,%zmm10
     e8a:	62 32 5d 40 b8 d4    	vfmadd231ps %zmm20,%zmm20,%zmm10
     e90:	62 c2 7d 48 4e ca    	vrsqrt14ps %zmm10,%zmm17
     e96:	62 31 2c 48 59 d1    	vmulps %zmm17,%zmm10,%zmm10
     e9c:	62 12 75 40 a8 d2    	vfmadd213ps %zmm26,%zmm17,%zmm10
     ea2:	62 a1 74 40 59 cf    	vmulps %zmm23,%zmm17,%zmm17
     ea8:	62 d1 74 40 59 da    	vmulps %zmm10,%zmm17,%zmm3
     eae:	62 71 7c 48 28 54 24 	vmovaps 0x500(%rsp),%zmm10
     eb5:	14
     eb6:	62 61 2c 48 5c cd    	vsubps %zmm5,%zmm10,%zmm25
     ebc:	62 71 7c 48 28 54 24 	vmovaps 0x200(%rsp),%zmm10
     ec3:	08
     ec4:	62 81 7c 48 28 c9    	vmovaps %zmm25,%zmm17
     eca:	62 c2 35 40 a8 c8    	vfmadd213ps %zmm8,%zmm25,%zmm17
     ed0:	62 e2 6d 48 b8 ca    	vfmadd231ps %zmm2,%zmm2,%zmm17
     ed6:	62 d1 2c 48 5c c1    	vsubps %zmm9,%zmm10,%zmm0
     edc:	62 71 7c 48 28 54 24 	vmovaps 0x2c0(%rsp),%zmm10
     ee3:	0b
     ee4:	62 e2 7d 48 b8 c8    	vfmadd231ps %zmm0,%zmm0,%zmm17
     eea:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x5c0(%rsp)
     ef1:	17
     ef2:	62 d1 4c 40 5c c1    	vsubps %zmm9,%zmm22,%zmm0
     ef8:	62 22 7d 48 4e c1    	vrsqrt14ps %zmm17,%zmm24
     efe:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x200(%rsp)
     f05:	08
     f06:	62 81 74 40 59 c8    	vmulps %zmm24,%zmm17,%zmm17
     f0c:	62 82 3d 40 a8 ca    	vfmadd213ps %zmm26,%zmm24,%zmm17
     f12:	62 21 3c 40 59 c7    	vmulps %zmm23,%zmm24,%zmm24
     f18:	62 b1 3c 40 59 d1    	vmulps %zmm17,%zmm24,%zmm2
     f1e:	62 e1 7c 48 28 4c 24 	vmovaps 0x380(%rsp),%zmm17
     f25:	0e
     f26:	62 71 2c 48 5c d7    	vsubps %zmm7,%zmm10,%zmm10
     f2c:	62 71 7c 48 29 54 24 	vmovaps %zmm10,0x380(%rsp)
     f33:	0e
     f34:	62 61 74 40 5c c5    	vsubps %zmm5,%zmm17,%zmm24
     f3a:	62 81 7c 48 28 c8    	vmovaps %zmm24,%zmm17
     f40:	62 c2 3d 40 a8 c8    	vfmadd213ps %zmm8,%zmm24,%zmm17
     f46:	62 c2 2d 48 b8 ca    	vfmadd231ps %zmm10,%zmm10,%zmm17
     f4c:	62 71 7c 48 28 54 24 	vmovaps 0x340(%rsp),%zmm10
     f53:	0d
     f54:	62 e2 7d 48 b8 c8    	vfmadd231ps %zmm0,%zmm0,%zmm17
     f5a:	62 a2 7d 48 4e f1    	vrsqrt14ps %zmm17,%zmm22
     f60:	62 a1 74 40 59 ce    	vmulps %zmm22,%zmm17,%zmm17
     f66:	62 82 4d 40 a8 ca    	vfmadd213ps %zmm26,%zmm22,%zmm17
     f6c:	62 a1 4c 40 59 f7    	vmulps %zmm23,%zmm22,%zmm22
     f72:	62 a1 4c 40 59 f1    	vmulps %zmm17,%zmm22,%zmm22
     f78:	62 e1 7c 48 28 4c 24 	vmovaps 0x3c0(%rsp),%zmm17
     f7f:	0f
     f80:	62 71 2c 48 5c d7    	vsubps %zmm7,%zmm10,%zmm10
     f86:	62 71 7c 48 29 54 24 	vmovaps %zmm10,0x340(%rsp)
     f8d:	0d
     f8e:	62 e1 74 40 5c cd    	vsubps %zmm5,%zmm17,%zmm17
     f94:	62 21 7c 48 28 e9    	vmovaps %zmm17,%zmm29
     f9a:	62 42 75 40 a8 e8    	vfmadd213ps %zmm8,%zmm17,%zmm29
     fa0:	62 42 2d 48 b8 ea    	vfmadd231ps %zmm10,%zmm10,%zmm29
     fa6:	62 71 64 48 59 54 24 	vmulps 0x280(%rsp),%zmm3,%zmm10
     fad:	0a
     fae:	62 f1 64 48 59 db    	vmulps %zmm3,%zmm3,%zmm3
     fb4:	62 22 7d 40 b8 e8    	vfmadd231ps %zmm16,%zmm16,%zmm29
     fba:	62 92 7d 48 4e c5    	vrsqrt14ps %zmm29,%zmm0
     fc0:	62 e1 14 40 59 c0    	vmulps %zmm0,%zmm29,%zmm16
     fc6:	62 61 7c 48 28 e9    	vmovaps %zmm1,%zmm29
     fcc:	62 92 05 48 7f cb    	vpermt2ps %zmm27,%zmm15,%zmm1
     fd2:	62 82 7d 48 a8 c2    	vfmadd213ps %zmm26,%zmm0,%zmm16
     fd8:	62 b1 7c 48 59 c7    	vmulps %zmm23,%zmm0,%zmm0
     fde:	62 d1 64 48 59 da    	vmulps %zmm10,%zmm3,%zmm3
     fe4:	62 02 0d 48 7f eb    	vpermt2ps %zmm27,%zmm14,%zmm29
     fea:	62 a1 7c 48 59 c0    	vmulps %zmm16,%zmm0,%zmm16
     ff0:	62 f1 7c 48 28 c6    	vmovaps %zmm6,%zmm0
     ff6:	62 f2 05 48 7f f4    	vpermt2ps %zmm4,%zmm15,%zmm6
     ffc:	62 f2 0d 48 7f c4    	vpermt2ps %zmm4,%zmm14,%zmm0
    1002:	62 f1 6c 48 59 64 24 	vmulps 0x480(%rsp),%zmm2,%zmm4
    1009:	12
    100a:	62 f1 6c 48 59 d2    	vmulps %zmm2,%zmm2,%zmm2
    1010:	62 61 6c 48 59 e4    	vmulps %zmm4,%zmm2,%zmm28
    1016:	62 f1 4c 40 59 54 24 	vmulps 0x300(%rsp),%zmm22,%zmm2
    101d:	0c
    101e:	62 73 f5 48 23 d6 e4 	vshuff64x2 $0xe4,%zmm6,%zmm1,%zmm10
    1025:	62 b1 4c 40 59 ce    	vmulps %zmm22,%zmm22,%zmm1
    102b:	62 d1 64 40 5c f1    	vsubps %zmm9,%zmm19,%zmm6
    1031:	62 e1 7c 40 59 5c 24 	vmulps 0x180(%rsp),%zmm16,%zmm19
    1038:	06
    1039:	62 a1 7c 40 59 c0    	vmulps %zmm16,%zmm16,%zmm16
    103f:	62 63 95 40 23 e8 e4 	vshuff64x2 $0xe4,%zmm0,%zmm29,%zmm29
    1046:	62 f1 7c 48 28 44 24 	vmovaps 0x880(%rsp),%zmm0
    104d:	22
    104e:	62 f1 7c 48 29 74 24 	vmovaps %zmm6,0x280(%rsp)
    1055:	0a
    1056:	62 e1 74 48 59 f2    	vmulps %zmm2,%zmm1,%zmm22
    105c:	62 f1 7c 48 28 54 24 	vmovaps 0x540(%rsp),%zmm2
    1063:	15
    1064:	62 91 7c 48 28 4c 2e 	vmovaps 0xd80(%r14,%r13,1),%zmm1
    106b:	36
    106c:	62 51 14 40 5c c9    	vsubps %zmm9,%zmm29,%zmm9
    1072:	62 71 7c 48 29 4c 24 	vmovaps %zmm9,0x4c0(%rsp)
    1079:	13
    107a:	62 61 7c 48 5c dd    	vsubps %zmm5,%zmm0,%zmm27
    1080:	62 f1 7c 48 28 c7    	vmovaps %zmm7,%zmm0
    1086:	62 f1 6c 48 5c e5    	vsubps %zmm5,%zmm2,%zmm4
    108c:	62 f1 7c 48 28 54 24 	vmovaps 0x8c0(%rsp),%zmm2
    1093:	23
    1094:	62 91 7c 48 28 eb    	vmovaps %zmm27,%zmm5
    109a:	62 61 7c 48 29 5c 24 	vmovaps %zmm27,0x400(%rsp)
    10a1:	10
    10a2:	62 f1 7c 48 29 64 24 	vmovaps %zmm4,0x3c0(%rsp)
    10a9:	0f
    10aa:	62 d2 5d 48 a8 e0    	vfmadd213ps %zmm8,%zmm4,%zmm4
    10b0:	62 d2 25 40 a8 e8    	vfmadd213ps %zmm8,%zmm27,%zmm5
    10b6:	62 21 7c 40 59 db    	vmulps %zmm19,%zmm16,%zmm27
    10bc:	62 11 7c 48 28 44 2e 	vmovaps 0xcc0(%r14,%r13,1),%zmm8
    10c3:	33
    10c4:	62 81 7c 48 28 44 2e 	vmovaps 0xc40(%r14,%r13,1),%zmm16
    10cb:	31
    10cc:	62 f1 6c 48 5c ff    	vsubps %zmm7,%zmm2,%zmm7
    10d2:	62 f1 54 40 5c d0    	vsubps %zmm0,%zmm21,%zmm2
    10d8:	62 e1 7c 48 28 e8    	vmovaps %zmm0,%zmm21
    10de:	62 f1 7c 48 28 44 24 	vmovaps 0x80(%rsp),%zmm0
    10e5:	02
    10e6:	62 f2 45 48 b8 e7    	vfmadd231ps %zmm7,%zmm7,%zmm4
    10ec:	62 f1 7c 48 29 7c 24 	vmovaps %zmm7,0x480(%rsp)
    10f3:	12
    10f4:	62 f2 6d 48 b8 ea    	vfmadd231ps %zmm2,%zmm2,%zmm5
    10fa:	62 f1 7c 48 29 54 24 	vmovaps %zmm2,0x500(%rsp)
    1101:	14
    1102:	62 91 7c 48 28 54 2e 	vmovaps 0xc00(%r14,%r13,1),%zmm2
    1109:	30
    110a:	62 f2 4d 48 b8 e6    	vfmadd231ps %zmm6,%zmm6,%zmm4
    1110:	62 d2 35 48 b8 e9    	vfmadd231ps %zmm9,%zmm9,%zmm5
    1116:	62 91 7c 48 28 74 2e 	vmovaps 0xdc0(%r14,%r13,1),%zmm6
    111d:	37
    111e:	62 f2 7d 48 4e fc    	vrsqrt14ps %zmm4,%zmm7
    1124:	62 f1 5c 48 59 e7    	vmulps %zmm7,%zmm4,%zmm4
    112a:	62 92 45 48 a8 e2    	vfmadd213ps %zmm26,%zmm7,%zmm4
    1130:	62 b1 44 48 59 ff    	vmulps %zmm23,%zmm7,%zmm7
    1136:	62 f1 44 48 59 e4    	vmulps %zmm4,%zmm7,%zmm4
    113c:	62 f2 7d 48 4e fd    	vrsqrt14ps %zmm5,%zmm7
    1142:	62 f1 54 48 59 ef    	vmulps %zmm7,%zmm5,%zmm5
    1148:	62 d2 65 48 b8 c3    	vfmadd231ps %zmm11,%zmm3,%zmm0
    114e:	62 92 45 48 a8 ea    	vfmadd213ps %zmm26,%zmm7,%zmm5
    1154:	62 b1 44 48 59 ff    	vmulps %zmm23,%zmm7,%zmm7
    115a:	62 71 7c 48 28 d9    	vmovaps %zmm1,%zmm11
    1160:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x80(%rsp)
    1167:	02
    1168:	62 f1 7c 48 28 44 24 	vmovaps 0xc0(%rsp),%zmm0
    116f:	03
    1170:	62 71 44 48 59 cd    	vmulps %zmm5,%zmm7,%zmm9
    1176:	62 91 7c 48 28 6c 2e 	vmovaps 0xc80(%r14,%r13,1),%zmm5
    117d:	32
    117e:	62 f1 0c 40 59 fc    	vmulps %zmm4,%zmm30,%zmm7
    1184:	62 61 7c 48 28 ea    	vmovaps %zmm2,%zmm29
    118a:	62 f1 5c 48 59 e4    	vmulps %zmm4,%zmm4,%zmm4
    1190:	62 61 7c 48 28 f2    	vmovaps %zmm2,%zmm30
    1196:	62 61 5c 48 59 ff    	vmulps %zmm7,%zmm4,%zmm31
    119c:	62 91 7c 48 28 64 2e 	vmovaps 0xd40(%r14,%r13,1),%zmm4
    11a3:	35
    11a4:	62 22 15 48 7f e8    	vpermt2ps %zmm16,%zmm13,%zmm29
    11aa:	62 22 6d 40 7f f0    	vpermt2ps %zmm16,%zmm18,%zmm30
    11b0:	62 72 15 48 7f de    	vpermt2ps %zmm6,%zmm13,%zmm11
    11b6:	62 51 2c 48 59 d1    	vmulps %zmm9,%zmm10,%zmm10
    11bc:	62 51 34 48 59 c9    	vmulps %zmm9,%zmm9,%zmm9
    11c2:	62 f1 7c 48 28 f9    	vmovaps %zmm1,%zmm7
    11c8:	62 f2 6d 40 7f fe    	vpermt2ps %zmm6,%zmm18,%zmm7
    11ce:	62 d2 65 48 b8 c4    	vfmadd231ps %zmm12,%zmm3,%zmm0
    11d4:	62 71 7c 48 28 64 24 	vmovaps 0x100(%rsp),%zmm12
    11db:	04
    11dc:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0xc0(%rsp)
    11e3:	03
    11e4:	62 91 7c 48 28 44 2e 	vmovaps 0xd00(%r14,%r13,1),%zmm0
    11eb:	34
    11ec:	62 32 65 48 b8 e4    	vfmadd231ps %zmm20,%zmm3,%zmm12
    11f2:	62 e1 7c 48 28 e5    	vmovaps %zmm5,%zmm20
    11f8:	62 f1 7c 48 28 dd    	vmovaps %zmm5,%zmm3
    11fe:	62 c2 15 48 7f e0    	vpermt2ps %zmm8,%zmm13,%zmm20
    1204:	62 d2 6d 40 7f d8    	vpermt2ps %zmm8,%zmm18,%zmm3
    120a:	62 e1 7c 48 28 d8    	vmovaps %zmm0,%zmm19
    1210:	62 72 4d 40 b8 64 24 	vfmadd231ps 0x200(%rsp),%zmm22,%zmm12
    1217:	08
    1218:	62 e2 6d 40 7f dc    	vpermt2ps %zmm4,%zmm18,%zmm19
    121e:	62 72 05 40 b8 64 24 	vfmadd231ps 0x280(%rsp),%zmm31,%zmm12
    1225:	0a
    1226:	62 a3 95 40 23 e4 e4 	vshuff64x2 $0xe4,%zmm20,%zmm29,%zmm20
    122d:	62 61 7c 48 28 e8    	vmovaps %zmm0,%zmm29
    1233:	62 63 8d 40 23 f3 e4 	vshuff64x2 $0xe4,%zmm3,%zmm30,%zmm30
    123a:	62 d1 34 48 59 da    	vmulps %zmm10,%zmm9,%zmm3
    1240:	62 71 7c 48 28 d2    	vmovaps %zmm2,%zmm10
    1246:	62 b2 05 48 7f d0    	vpermt2ps %zmm16,%zmm15,%zmm2
    124c:	62 62 15 48 7f ec    	vpermt2ps %zmm4,%zmm13,%zmm29
    1252:	62 f1 7c 48 29 5c 24 	vmovaps %zmm3,0x540(%rsp)
    1259:	15
    125a:	62 f1 7c 48 28 dd    	vmovaps %zmm5,%zmm3
    1260:	62 32 0d 48 7f d0    	vpermt2ps %zmm16,%zmm14,%zmm10
    1266:	62 e1 7c 48 28 44 24 	vmovaps 0x700(%rsp),%zmm16
    126d:	1c
    126e:	62 d2 05 48 7f e8    	vpermt2ps %zmm8,%zmm15,%zmm5
    1274:	62 f3 e5 40 23 ff e4 	vshuff64x2 $0xe4,%zmm7,%zmm19,%zmm7
    127b:	62 71 7c 48 29 64 24 	vmovaps %zmm12,0x100(%rsp)
    1282:	04
    1283:	62 d2 0d 48 7f d8    	vpermt2ps %zmm8,%zmm14,%zmm3
    1289:	62 71 7c 48 28 c1    	vmovaps %zmm1,%zmm8
    128f:	62 f2 05 48 7f ce    	vpermt2ps %zmm6,%zmm15,%zmm1
    1295:	62 72 0d 48 7f c6    	vpermt2ps %zmm6,%zmm14,%zmm8
    129b:	62 43 95 40 23 eb e4 	vshuff64x2 $0xe4,%zmm11,%zmm29,%zmm29
    12a2:	62 71 7c 48 28 d8    	vmovaps %zmm0,%zmm11
    12a8:	62 f2 05 48 7f c4    	vpermt2ps %zmm4,%zmm15,%zmm0
    12ae:	62 f3 ed 48 23 ed e4 	vshuff64x2 $0xe4,%zmm5,%zmm2,%zmm5
    12b5:	62 b1 7c 48 28 d5    	vmovaps %zmm21,%zmm2
    12bb:	62 73 ad 48 23 cb e4 	vshuff64x2 $0xe4,%zmm3,%zmm10,%zmm9
    12c2:	62 f1 7c 48 28 5c 24 	vmovaps 0x240(%rsp),%zmm3
    12c9:	09
    12ca:	62 b1 0c 40 5c f0    	vsubps %zmm16,%zmm30,%zmm6
    12d0:	62 21 7c 48 28 f0    	vmovaps %zmm16,%zmm30
    12d6:	62 72 0d 48 7f dc    	vpermt2ps %zmm4,%zmm14,%zmm11
    12dc:	62 f1 7c 48 28 64 24 	vmovaps 0x80(%rsp),%zmm4
    12e3:	02
    12e4:	62 71 14 40 5c d2    	vsubps %zmm2,%zmm29,%zmm10
    12ea:	62 71 7c 48 29 54 24 	vmovaps %zmm10,0x180(%rsp)
    12f1:	06
    12f2:	62 e3 fd 48 23 d9 e4 	vshuff64x2 $0xe4,%zmm1,%zmm0,%zmm19
    12f9:	62 b1 44 48 5c c0    	vsubps %zmm16,%zmm7,%zmm0
    12ff:	62 e1 7c 48 28 44 24 	vmovaps 0x600(%rsp),%zmm16
    1306:	18
    1307:	62 53 a5 48 23 c0 e4 	vshuff64x2 $0xe4,%zmm8,%zmm11,%zmm8
    130e:	62 71 7c 48 28 5c 24 	vmovaps 0x140(%rsp),%zmm11
    1315:	05
    1316:	62 f1 7c 48 29 44 24 	vmovaps %zmm0,0x2c0(%rsp)
    131d:	0b
    131e:	62 92 1d 40 b8 d9    	vfmadd231ps %zmm25,%zmm28,%zmm3
    1324:	62 21 5c 40 5c cd    	vsubps %zmm21,%zmm20,%zmm25
    132a:	62 e1 7c 48 28 64 24 	vmovaps 0x680(%rsp),%zmm20
    1331:	1a
    1332:	62 92 4d 40 b8 e0    	vfmadd231ps %zmm24,%zmm22,%zmm4
    1338:	62 e1 7c 48 28 6c 24 	vmovaps 0x1c0(%rsp),%zmm21
    133f:	07
    1340:	62 01 7c 48 28 44 2e 	vmovaps 0xfc0(%r14,%r13,1),%zmm24
    1347:	3f
    1348:	62 b2 25 40 b8 d9    	vfmadd231ps %zmm17,%zmm27,%zmm3
    134e:	62 61 7c 48 29 4c 24 	vmovaps %zmm25,0x300(%rsp)
    1355:	0c
    1356:	62 f2 05 40 b8 64 24 	vfmadd231ps 0x3c0(%rsp),%zmm31,%zmm4
    135d:	0f
    135e:	62 f1 7c 48 29 64 24 	vmovaps %zmm4,0x80(%rsp)
    1365:	02
    1366:	62 b2 7d 48 a8 c0    	vfmadd213ps %zmm16,%zmm0,%zmm0
    136c:	62 d2 2d 48 b8 c2    	vfmadd231ps %zmm10,%zmm10,%zmm0
    1372:	62 71 7c 48 28 d3    	vmovaps %zmm3,%zmm10
    1378:	62 f1 7c 48 28 5c 24 	vmovaps 0xc0(%rsp),%zmm3
    137f:	03
    1380:	62 72 1d 40 b8 5c 24 	vfmadd231ps 0x580(%rsp),%zmm28,%zmm11
    1387:	16
    1388:	62 f1 7c 48 29 74 24 	vmovaps %zmm6,0x580(%rsp)
    138f:	16
    1390:	62 b2 4d 48 a8 f0    	vfmadd213ps %zmm16,%zmm6,%zmm6
    1396:	62 b1 34 48 5c fc    	vsubps %zmm20,%zmm9,%zmm7
    139c:	62 92 35 40 b8 f1    	vfmadd231ps %zmm25,%zmm25,%zmm6
    13a2:	62 01 7c 48 28 4c 2e 	vmovaps 0xf80(%r14,%r13,1),%zmm25
    13a9:	3e
    13aa:	62 31 3c 48 5c c4    	vsubps %zmm20,%zmm8,%zmm8
    13b0:	62 11 7c 48 28 4c 2e 	vmovaps 0xf40(%r14,%r13,1),%zmm9
    13b7:	3d
    13b8:	62 e2 1d 40 b8 6c 24 	vfmadd231ps 0x5c0(%rsp),%zmm28,%zmm21
    13bf:	17
    13c0:	62 f2 45 48 b8 f7    	vfmadd231ps %zmm7,%zmm7,%zmm6
    13c6:	62 f1 7c 48 29 7c 24 	vmovaps %zmm7,0x240(%rsp)
    13cd:	09
    13ce:	62 d2 3d 48 b8 c0    	vfmadd231ps %zmm8,%zmm8,%zmm0
    13d4:	62 71 7c 48 29 44 24 	vmovaps %zmm8,0x1c0(%rsp)
    13db:	07
    13dc:	62 f2 7d 48 4e fe    	vrsqrt14ps %zmm6,%zmm7
    13e2:	62 f2 7d 48 4e d0    	vrsqrt14ps %zmm0,%zmm2
    13e8:	62 f1 4c 48 59 cf    	vmulps %zmm7,%zmm6,%zmm1
    13ee:	62 f1 7c 48 59 f2    	vmulps %zmm2,%zmm0,%zmm6
    13f4:	62 72 25 40 b8 5c 24 	vfmadd231ps 0x340(%rsp),%zmm27,%zmm11
    13fb:	0d
    13fc:	62 92 45 48 a8 ca    	vfmadd213ps %zmm26,%zmm7,%zmm1
    1402:	62 b1 44 48 59 ff    	vmulps %zmm23,%zmm7,%zmm7
    1408:	62 92 6d 48 a8 f2    	vfmadd213ps %zmm26,%zmm2,%zmm6
    140e:	62 e2 25 40 b8 6c 24 	vfmadd231ps 0x440(%rsp),%zmm27,%zmm21
    1415:	11
    1416:	62 f1 44 48 59 c9    	vmulps %zmm1,%zmm7,%zmm1
    141c:	62 b1 6c 48 59 ff    	vmulps %zmm23,%zmm2,%zmm7
    1422:	62 f1 44 48 59 f6    	vmulps %zmm6,%zmm7,%zmm6
    1428:	62 f1 74 48 59 c1    	vmulps %zmm1,%zmm1,%zmm0
    142e:	62 f1 54 48 59 c9    	vmulps %zmm1,%zmm5,%zmm1
    1434:	62 91 7c 48 28 7c 2e 	vmovaps 0xe80(%r14,%r13,1),%zmm7
    143b:	3a
    143c:	62 91 7c 48 28 6c 2e 	vmovaps 0xe00(%r14,%r13,1),%zmm5
    1443:	38
    1444:	62 f2 4d 40 b8 5c 24 	vfmadd231ps 0x380(%rsp),%zmm22,%zmm3
    144b:	0e
    144c:	62 81 7c 48 28 74 2e 	vmovaps 0xf00(%r14,%r13,1),%zmm22
    1453:	3c
    1454:	62 e1 7c 48 59 c9    	vmulps %zmm1,%zmm0,%zmm17
    145a:	62 f1 64 40 59 c6    	vmulps %zmm6,%zmm19,%zmm0
    1460:	62 71 4c 48 59 c6    	vmulps %zmm6,%zmm6,%zmm8
    1466:	62 91 7c 48 28 4c 2e 	vmovaps 0xe40(%r14,%r13,1),%zmm1
    146d:	39
    146e:	62 91 7c 48 28 74 2e 	vmovaps 0xec0(%r14,%r13,1),%zmm6
    1475:	3b
    1476:	62 71 7c 48 29 5c 24 	vmovaps %zmm11,0x140(%rsp)
    147d:	05
    147e:	62 f1 3c 48 59 d0    	vmulps %zmm0,%zmm8,%zmm2
    1484:	62 11 7c 48 28 c1    	vmovaps %zmm25,%zmm8
    148a:	62 12 6d 40 7f c0    	vpermt2ps %zmm24,%zmm18,%zmm8
    1490:	62 f2 05 40 b8 5c 24 	vfmadd231ps 0x480(%rsp),%zmm31,%zmm3
    1497:	12
    1498:	62 61 7c 48 28 df    	vmovaps %zmm7,%zmm27
    149e:	62 f1 7c 48 28 c5    	vmovaps %zmm5,%zmm0
    14a4:	62 71 7c 48 28 df    	vmovaps %zmm7,%zmm11
    14aa:	62 71 7c 48 28 e7    	vmovaps %zmm7,%zmm12
    14b0:	62 e1 7c 48 28 dd    	vmovaps %zmm5,%zmm19
    14b6:	62 21 7c 48 28 e6    	vmovaps %zmm22,%zmm28
    14bc:	62 21 7c 48 28 ee    	vmovaps %zmm22,%zmm29
    14c2:	62 62 6d 40 7f de    	vpermt2ps %zmm6,%zmm18,%zmm27
    14c8:	62 f2 6d 40 7f c1    	vpermt2ps %zmm1,%zmm18,%zmm0
    14ce:	62 72 15 48 7f de    	vpermt2ps %zmm6,%zmm13,%zmm11
    14d4:	62 72 0d 48 7f e6    	vpermt2ps %zmm6,%zmm14,%zmm12
    14da:	62 e2 0d 48 7f d9    	vpermt2ps %zmm1,%zmm14,%zmm19
    14e0:	62 f2 05 48 7f fe    	vpermt2ps %zmm6,%zmm15,%zmm7
    14e6:	62 42 6d 40 7f e1    	vpermt2ps %zmm9,%zmm18,%zmm28
    14ec:	62 81 7c 48 28 d1    	vmovaps %zmm25,%zmm18
    14f2:	62 42 15 48 7f e9    	vpermt2ps %zmm9,%zmm13,%zmm29
    14f8:	62 82 15 48 7f d0    	vpermt2ps %zmm24,%zmm13,%zmm18
    14fe:	62 93 fd 48 23 e3 e4 	vshuff64x2 $0xe4,%zmm27,%zmm0,%zmm4
    1505:	62 f1 7c 48 28 44 24 	vmovaps 0x540(%rsp),%zmm0
    150c:	15
    150d:	62 53 e5 40 23 e4 e4 	vshuff64x2 $0xe4,%zmm12,%zmm19,%zmm12
    1514:	62 81 7c 48 28 d9    	vmovaps %zmm25,%zmm19
    151a:	62 02 05 48 7f c8    	vpermt2ps %zmm24,%zmm15,%zmm25
    1520:	62 53 9d 40 23 c0 e4 	vshuff64x2 $0xe4,%zmm8,%zmm28,%zmm8
    1527:	62 61 7c 48 28 e5    	vmovaps %zmm5,%zmm28
    152d:	62 f2 05 48 7f e9    	vpermt2ps %zmm1,%zmm15,%zmm5
    1533:	62 82 0d 48 7f d8    	vpermt2ps %zmm24,%zmm14,%zmm19
    1539:	62 62 15 48 7f e1    	vpermt2ps %zmm1,%zmm13,%zmm28
    153f:	62 72 7d 48 b8 54 24 	vfmadd231ps 0x400(%rsp),%zmm0,%zmm10
    1546:	10
    1547:	62 e2 7d 48 b8 6c 24 	vfmadd231ps 0x4c0(%rsp),%zmm0,%zmm21
    154e:	13
    154f:	62 f3 d5 48 23 cf e4 	vshuff64x2 $0xe4,%zmm7,%zmm5,%zmm1
    1556:	62 91 3c 48 5c ee    	vsubps %zmm30,%zmm8,%zmm5
    155c:	62 31 1c 48 5c c4    	vsubps %zmm20,%zmm12,%zmm8
    1562:	62 43 9d 40 23 db e4 	vshuff64x2 $0xe4,%zmm11,%zmm28,%zmm27
    1569:	62 23 95 40 23 e2 e4 	vshuff64x2 $0xe4,%zmm18,%zmm29,%zmm28
    1570:	62 e1 7c 48 28 54 24 	vmovaps 0x140(%rsp),%zmm18
    1577:	05
    1578:	62 71 7c 48 28 db    	vmovaps %zmm3,%zmm11
    157e:	62 b1 7c 48 28 de    	vmovaps %zmm22,%zmm3
    1584:	62 c2 05 48 7f f1    	vpermt2ps %zmm9,%zmm15,%zmm22
    158a:	62 d2 0d 48 7f d9    	vpermt2ps %zmm9,%zmm14,%zmm3
    1590:	62 71 7c 48 28 74 24 	vmovaps 0x80(%rsp),%zmm14
    1597:	02
    1598:	62 51 7c 48 28 eb    	vmovaps %zmm11,%zmm13
    159e:	62 71 7c 48 28 5c 24 	vmovaps 0x100(%rsp),%zmm11
    15a5:	04
    15a6:	62 72 75 40 b8 6c 24 	vfmadd231ps 0x300(%rsp),%zmm17,%zmm13
    15ad:	0c
    15ae:	62 72 6d 48 b8 54 24 	vfmadd231ps 0x2c0(%rsp),%zmm2,%zmm10
    15b5:	0b
    15b6:	62 e2 6d 48 b8 6c 24 	vfmadd231ps 0x1c0(%rsp),%zmm2,%zmm21
    15bd:	07
    15be:	62 e2 7d 48 b8 54 24 	vfmadd231ps 0x500(%rsp),%zmm0,%zmm18
    15c5:	14
    15c6:	62 91 5c 48 5c c6    	vsubps %zmm30,%zmm4,%zmm0
    15cc:	62 f1 7c 48 28 64 24 	vmovaps 0x6c0(%rsp),%zmm4
    15d3:	1b
    15d4:	62 a3 e5 48 23 db e4 	vshuff64x2 $0xe4,%zmm19,%zmm3,%zmm19
    15db:	62 93 cd 40 23 d9 e4 	vshuff64x2 $0xe4,%zmm25,%zmm22,%zmm3
    15e2:	62 71 7c 48 28 e0    	vmovaps %zmm0,%zmm12
    15e8:	62 72 75 40 b8 74 24 	vfmadd231ps 0x580(%rsp),%zmm17,%zmm14
    15ef:	16
    15f0:	62 72 75 40 b8 5c 24 	vfmadd231ps 0x240(%rsp),%zmm17,%zmm11
    15f7:	09
    15f8:	62 32 7d 48 a8 e0    	vfmadd213ps %zmm16,%zmm0,%zmm12
    15fe:	62 31 64 40 5c cc    	vsubps %zmm20,%zmm19,%zmm9
    1604:	62 e2 6d 48 b8 54 24 	vfmadd231ps 0x180(%rsp),%zmm2,%zmm18
    160b:	06
    160c:	62 f1 24 40 5c f4    	vsubps %zmm4,%zmm27,%zmm6
    1612:	62 f1 1c 40 5c fc    	vsubps %zmm4,%zmm28,%zmm7
    1618:	62 f1 7c 48 28 e5    	vmovaps %zmm5,%zmm4
    161e:	62 b2 55 48 a8 e0    	vfmadd213ps %zmm16,%zmm5,%zmm4
    1624:	62 72 4d 48 b8 e6    	vfmadd231ps %zmm6,%zmm6,%zmm12
    162a:	62 f2 45 48 b8 e7    	vfmadd231ps %zmm7,%zmm7,%zmm4
    1630:	62 52 3d 48 b8 e0    	vfmadd231ps %zmm8,%zmm8,%zmm12
    1636:	62 d2 35 48 b8 e1    	vfmadd231ps %zmm9,%zmm9,%zmm4
    163c:	62 c2 7d 48 4e e4    	vrsqrt14ps %zmm12,%zmm20
    1642:	62 e2 7d 48 4e c4    	vrsqrt14ps %zmm4,%zmm16
    1648:	62 31 1c 48 59 e4    	vmulps %zmm20,%zmm12,%zmm12
    164e:	62 a1 5c 48 59 d8    	vmulps %zmm16,%zmm4,%zmm19
    1654:	62 12 5d 40 a8 e2    	vfmadd213ps %zmm26,%zmm20,%zmm12
    165a:	62 a1 5c 40 59 e7    	vmulps %zmm23,%zmm20,%zmm20
    1660:	62 b1 7c 40 59 e7    	vmulps %zmm23,%zmm16,%zmm4
    1666:	62 82 7d 40 a8 da    	vfmadd213ps %zmm26,%zmm16,%zmm19
    166c:	62 51 5c 40 59 e4    	vmulps %zmm12,%zmm20,%zmm12
    1672:	62 b1 5c 48 59 e3    	vmulps %zmm19,%zmm4,%zmm4
    1678:	62 c1 1c 48 59 c4    	vmulps %zmm12,%zmm12,%zmm16
    167e:	62 d1 74 48 59 cc    	vmulps %zmm12,%zmm1,%zmm1
    1684:	62 f1 64 48 59 d4    	vmulps %zmm4,%zmm3,%zmm2
    168a:	62 e1 5c 48 59 cc    	vmulps %zmm4,%zmm4,%zmm17
    1690:	62 f1 7c 40 59 c9    	vmulps %zmm1,%zmm16,%zmm1
    1696:	62 d1 7c 48 28 de    	vmovaps %zmm14,%zmm3
    169c:	62 b1 7c 48 28 e2    	vmovaps %zmm18,%zmm4
    16a2:	62 f1 74 40 59 d2    	vmulps %zmm2,%zmm17,%zmm2
    16a8:	62 f2 75 48 b8 d8    	vfmadd231ps %zmm0,%zmm1,%zmm3
    16ae:	62 72 6d 48 b8 d5    	vfmadd231ps %zmm5,%zmm2,%zmm10
    16b4:	62 f2 6d 48 b8 e7    	vfmadd231ps %zmm7,%zmm2,%zmm4
    16ba:	62 c2 6d 48 b8 e9    	vfmadd231ps %zmm9,%zmm2,%zmm21
    16c0:	62 f1 2c 48 58 c3    	vaddps %zmm3,%zmm10,%zmm0
    16c6:	62 d1 7c 48 28 dd    	vmovaps %zmm13,%zmm3
    16cc:	62 f3 fd 48 1b c2 01 	vextractf64x4 $0x1,%zmm0,%ymm2
    16d3:	62 f2 75 48 b8 de    	vfmadd231ps %zmm6,%zmm1,%zmm3
    16d9:	62 f1 5c 48 58 db    	vaddps %zmm3,%zmm4,%zmm3
    16df:	62 d1 7c 48 28 e3    	vmovaps %zmm11,%zmm4
    16e5:	62 f1 7c 48 58 c2    	vaddps %zmm2,%zmm0,%zmm0
    16eb:	62 d2 75 48 b8 e0    	vfmadd231ps %zmm8,%zmm1,%zmm4
    16f1:	c4 e3 7d 19 c2 01    	vextractf128 $0x1,%ymm0,%xmm2
    16f7:	62 f1 54 40 58 cc    	vaddps %zmm4,%zmm21,%zmm1
    16fd:	62 f3 fd 48 1b dc 01 	vextractf64x4 $0x1,%zmm3,%ymm4
    1704:	c5 f8 58 c2          	vaddps %xmm2,%xmm0,%xmm0
    1708:	62 f1 64 48 58 dc    	vaddps %zmm4,%zmm3,%zmm3
    170e:	c4 e3 7d 19 dc 01    	vextractf128 $0x1,%ymm3,%xmm4
    1714:	c4 e3 79 05 d0 01    	vpermilpd $0x1,%xmm0,%xmm2
    171a:	c5 e0 58 dc          	vaddps %xmm4,%xmm3,%xmm3
    171e:	c5 f8 58 c2          	vaddps %xmm2,%xmm0,%xmm0
    1722:	c5 fa 16 d0          	vmovshdup %xmm0,%xmm2
    1726:	c5 fa 58 c2          	vaddss %xmm2,%xmm0,%xmm0
    172a:	c5 f8 29 84 24 00 01 	vmovaps %xmm0,0x100(%rsp)
    1731:	00 00
    1733:	c4 e3 79 05 c3 01    	vpermilpd $0x1,%xmm3,%xmm0
    1739:	c5 e0 58 c0          	vaddps %xmm0,%xmm3,%xmm0
    173d:	62 f3 fd 48 1b cb 01 	vextractf64x4 $0x1,%zmm1,%ymm3
    1744:	62 f1 74 48 58 cb    	vaddps %zmm3,%zmm1,%zmm1
    174a:	c5 fa 16 d0          	vmovshdup %xmm0,%xmm2
    174e:	c5 fa 58 c2          	vaddss %xmm2,%xmm0,%xmm0
    1752:	c5 f8 29 84 24 c0 00 	vmovaps %xmm0,0xc0(%rsp)
    1759:	00 00
    175b:	c4 e3 7d 19 c8 01    	vextractf128 $0x1,%ymm1,%xmm0
    1761:	c5 f0 58 c0          	vaddps %xmm0,%xmm1,%xmm0
    1765:	c4 e3 79 05 c8 01    	vpermilpd $0x1,%xmm0,%xmm1
    176b:	c5 f8 58 c1          	vaddps %xmm1,%xmm0,%xmm0
    176f:	c5 fa 16 c8          	vmovshdup %xmm0,%xmm1
    1773:	c5 fa 58 c1          	vaddss %xmm1,%xmm0,%xmm0
    1777:	c5 f8 29 84 24 80 00 	vmovaps %xmm0,0x80(%rsp)
    177e:	00 00
    1780:	c5 f8 77             	vzeroupper
    1783:	41 ff d4             	call   *%r12
    1786:	41 81 c7 00 01 00 00 	add    $0x100,%r15d
    178d:	41 39 df             	cmp    %ebx,%r15d
    1790:	0f 82 5a ea ff ff    	jb     1f0 <simplified_nbody+0x1f0>
    1796:	eb 27                	jmp    17bf <simplified_nbody+0x17bf>
    1798:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
    179c:	c5 f8 29 84 24 00 01 	vmovaps %xmm0,0x100(%rsp)
    17a3:	00 00
    17a5:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
    17a9:	c5 f8 29 84 24 c0 00 	vmovaps %xmm0,0xc0(%rsp)
    17b0:	00 00
    17b2:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
    17b6:	c5 f8 29 84 24 80 00 	vmovaps %xmm0,0x80(%rsp)
    17bd:	00 00
    17bf:	48 8b 44 24 50       	mov    0x50(%rsp),%rax
    17c4:	c5 f8 28 94 24 00 01 	vmovaps 0x100(%rsp),%xmm2
    17cb:	00 00
    17cd:	c5 f8 28 9c 24 c0 00 	vmovaps 0xc0(%rsp),%xmm3
    17d4:	00 00
    17d6:	c5 f8 28 a4 24 80 00 	vmovaps 0x80(%rsp),%xmm4
    17dd:	00 00
    17df:	48 8b 4c 24 58       	mov    0x58(%rsp),%rcx
    17e4:	c5 fa 10 00          	vmovss (%rax),%xmm0
    17e8:	48 b8 00 00 00 00 00 	movabs $0x0,%rax
    17ef:	00 00 00
    17f2:	c4 c1 7a 10 0c 06    	vmovss (%r14,%rax,1),%xmm1
    17f8:	48 8b 44 24 68       	mov    0x68(%rsp),%rax
    17fd:	c4 e2 79 a9 54 24 3c 	vfmadd213ss 0x3c(%rsp),%xmm0,%xmm2
    1804:	c4 e2 79 a9 5c 24 40 	vfmadd213ss 0x40(%rsp),%xmm0,%xmm3
    180b:	c4 e2 79 a9 64 24 44 	vfmadd213ss 0x44(%rsp),%xmm0,%xmm4
    1812:	c5 ea 59 d1          	vmulss %xmm1,%xmm2,%xmm2
    1816:	c5 e2 59 d9          	vmulss %xmm1,%xmm3,%xmm3
    181a:	c5 da 59 c9          	vmulss %xmm1,%xmm4,%xmm1
    181e:	c5 fa 10 24 08       	vmovss (%rax,%rcx,1),%xmm4
    1823:	c4 e2 69 b9 e0       	vfmadd231ss %xmm0,%xmm2,%xmm4
    1828:	c5 fa 11 24 08       	vmovss %xmm4,(%rax,%rcx,1)
    182d:	c5 fa 10 64 08 04    	vmovss 0x4(%rax,%rcx,1),%xmm4
    1833:	c4 e2 61 b9 e0       	vfmadd231ss %xmm0,%xmm3,%xmm4
    1838:	c5 fa 11 64 08 04    	vmovss %xmm4,0x4(%rax,%rcx,1)
    183e:	c4 e2 71 a9 44 08 08 	vfmadd213ss 0x8(%rax,%rcx,1),%xmm1,%xmm0
    1845:	c5 fa 11 44 08 08    	vmovss %xmm0,0x8(%rax,%rcx,1)
    184b:	48 8b 4c 24 60       	mov    0x60(%rsp),%rcx
    1850:	c5 fa 11 11          	vmovss %xmm2,(%rcx)
    1854:	c5 fa 11 59 04       	vmovss %xmm3,0x4(%rcx)
    1859:	c5 fa 11 49 08       	vmovss %xmm1,0x8(%rcx)
    185e:	48 8d 65 d8          	lea    -0x28(%rbp),%rsp
    1862:	5b                   	pop    %rbx
    1863:	41 5c                	pop    %r12
    1865:	41 5d                	pop    %r13
    1867:	41 5e                	pop    %r14
    1869:	41 5f                	pop    %r15
    186b:	5d                   	pop    %rbp
    186c:	c3                   	ret

Host-Compute (ARM CPU)

nbody_aarch64.bin:	file format elf64-littleaarch64


Disassembly of section .text:

0000000000000000 <simplified_nbody>:
   0:	d104c3ff 	sub	sp, sp, #0x130
   4:	90000008 	adrp	x8, 0 <floor_global_idx>
   8:	6d0a33ed 	stp	d13, d12, [sp, #160]
   c:	6d0b2beb 	stp	d11, d10, [sp, #176]
  10:	6d0c23e9 	stp	d9, d8, [sp, #192]
  14:	a90d7bfd 	stp	x29, x30, [sp, #208]
  18:	910343fd 	add	x29, sp, #0xd0
  1c:	a90e6ffc 	stp	x28, x27, [sp, #224]
  20:	a90f67fa 	stp	x26, x25, [sp, #240]
  24:	a9105ff8 	stp	x24, x23, [sp, #256]
  28:	a91157f6 	stp	x22, x21, [sp, #272]
  2c:	a9124ff4 	stp	x20, x19, [sp, #288]
  30:	f9400108 	ldr	x8, [x8]
  34:	b9400117 	ldr	w23, [x8]
  38:	52800188 	mov	w8, #0xc                   	// #12
  3c:	9b080af6 	madd	x22, x23, x8, x2
  40:	90000008 	adrp	x8, 0 <floor_global_work_size>
  44:	aa1603f8 	mov	x24, x22
  48:	f9400108 	ldr	x8, [x8]
  4c:	fd4002c8 	ldr	d8, [x22]
  50:	bc408f09 	ldr	s9, [x24, #8]!
  54:	b9400119 	ldr	w25, [x8]
  58:	34000d79 	cbz	w25, 204 <simplified_nbody+0x204>
  5c:	2f00e403 	movi	d3, #0x0
  60:	8b171008 	add	x8, x0, x23, lsl #4
  64:	9000001c 	adrp	x28, 0 <floor_local_idx>
  68:	a90007e3 	stp	x3, x1, [sp]
  6c:	90000013 	adrp	x19, 0 <simplified_nbody>
  70:	90000014 	adrp	x20, 0 <host_compute_device_barrier>
  74:	aa0003f5 	mov	x21, x0
  78:	2a1f03fa 	mov	w26, wzr
  7c:	f940039c 	ldr	x28, [x28]
  80:	3c9a03a3 	stur	q3, [x29, #-96]
  84:	2d400500 	ldp	s0, s1, [x8]
  88:	bd400902 	ldr	s2, [x8, #8]
  8c:	5296e2e8 	mov	w8, #0xb717                	// #46871
  90:	4f03f603 	fmov	v3.4s, #1.000000000000000000e+00
  94:	72a71a28 	movk	w8, #0x38d1, lsl #16
  98:	2a1f03fb 	mov	w27, wzr
  9c:	3d8017e3 	str	q3, [sp, #80]
  a0:	4e040403 	dup	v3.4s, v0.s[0]
  a4:	4e040d00 	dup	v0.4s, w8
  a8:	f9400273 	ldr	x19, [x19]
  ac:	ad018fe0 	stp	q0, q3, [sp, #48]
  b0:	2f00e400 	movi	d0, #0x0
  b4:	4e040423 	dup	v3.4s, v1.s[0]
  b8:	3d801be0 	str	q0, [sp, #96]
  bc:	2f00e400 	movi	d0, #0x0
  c0:	3c9b03a0 	stur	q0, [x29, #-80]
  c4:	4e040440 	dup	v0.4s, v2.s[0]
  c8:	f9400294 	ldr	x20, [x20]
  cc:	ad008fe0 	stp	q0, q3, [sp, #16]
  d0:	b9400388 	ldr	w8, [x28]
  d4:	0b1b2109 	add	w9, w8, w27, lsl #8
  d8:	3ce95aa0 	ldr	q0, [x21, w9, uxtw #4]
  dc:	3ca87a60 	str	q0, [x19, x8, lsl #4]
  e0:	d63f0280 	blr	x20
  e4:	6f00e400 	movi	v0.2d, #0x0
  e8:	3cda03a4 	ldur	q4, [x29, #-96]
  ec:	6f00e402 	movi	v2.2d, #0x0
  f0:	aa1f03e8 	mov	x8, xzr
  f4:	6f00e403 	movi	v3.2d, #0x0
  f8:	ad41abeb 	ldp	q11, q10, [sp, #48]
  fc:	6e040480 	mov	v0.s[0], v4.s[0]
 100:	6f00e401 	movi	v1.2d, #0x0
 104:	6f00e405 	movi	v5.2d, #0x0
 108:	ad4293ff 	ldp	q31, q4, [sp, #80]
 10c:	6e040482 	mov	v2.s[0], v4.s[0]
 110:	3cdb03a4 	ldur	q4, [x29, #-80]
 114:	ad40b3ed 	ldp	q13, q12, [sp, #16]
 118:	6e040483 	mov	v3.s[0], v4.s[0]
 11c:	6f00e404 	movi	v4.2d, #0x0
 120:	8b080269 	add	x9, x19, x8
 124:	91020108 	add	x8, x8, #0x80
 128:	4eab1d67 	mov	v7.16b, v11.16b
 12c:	f140051f 	cmp	x8, #0x1, lsl #12
 130:	4eab1d7b 	mov	v27.16b, v11.16b
 134:	4cdf0930 	ld4	{v16.4s-v19.4s}, [x9], #64
 138:	4eaad606 	fsub	v6.4s, v16.4s, v10.4s
 13c:	4eacd638 	fsub	v24.4s, v17.4s, v12.4s
 140:	4eadd659 	fsub	v25.4s, v18.4s, v13.4s
 144:	4e26ccc7 	fmla	v7.4s, v6.4s, v6.4s
 148:	4e38cf07 	fmla	v7.4s, v24.4s, v24.4s
 14c:	4c400934 	ld4	{v20.4s-v23.4s}, [x9]
 150:	4e39cf27 	fmla	v7.4s, v25.4s, v25.4s
 154:	6ea1f8e7 	fsqrt	v7.4s, v7.4s
 158:	4eaad69a 	fsub	v26.4s, v20.4s, v10.4s
 15c:	4eacd6bc 	fsub	v28.4s, v21.4s, v12.4s
 160:	4eadd6dd 	fsub	v29.4s, v22.4s, v13.4s
 164:	4e3acf5b 	fmla	v27.4s, v26.4s, v26.4s
 168:	6e27ffe7 	fdiv	v7.4s, v31.4s, v7.4s
 16c:	4e3ccf9b 	fmla	v27.4s, v28.4s, v28.4s
 170:	4e3dcfbb 	fmla	v27.4s, v29.4s, v29.4s
 174:	6ea1fb7b 	fsqrt	v27.4s, v27.4s
 178:	6e27dcfe 	fmul	v30.4s, v7.4s, v7.4s
 17c:	6e27de67 	fmul	v7.4s, v19.4s, v7.4s
 180:	6e27dfc7 	fmul	v7.4s, v30.4s, v7.4s
 184:	6e3bfffb 	fdiv	v27.4s, v31.4s, v27.4s
 188:	4e26cce3 	fmla	v3.4s, v7.4s, v6.4s
 18c:	4e38cce2 	fmla	v2.4s, v7.4s, v24.4s
 190:	4e39cce0 	fmla	v0.4s, v7.4s, v25.4s
 194:	6e3bdf70 	fmul	v16.4s, v27.4s, v27.4s
 198:	6e3bdef1 	fmul	v17.4s, v23.4s, v27.4s
 19c:	6e31de10 	fmul	v16.4s, v16.4s, v17.4s
 1a0:	4e3ace05 	fmla	v5.4s, v16.4s, v26.4s
 1a4:	4e3cce04 	fmla	v4.4s, v16.4s, v28.4s
 1a8:	4e3dce01 	fmla	v1.4s, v16.4s, v29.4s
 1ac:	54fffba1 	b.ne	120 <simplified_nbody+0x120>  // b.any
 1b0:	4e23d4a3 	fadd	v3.4s, v5.4s, v3.4s
 1b4:	4e20d420 	fadd	v0.4s, v1.4s, v0.4s
 1b8:	4e22d482 	fadd	v2.4s, v4.4s, v2.4s
 1bc:	6e20d461 	faddp	v1.4s, v3.4s, v0.4s
 1c0:	6e20d442 	faddp	v2.4s, v2.4s, v0.4s
 1c4:	6e20d400 	faddp	v0.4s, v0.4s, v0.4s
 1c8:	7e30d821 	faddp	s1, v1.2s
 1cc:	7e30d800 	faddp	s0, v0.2s
 1d0:	ad3d07a0 	stp	q0, q1, [x29, #-96]
 1d4:	7e30d841 	faddp	s1, v2.2s
 1d8:	3d801be1 	str	q1, [sp, #96]
 1dc:	d63f0280 	blr	x20
 1e0:	1104035a 	add	w26, w26, #0x100
 1e4:	1100077b 	add	w27, w27, #0x1
 1e8:	6b19035f 	cmp	w26, w25
 1ec:	54fff723 	b.cc	d0 <simplified_nbody+0xd0>  // b.lo, b.ul, b.last
 1f0:	ad7d07a2 	ldp	q2, q1, [x29, #-96]
 1f4:	3dc01be0 	ldr	q0, [sp, #96]
 1f8:	a94007e3 	ldp	x3, x1, [sp]
 1fc:	6e0c0401 	mov	v1.s[1], v0.s[0]
 200:	14000003 	b	20c <simplified_nbody+0x20c>
 204:	2f00e401 	movi	d1, #0x0
 208:	2f00e402 	movi	d2, #0x0
 20c:	5297cee8 	mov	w8, #0xbe77                	// #48759
 210:	bd400060 	ldr	s0, [x3]
 214:	72a7efe8 	movk	w8, #0x3f7f, lsl #16
 218:	8b171029 	add	x9, x1, x23, lsl #4
 21c:	a9524ff4 	ldp	x20, x19, [sp, #288]
 220:	0f801028 	fmla	v8.2s, v1.2s, v0.s[0]
 224:	1f022402 	fmadd	s2, s0, s2, s9
 228:	0e040d01 	dup	v1.2s, w8
 22c:	1e270103 	fmov	s3, w8
 230:	fd400124 	ldr	d4, [x9]
 234:	a94f67fa 	ldp	x26, x25, [sp, #240]
 238:	1e230842 	fmul	s2, s2, s3
 23c:	2e21dd01 	fmul	v1.2s, v8.2s, v1.2s
 240:	bd400923 	ldr	s3, [x9, #8]
 244:	a94e6ffc 	ldp	x28, x27, [sp, #224]
 248:	bd000302 	str	s2, [x24]
 24c:	0f801024 	fmla	v4.2s, v1.2s, v0.s[0]
 250:	1f000c40 	fmadd	s0, s2, s0, s3
 254:	fd0002c1 	str	d1, [x22]
 258:	a95157f6 	ldp	x22, x21, [sp, #272]
 25c:	a9505ff8 	ldp	x24, x23, [sp, #256]
 260:	fd000124 	str	d4, [x9]
 264:	a94d7bfd 	ldp	x29, x30, [sp, #208]
 268:	bd000920 	str	s0, [x9, #8]
 26c:	6d4c23e9 	ldp	d9, d8, [sp, #192]
 270:	6d4b2beb 	ldp	d11, d10, [sp, #176]
 274:	6d4a33ed 	ldp	d13, d12, [sp, #160]
 278:	9104c3ff 	add	sp, sp, #0x130
 27c:	d65f03c0 	ret

Metal / AIR

Note that the compiler would usually directly output a .metallib file. The output below comes from disassembling it with metallib-dis (provided by the toolchain).

; ModuleID = 'bc_module'
source_filename = "simplified_nbody"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
target triple = "air64-apple-macosx14.0.0"

%class.vector4 = type { %union.anon }
%union.anon = type { %struct.anon }
%struct.anon = type { float, float, float, float }
%class.vector3 = type { %union.anon.8 }
%union.anon.8 = type { %struct.anon.9 }
%struct.anon.9 = type { float, float, float }

@_ZZ16simplified_nbodyE20local_body_positions = internal addrspace(3) unnamed_addr global [256 x %class.vector4] undef, align 16

; Function Attrs: nounwind
define void @simplified_nbody(%class.vector4 addrspace(1)* noalias nocapture readonly %0, %class.vector4 addrspace(1)* noalias nocapture %1, %class.vector3 addrspace(1)* noalias nocapture %2, float addrspace(2)* noalias nocapture readonly align 4 dereferenceable(4) %3, <3 x i32> %4, <3 x i32> %5, <3 x i32> %6, <3 x i32> %7, <3 x i32> %8, <3 x i32> %9, i32 %10, i32 %11, i32 %12, i32 %13) local_unnamed_addr #0 !reqd_work_group_size !33 !kernel_dim !34 {
  %15 = extractelement <3 x i32> %4, i64 0
  %16 = zext i32 %15 to i64
  %17 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %16, i32 0, i32 0, i32 0
  %18 = bitcast float addrspace(1)* %17 to <3 x float> addrspace(1)*
  %19 = load <3 x float>, <3 x float> addrspace(1)* %18, align 4
  %20 = extractelement <3 x float> %19, i64 0
  %21 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %16, i32 0, i32 0, i32 0
  %22 = bitcast float addrspace(1)* %21 to <3 x float> addrspace(1)*
  %23 = load <3 x float>, <3 x float> addrspace(1)* %22, align 4
  %24 = extractelement <3 x i32> %5, i64 0
  %25 = extractelement <3 x i32> %6, i64 0
  %26 = zext i32 %25 to i64
  %27 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %26, i32 0, i32 0, i32 0
  %28 = bitcast float addrspace(3)* %27 to <4 x float> addrspace(3)*
  %29 = shufflevector <3 x float> %19, <3 x float> undef, <2 x i32> <i32 1, i32 2>
  br label %57

30:                                               ; preds = %68
  %31 = extractelement <3 x float> %23, i64 0
  %32 = load float, float addrspace(2)* %3, align 4
  %33 = fmul fast float %32, %100
  %34 = insertelement <2 x float> undef, float %32, i64 0
  %35 = shufflevector <2 x float> %34, <2 x float> undef, <2 x i32> zeroinitializer
  %36 = fmul fast <2 x float> %35, %101
  %37 = fadd fast float %33, %31
  %38 = shufflevector <3 x float> %23, <3 x float> undef, <2 x i32> <i32 1, i32 2>
  %39 = fadd fast <2 x float> %36, %38
  %40 = fmul fast float %37, 0x3FEFF7CEE0000000
  %41 = fmul fast <2 x float> %39, <float 0x3FEFF7CEE0000000, float 0x3FEFF7CEE0000000>
  %42 = fmul fast float %40, %32
  %43 = fmul fast <2 x float> %41, %35
  %44 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %16, i32 0, i32 0, i32 0
  %45 = bitcast float addrspace(1)* %44 to <3 x float> addrspace(1)*
  %46 = load <3 x float>, <3 x float> addrspace(1)* %45, align 4, !tbaa !35
  %47 = extractelement <3 x float> %46, i64 0
  %48 = fadd fast float %42, %47
  %49 = shufflevector <3 x float> %46, <3 x float> undef, <2 x i32> <i32 1, i32 2>
  %50 = fadd fast <2 x float> %43, %49
  %51 = insertelement <3 x float> undef, float %48, i64 0
  %52 = shufflevector <2 x float> %50, <2 x float> undef, <3 x i32> <i32 0, i32 1, i32 undef>
  %53 = shufflevector <3 x float> %51, <3 x float> %52, <3 x i32> <i32 0, i32 3, i32 4>
  store <3 x float> %53, <3 x float> addrspace(1)* %45, align 4, !tbaa !35
  %54 = insertelement <3 x float> undef, float %40, i64 0
  %55 = shufflevector <2 x float> %41, <2 x float> undef, <3 x i32> <i32 0, i32 1, i32 undef>
  %56 = shufflevector <3 x float> %54, <3 x float> %55, <3 x i32> <i32 0, i32 3, i32 4>
  store <3 x float> %56, <3 x float> addrspace(1)* %22, align 4, !tbaa !35
  ret void

57:                                               ; preds = %68, %14
  %58 = phi i32 [ 0, %14 ], [ %69, %68 ]
  %59 = phi i32 [ 0, %14 ], [ %70, %68 ]
  %60 = phi float [ 0.000000e+00, %14 ], [ %100, %68 ]
  %61 = phi <2 x float> [ zeroinitializer, %14 ], [ %101, %68 ]
  %62 = shl i32 %59, 8
  %63 = add i32 %25, %62
  %64 = zext i32 %63 to i64
  %65 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %64, i32 0, i32 0, i32 0
  %66 = bitcast float addrspace(1)* %65 to <4 x float> addrspace(1)*
  %67 = load <4 x float>, <4 x float> addrspace(1)* %66, align 4
  store <4 x float> %67, <4 x float> addrspace(3)* %28, align 4, !tbaa !35
  tail call void @air.wg.barrier(i32 2, i32 1) #3
  br label %72

68:                                               ; preds = %72
  tail call void @air.wg.barrier(i32 2, i32 1) #3
  %69 = add i32 %58, 256
  %70 = add i32 %59, 1
  %71 = icmp ult i32 %69, %24
  br i1 %71, label %57, label %30, !llvm.loop !38

72:                                               ; preds = %72, %57
  %73 = phi i32 [ 0, %57 ], [ %102, %72 ]
  %74 = phi float [ %60, %57 ], [ %100, %72 ]
  %75 = phi <2 x float> [ %61, %57 ], [ %101, %72 ]
  %76 = zext i32 %73 to i64
  %77 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @_ZZ16simplified_nbodyE20local_body_positions, i64 0, i64 %76, i32 0, i32 0, i32 0
  %78 = bitcast float addrspace(3)* %77 to <4 x float> addrspace(3)*
  %79 = load <4 x float>, <4 x float> addrspace(3)* %78, align 4
  %80 = extractelement <4 x float> %79, i64 0
  %81 = extractelement <4 x float> %79, i64 3
  %82 = fsub fast float %80, %20
  %83 = shufflevector <4 x float> %79, <4 x float> undef, <2 x i32> <i32 1, i32 2>
  %84 = fsub fast <2 x float> %83, %29
  %85 = fmul fast float %82, %82
  %86 = fmul fast <2 x float> %84, %84
  %87 = extractelement <2 x float> %86, i64 0
  %88 = extractelement <2 x float> %86, i64 1
  %89 = fadd fast float %85, 0x3F1A36E2E0000000
  %90 = fadd fast float %89, %87
  %91 = fadd fast float %90, %88
  %92 = tail call fast float @air.fast_rsqrt.f32(float %91) #4
  %93 = fmul fast float %92, %92
  %94 = fmul fast float %93, %92
  %95 = fmul fast float %94, %81
  %96 = fmul fast float %95, %82
  %97 = insertelement <2 x float> undef, float %95, i64 0
  %98 = shufflevector <2 x float> %97, <2 x float> undef, <2 x i32> zeroinitializer
  %99 = fmul fast <2 x float> %98, %84
  %100 = fadd fast float %96, %74
  %101 = fadd fast <2 x float> %99, %75
  %102 = add nuw nsw i32 %73, 1
  %103 = icmp eq i32 %102, 256
  br i1 %103, label %68, label %72, !llvm.loop !40
}

; Function Attrs: nounwind readnone
declare float @air.fast_rsqrt.f32(float) local_unnamed_addr #1

; Function Attrs: convergent noduplicate
declare void @air.wg.barrier(i32, i32) local_unnamed_addr #2

attributes #0 = { nounwind "approx-func-fp-math"="true" "frame-pointer"="all" "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="true" }
attributes #1 = { nounwind readnone "approx-func-fp-math"="true" "frame-pointer"="all" "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" }
attributes #2 = { convergent noduplicate "approx-func-fp-math"="true" "frame-pointer"="all" "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" }
attributes #3 = { convergent noduplicate nounwind }
attributes #4 = { nounwind readnone }

!air.kernel = !{!0}
!air.version = !{!18}
!air.language_version = !{!19}
!air.compile_options = !{!20, !21, !22}
!llvm.module.flags = !{!23, !24, !25, !26, !27, !28, !29, !30, !31}
!llvm.ident = !{!32}

!0 = !{void (%class.vector4 addrspace(1)*, %class.vector4 addrspace(1)*, %class.vector3 addrspace(1)*, float addrspace(2)*, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, i32, i32, i32, i32)* @simplified_nbody, !1, !2, !17}
!1 = !{}
!2 = !{!3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16}
!3 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.read", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 16, !"air.arg_type_name", !"float4", !"air.arg_name", !"in_positions"}
!4 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 16, !"air.arg_type_name", !"float4", !"air.arg_name", !"out_positions"}
!5 = !{i32 2, !"air.buffer", !"air.location_index", i32 2, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 12, !"air.arg_type_align_size", i32 12, !"air.arg_type_name", !"float3", !"air.arg_name", !"inout_velocities"}
!6 = !{i32 3, !"air.buffer", !"air.buffer_size", i32 4, !"air.location_index", i32 3, i32 1, !"air.read", !"air.address_space", i32 2, !"air.arg_type_size", i32 4, !"air.arg_type_align_size", i32 4, !"air.arg_type_name", !"float", !"air.arg_name", !"time_delta"}
!7 = !{i32 4, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__global_id__"}
!8 = !{i32 5, !"air.threads_per_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__global_size__"}
!9 = !{i32 6, !"air.thread_position_in_threadgroup", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__local_id__"}
!10 = !{i32 7, !"air.threads_per_threadgroup", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__local_size__"}
!11 = !{i32 8, !"air.threadgroup_position_in_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__group_id__"}
!12 = !{i32 9, !"air.threadgroups_per_grid", !"air.arg_type_name", !"uint3", !"air.arg_name", !"__metal__group_size__"}
!13 = !{i32 10, !"air.simdgroup_index_in_threadgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__sub_group_id__"}
!14 = !{i32 11, !"air.thread_index_in_simdgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__sub_group_local_id__"}
!15 = !{i32 12, !"air.threads_per_simdgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__sub_group_size__"}
!16 = !{i32 13, !"air.simdgroups_per_threadgroup", !"air.arg_type_name", !"uint", !"air.arg_name", !"__metal__num_sub_groups__"}
!17 = !{!"air.max_work_group_size", i32 256}
!18 = !{i32 2, i32 6, i32 0}
!19 = !{!"Metal", i32 3, i32 1, i32 0}
!20 = !{!"air.compile.denorms_disable"}
!21 = !{!"air.compile.fast_math_enable"}
!22 = !{!"air.compile.framebuffer_fetch_enable"}
!23 = !{i32 7, !"air.max_device_buffers", i32 31}
!24 = !{i32 7, !"air.max_constant_buffers", i32 31}
!25 = !{i32 7, !"air.max_threadgroup_buffers", i32 31}
!26 = !{i32 7, !"air.max_textures", i32 128}
!27 = !{i32 7, !"air.max_read_write_textures", i32 8}
!28 = !{i32 7, !"air.max_samplers", i32 16}
!29 = !{i32 1, !"wchar_size", i32 4}
!30 = !{i32 7, !"frame-pointer", i32 2}
!31 = !{i32 2, !"SDK Version", [2 x i32] [i32 14, i32 0]}
!32 = !{!"Apple metal version 32023.155 (metalfe-32023.155)"}
!33 = !{i32 256, i32 1, i32 1}
!34 = !{i32 1}
!35 = !{!36, !36, i64 0}
!36 = !{!"omnipotent char", !37, i64 0}
!37 = !{!"Simple C++ TBAA"}
!38 = distinct !{!38, !39}
!39 = !{!"llvm.loop.mustprogress"}
!40 = distinct !{!40, !39}

OpenCL / SPIR

Note that the compiler would usually directly output a .bc file. The output below comes from disassembling it with llvm-dis (provided by the toolchain). Also note that the bitcode file is exported in an LLVM 3.2 / SPIR 1.2 compatible format, but the output below uses LLVM 14.0 syntax.

; ModuleID = 'spir.bc'
source_filename = "spir.bc"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
target triple = "spir64-unknown-unknown"

%class.vector4 = type { %union.anon }
%union.anon = type { %struct.anon }
%struct.anon = type { float, float, float, float }
%class.vector3 = type { %union.anon.8 }
%union.anon.8 = type { %struct.anon.9 }
%struct.anon.9 = type { float, float, float }

@simplified_nbody.local_body_positions = internal unnamed_addr addrspace(3) global [256 x %class.vector4] undef, align 4

define floor_kernel void @simplified_nbody(%class.vector4 addrspace(1)* %0, %class.vector4 addrspace(1)* %1, %class.vector3 addrspace(1)* %2, float %3) {
  %5 = tail call floor_func i64 @_Z13get_global_idj(i32 0), !range !14
  %6 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %5, i32 0, i32 0, i32 0
  %7 = load float, float addrspace(1)* %6, align 4
  %8 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %5, i32 0, i32 0, i32 1
  %9 = load float, float addrspace(1)* %8, align 4
  %10 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %5, i32 0, i32 0, i32 2
  %11 = load float, float addrspace(1)* %10, align 4
  %12 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %5, i32 0, i32 0, i32 0
  %13 = load float, float addrspace(1)* %12, align 4
  %14 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %5, i32 0, i32 0, i32 1
  %15 = load float, float addrspace(1)* %14, align 4
  %16 = getelementptr inbounds %class.vector3, %class.vector3 addrspace(1)* %2, i64 %5, i32 0, i32 0, i32 2
  %17 = load float, float addrspace(1)* %16, align 4
  %18 = tail call floor_func i64 @_Z15get_global_sizej(i32 0), !range !15
  %19 = trunc i64 %18 to i32, !range !16
  %20 = tail call floor_func i64 @_Z12get_local_idj(i32 0), !range !17
  %21 = trunc i64 %20 to i32, !range !18
  %22 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 0
  %23 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 1
  %24 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 2
  %25 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %20, i32 0, i32 0, i32 3
  br label %48

26:                                               ; preds = %65
  %27 = fmul float %98, %3
  %28 = fmul float %99, %3
  %29 = fmul float %100, %3
  %30 = fadd float %27, %13
  %31 = fadd float %28, %15
  %32 = fadd float %29, %17
  %33 = fmul float %30, 0x3FEFF7CEE0000000
  %34 = fmul float %31, 0x3FEFF7CEE0000000
  %35 = fmul float %32, 0x3FEFF7CEE0000000
  %36 = fmul float %33, %3
  %37 = fmul float %34, %3
  %38 = fmul float %35, %3
  %39 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %5, i32 0, i32 0, i32 0
  %40 = load float, float addrspace(1)* %39, align 4, !tbaa !19
  %41 = fadd float %40, %36
  store float %41, float addrspace(1)* %39, align 4, !tbaa !19
  %42 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %5, i32 0, i32 0, i32 1
  %43 = load float, float addrspace(1)* %42, align 4, !tbaa !19
  %44 = fadd float %43, %37
  store float %44, float addrspace(1)* %42, align 4, !tbaa !19
  %45 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %1, i64 %5, i32 0, i32 0, i32 2
  %46 = load float, float addrspace(1)* %45, align 4, !tbaa !19
  %47 = fadd float %46, %38
  store float %47, float addrspace(1)* %45, align 4, !tbaa !19
  store float %33, float addrspace(1)* %12, align 4, !tbaa !19
  store float %34, float addrspace(1)* %14, align 4, !tbaa !19
  store float %35, float addrspace(1)* %16, align 4, !tbaa !19
  ret void

48:                                               ; preds = %65, %4
  %49 = phi i32 [ 0, %4 ], [ %66, %65 ]
  %50 = phi i32 [ 0, %4 ], [ %67, %65 ]
  %51 = phi float [ 0.000000e+00, %4 ], [ %100, %65 ]
  %52 = phi float [ 0.000000e+00, %4 ], [ %99, %65 ]
  %53 = phi float [ 0.000000e+00, %4 ], [ %98, %65 ]
  %54 = shl i32 %50, 8
  %55 = add i32 %54, %21
  %56 = zext i32 %55 to i64
  %57 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 0
  %58 = load float, float addrspace(1)* %57, align 4
  %59 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 1
  %60 = load float, float addrspace(1)* %59, align 4
  %61 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 2
  %62 = load float, float addrspace(1)* %61, align 4
  %63 = getelementptr inbounds %class.vector4, %class.vector4 addrspace(1)* %0, i64 %56, i32 0, i32 0, i32 3
  %64 = load float, float addrspace(1)* %63, align 4
  store float %58, float addrspace(3)* %22, align 4, !tbaa !19
  store float %60, float addrspace(3)* %23, align 4, !tbaa !19
  store float %62, float addrspace(3)* %24, align 4, !tbaa !19
  store float %64, float addrspace(3)* %25, align 4, !tbaa !19
  tail call floor_func void @_Z7barrierj(i32 1)
  br label %69

65:                                               ; preds = %69
  tail call floor_func void @_Z7barrierj(i32 1)
  %66 = add i32 %49, 256
  %67 = add i32 %50, 1
  %68 = icmp ult i32 %66, %19
  br i1 %68, label %48, label %26, !llvm.loop !22

69:                                               ; preds = %69, %48
  %70 = phi i64 [ 0, %48 ], [ %101, %69 ]
  %71 = phi float [ %51, %48 ], [ %100, %69 ]
  %72 = phi float [ %52, %48 ], [ %99, %69 ]
  %73 = phi float [ %53, %48 ], [ %98, %69 ]
  %74 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 0
  %75 = load float, float addrspace(3)* %74, align 4
  %76 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 1
  %77 = load float, float addrspace(3)* %76, align 4
  %78 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 2
  %79 = load float, float addrspace(3)* %78, align 4
  %80 = fsub float %75, %7
  %81 = fsub float %77, %9
  %82 = fsub float %79, %11
  %83 = fmul float %80, %80
  %84 = fmul float %81, %81
  %85 = fmul float %82, %82
  %86 = fadd float %83, 0x3F1A36E2E0000000
  %87 = fadd float %86, %84
  %88 = fadd float %87, %85
  %89 = tail call floor_func float @_Z5rsqrtf(float %88)
  %90 = getelementptr inbounds [256 x %class.vector4], [256 x %class.vector4] addrspace(3)* @simplified_nbody.local_body_positions, i64 0, i64 %70, i32 0, i32 0, i32 3
  %91 = load float, float addrspace(3)* %90, align 4, !tbaa !19
  %92 = fmul float %89, %89
  %93 = fmul float %92, %89
  %94 = fmul float %93, %91
  %95 = fmul float %94, %80
  %96 = fmul float %94, %81
  %97 = fmul float %94, %82
  %98 = fadd float %95, %73
  %99 = fadd float %96, %72
  %100 = fadd float %97, %71
  %101 = add nuw nsw i64 %70, 1
  %102 = icmp eq i64 %101, 256
  br i1 %102, label %65, label %69, !llvm.loop !24
}

declare floor_func i64 @_Z13get_global_idj(i32)

declare floor_func i64 @_Z15get_global_sizej(i32)

declare floor_func i64 @_Z12get_local_idj(i32)

declare floor_func float @_Z5rsqrtf(float)

declare floor_func void @_Z7barrierj(i32)

!opencl.kernels = !{!0}
!llvm.linker.options = !{}
!llvm.module.flags = !{!7, !8}
!opencl.ocl.version = !{!9}
!opencl.spir.version = !{!9}
!opencl.enable.FP_CONTRACT = !{}
!opencl.used.extensions = !{!10}
!opencl.used.optional.core.features = !{!11}
!opencl.compiler.options = !{!12}
!llvm.ident = !{!13}

!0 = !{void (%class.vector4 addrspace(1)*, %class.vector4 addrspace(1)*, %class.vector3 addrspace(1)*, float)* @simplified_nbody, !1, !2, !3, !4, !5, !6}
!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 0}
!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"}
!3 = !{!"kernel_arg_type", !"compute_global_buffer<const float4>", !"compute_global_buffer<float4>", !"compute_global_buffer<float3>", !"param<float>"}
!4 = !{!"kernel_arg_base_type", !"struct __class vector4<float>*", !"struct __class vector4<float>*", !"struct __class vector3<float>*", !"float"}
!5 = !{!"kernel_arg_type_qual", !"restrict const", !"restrict", !"restrict", !"const"}
!6 = !{!"kernel_arg_name", !"in_positions", !"out_positions", !"inout_velocities", !"time_delta"}
!7 = !{i32 1, !"wchar_size", i32 4}
!8 = !{i32 7, !"frame-pointer", i32 2}
!9 = !{i32 1, i32 2}
!10 = !{!"cl_khr_byte_addressable_store", !"cl_khr_global_int32_base_atomics", !"cl_khr_global_int32_extended_atomics", !"cl_khr_local_int32_base_atomics", !"cl_khr_local_int32_extended_atomics", !"cl_khr_fp64", !"cl_khr_fp16", !"cl_khr_gl_msaa_sharing"}
!11 = !{!"cl_doubles"}
!12 = !{!"-cl-kernel-arg-info", !"-cl-mad-enable", !"-cl-denorms-are-zero", !"-cl-unsafe-math-optimizations"}
!13 = !{!"clang version 14.0.6 (https://github.com/a2flo/floor_llvm.git 85a83a4073c340ac03ca1c8fcd131db30339db24)"}
!14 = !{i64 0, i64 4294967295}
!15 = !{i64 1, i64 4294967295}
!16 = !{i32 1, i32 -1}
!17 = !{i64 0, i64 2048}
!18 = !{i32 0, i32 2048}
!19 = !{!20, !20, i64 0}
!20 = !{!"omnipotent char", !21, i64 0}
!21 = !{!"Simple C++ TBAA"}
!22 = distinct !{!22, !23}
!23 = !{!"llvm.loop.mustprogress"}
!24 = distinct !{!24, !23}

OpenCL / SPIR-V

Note that the compiler usually outputs a binary .spv file directly. The listing below was produced by disassembling that file with spirv-dis (provided by the toolchain), using the extended-readability mode (--debug-asm).

; SPIR-V
; Version: 1.0
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 153
; Schema: 0
                                         Capability Addresses
                                         Capability Linkage
                                         Capability Kernel
                                         Capability Int64
                                    %1 = ExtInstImport "OpenCL.std"
                                         MemoryModel Physical64 OpenCL
                                         EntryPoint Kernel %simplified_nbody "simplified_nbody" %__spirv_BuiltInGlobalInvocationId %__spirv_BuiltInGlobalSize %__spirv_BuiltInLocalInvocationId
                                         ExecutionMode %simplified_nbody LocalSize 256 1 1
                                         SourceExtension "cl_khr_byte_addressable_store"
                                         SourceExtension "cl_khr_fp16"
                                         SourceExtension "cl_khr_fp64"
                                         SourceExtension "cl_khr_gl_msaa_sharing"
                                         SourceExtension "cl_khr_global_int32_base_atomics"
                                         SourceExtension "cl_khr_global_int32_extended_atomics"
                                         SourceExtension "cl_khr_local_int32_base_atomics"
                                         SourceExtension "cl_khr_local_int32_extended_atomics"
                                         Source OpenCL_C 102000
                                         Decorate %simplified_nbody.local_body_positions Alignment 4
                                         Decorate %19 FuncParamAttr NoAlias
                                         Decorate %19 FuncParamAttr NoCapture
                                         Decorate %19 FuncParamAttr NoWrite
                                         Decorate %20 FuncParamAttr NoAlias
                                         Decorate %20 FuncParamAttr NoCapture
                                         Decorate %21 FuncParamAttr NoAlias
                                         Decorate %21 FuncParamAttr NoCapture
                                         Decorate %__spirv_BuiltInGlobalInvocationId LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
                                         Decorate %__spirv_BuiltInGlobalInvocationId Constant
                                         Decorate %__spirv_BuiltInGlobalInvocationId BuiltIn GlobalInvocationId
                                         Decorate %__spirv_BuiltInGlobalSize LinkageAttributes "__spirv_BuiltInGlobalSize" Import
                                         Decorate %__spirv_BuiltInGlobalSize Constant
                                         Decorate %__spirv_BuiltInGlobalSize BuiltIn GlobalSize
                                         Decorate %__spirv_BuiltInLocalInvocationId LinkageAttributes "__spirv_BuiltInLocalInvocationId" Import
                                         Decorate %__spirv_BuiltInLocalInvocationId Constant
                                         Decorate %__spirv_BuiltInLocalInvocationId BuiltIn LocalInvocationId
                                         Decorate %70 FPFastMathMode Fast
                                         Decorate %72 FPFastMathMode Fast
                                         Decorate %74 FPFastMathMode Fast
                                         Decorate %101 FPFastMathMode Fast
                                         Decorate %102 FPFastMathMode Fast
                                         Decorate %103 FPFastMathMode Fast
                                         Decorate %104 FPFastMathMode Fast
                                         Decorate %105 FPFastMathMode Fast
                                         Decorate %106 FPFastMathMode Fast
                                         Decorate %108 FPFastMathMode Fast
                                         Decorate %109 FPFastMathMode Fast
                                         Decorate %110 FPFastMathMode Fast
                                         Decorate %114 FPFastMathMode Fast
                                         Decorate %115 FPFastMathMode Fast
                                         Decorate %116 FPFastMathMode Fast
                                         Decorate %117 FPFastMathMode Fast
                                         Decorate %118 FPFastMathMode Fast
                                         Decorate %119 FPFastMathMode Fast
                                         Decorate %131 FPFastMathMode Fast
                                         Decorate %132 FPFastMathMode Fast
                                         Decorate %133 FPFastMathMode Fast
                                         Decorate %134 FPFastMathMode Fast
                                         Decorate %135 FPFastMathMode Fast
                                         Decorate %136 FPFastMathMode Fast
                                         Decorate %138 FPFastMathMode Fast
                                         Decorate %139 FPFastMathMode Fast
                                         Decorate %140 FPFastMathMode Fast
                                         Decorate %141 FPFastMathMode Fast
                                         Decorate %142 FPFastMathMode Fast
                                         Decorate %143 FPFastMathMode Fast
                                         Decorate %146 FPFastMathMode Fast
                                         Decorate %149 FPFastMathMode Fast
                                         Decorate %152 FPFastMathMode Fast
                                %ulong = TypeInt 64 0
                                 %uint = TypeInt 32 0
                                %256ul = Constant %ulong 256
                                   %0u = Constant %uint 0
                                   %1u = Constant %uint 1
                                   %2u = Constant %uint 2
                                  %0ul = Constant %ulong 0
                                   %3u = Constant %uint 3
                                   %8u = Constant %uint 8
                                 %272u = Constant %uint 272
                                %0ul_0 = Constant %ulong 0
                                  %1ul = Constant %ulong 1
                                 %256u = Constant %uint 256
                                %float = TypeFloat 32
                          %struct.anon = TypeStruct %float %float %float %float
                           %union.anon = TypeStruct %struct.anon
                        %class.vector4 = TypeStruct %union.anon
                 %class.vector4[256ul] = TypeArray %class.vector4 %256ul
     %(Workgroup)class.vector4[256ul]* = TypePointer Workgroup %class.vector4[256ul]
                                 %void = TypeVoid
       %(CrossWorkgroup)class.vector4* = TypePointer CrossWorkgroup %class.vector4
                        %struct.anon.9 = TypeStruct %float %float %float
                         %union.anon.8 = TypeStruct %struct.anon.9
                        %class.vector3 = TypeStruct %union.anon.8
       %(CrossWorkgroup)class.vector3* = TypePointer CrossWorkgroup %class.vector3
                             %void(#4) = TypeFunction %void %(CrossWorkgroup)class.vector4* %(CrossWorkgroup)class.vector4* %(CrossWorkgroup)class.vector3* %float
                            %<3xulong> = TypeVector %ulong 3
                    %(Input)<3xulong>* = TypePointer Input %<3xulong>
               %(CrossWorkgroup)float* = TypePointer CrossWorkgroup %float
                    %(Workgroup)float* = TypePointer Workgroup %float
                                 %bool = TypeBool
%simplified_nbody.local_body_positions = Variable %(Workgroup)class.vector4[256ul]* Workgroup
    %__spirv_BuiltInGlobalInvocationId = Variable %(Input)<3xulong>* Input
            %__spirv_BuiltInGlobalSize = Variable %(Input)<3xulong>* Input
     %__spirv_BuiltInLocalInvocationId = Variable %(Input)<3xulong>* Input
                                 %0.0f = Constant %float 0
                      %9.99999975e-05f = Constant %float 9.99999975e-05
                         %0.999000013f = Constant %float 0.999000013

function void simplified_nbody ( %void(#4) ) {
                                   %19 = FunctionParameter %(CrossWorkgroup)class.vector4*
                                   %20 = FunctionParameter %(CrossWorkgroup)class.vector4*
                                   %21 = FunctionParameter %(CrossWorkgroup)class.vector3*
                                   %22 = FunctionParameter %float
23:
                                   %31 = Load %<3xulong> %__spirv_BuiltInGlobalInvocationId Aligned 32
                                   %32 = CompositeExtract %ulong %31 0
                                   %36 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %32 %0u %0u %0u
                                   %37 = Load %float %36 Aligned 4
                                   %39 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %32 %0u %0u %1u
                                   %40 = Load %float %39 Aligned 4
                                   %42 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %32 %0u %0u %2u
                                   %43 = Load %float %42 Aligned 4
                                   %44 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %21 %32 %0u %0u %0u
                                   %45 = Load %float %44 Aligned 4
                                   %46 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %21 %32 %0u %0u %1u
                                   %47 = Load %float %46 Aligned 4
                                   %48 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %21 %32 %0u %0u %2u
                                   %49 = Load %float %48 Aligned 4
                                   %51 = Load %<3xulong> %__spirv_BuiltInGlobalSize Aligned 32
                                   %52 = CompositeExtract %ulong %51 0
                                   %53 = UConvert %uint %52
                                   %55 = Load %<3xulong> %__spirv_BuiltInLocalInvocationId Aligned 32
                                   %56 = CompositeExtract %ulong %55 0
                                   %57 = UConvert %uint %56
                                   %60 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %56 %0u %0u %0u
                                   %61 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %56 %0u %0u %1u
                                   %62 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %56 %0u %0u %2u
                                   %64 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %56 %0u %0u %3u
                                         Branch %24

24:
                                   %66 = Phi %uint ( %65 <- %26, %0u <- %23 )
                                   %68 = Phi %uint ( %67 <- %26, %0u <- %23 )
                                   %71 = Phi %float ( %0.0f <- %23, %70 <- %26 )
                                   %73 = Phi %float ( %0.0f <- %23, %72 <- %26 )
                                   %75 = Phi %float ( %0.0f <- %23, %74 <- %26 )
                                   %77 = ShiftLeftLogical %uint %68 %8u
                                   %78 = IAdd %uint %77 %57
                                   %79 = UConvert %ulong %78
                                   %80 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %79 %0u %0u %0u
                                   %81 = Load %float %80 Aligned 4
                                   %82 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %79 %0u %0u %1u
                                   %83 = Load %float %82 Aligned 4
                                   %84 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %79 %0u %0u %2u
                                   %85 = Load %float %84 Aligned 4
                                   %86 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %19 %79 %0u %0u %3u
                                   %87 = Load %float %86 Aligned 4
                                         Store %60 %81 Aligned 4
                                         Store %61 %83 Aligned 4
                                         Store %62 %85 Aligned 4
                                         Store %64 %87 Aligned 4
                                         ControlBarrier %2u %2u %272u
                                         Branch %25

25:
                                   %91 = Phi %ulong ( %89 <- %25, %0ul_0 <- %24 )
                                   %92 = Phi %float ( %71 <- %24, %70 <- %25 )
                                   %93 = Phi %float ( %73 <- %24, %72 <- %25 )
                                   %94 = Phi %float ( %75 <- %24, %74 <- %25 )
                                   %95 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %91 %0u %0u %0u
                                   %96 = Load %float %95 Aligned 4
                                   %97 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %91 %0u %0u %1u
                                   %98 = Load %float %97 Aligned 4
                                   %99 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %91 %0u %0u %2u
                                  %100 = Load %float %99 Aligned 4
                                  %101 = FSub %float %96 %37
                                  %102 = FSub %float %98 %40
                                  %103 = FSub %float %100 %43
                                  %104 = FMul %float %101 %101
                                  %105 = FMul %float %102 %102
                                  %106 = FMul %float %103 %103
                                  %108 = FAdd %float %104 %9.99999975e-05f
                                  %109 = FAdd %float %108 %105
                                  %110 = FAdd %float %109 %106
                                  %111 = ExtInst %float %1 rsqrt %110
                                  %112 = InBoundsPtrAccessChain %(Workgroup)float* %simplified_nbody.local_body_positions %0ul %91 %0u %0u %3u
                                  %113 = Load %float %112 Aligned 4
                                  %114 = FMul %float %111 %111
                                  %115 = FMul %float %114 %111
                                  %116 = FMul %float %115 %113
                                  %117 = FMul %float %116 %101
                                  %118 = FMul %float %116 %102
                                  %119 = FMul %float %116 %103
                                   %74 = FAdd %float %117 %94
                                   %72 = FAdd %float %118 %93
                                   %70 = FAdd %float %119 %92
                                   %89 = IAdd %ulong %91 %1ul
                                  %126 = IEqual %bool %89 %256ul
                                         BranchConditional %126 %26 %25

26:
                                         ControlBarrier %2u %2u %272u
                                   %65 = IAdd %uint %66 %256u
                                   %67 = IAdd %uint %68 %1u
                                  %130 = ULessThan %bool %65 %53
                                         BranchConditional %130 %24 %27

27:
                                  %131 = FMul %float %74 %22
                                  %132 = FMul %float %72 %22
                                  %133 = FMul %float %70 %22
                                  %134 = FAdd %float %131 %45
                                  %135 = FAdd %float %132 %47
                                  %136 = FAdd %float %133 %49
                                  %138 = FMul %float %134 %0.999000013f
                                  %139 = FMul %float %135 %0.999000013f
                                  %140 = FMul %float %136 %0.999000013f
                                  %141 = FMul %float %138 %22
                                  %142 = FMul %float %139 %22
                                  %143 = FMul %float %140 %22
                                  %144 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %20 %32 %0u %0u %0u
                                  %145 = Load %float %144 Aligned 4
                                  %146 = FAdd %float %145 %141
                                         Store %144 %146 Aligned 4
                                  %147 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %20 %32 %0u %0u %1u
                                  %148 = Load %float %147 Aligned 4
                                  %149 = FAdd %float %148 %142
                                         Store %147 %149 Aligned 4
                                  %150 = InBoundsPtrAccessChain %(CrossWorkgroup)float* %20 %32 %0u %0u %2u
                                  %151 = Load %float %150 Aligned 4
                                  %152 = FAdd %float %151 %143
                                         Store %150 %152 Aligned 4
                                         Store %44 %138 Aligned 4
                                         Store %46 %139 Aligned 4
                                         Store %48 %140 Aligned 4
                                         Return
}

Vulkan / SPIR-V

Note that the compiler usually outputs a .spvc file directly (a simple container format for multiple SPIR-V binaries). The listing below was produced by disassembling the contained SPIR-V binary with spirv-dis (provided by the toolchain), using the extended-readability mode (--debug-asm).

; SPIR-V
; Version: 1.6
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 210
; Schema: 0
                                                Capability Matrix
                                                Capability Shader
                                                Capability Int64
                                                Capability GroupNonUniform
                                                Capability VariablePointersStorageBuffer
                                                Capability VariablePointers
                                                Capability ShaderNonUniform
                                                Capability UniformBufferArrayNonUniformIndexing
                                                Capability SampledImageArrayNonUniformIndexing
                                                Capability StorageBufferArrayNonUniformIndexing
                                                Capability StorageImageArrayNonUniformIndexing
                                                Capability VulkanMemoryModel
                                                Capability VulkanMemoryModelDeviceScope
                                                Capability PhysicalStorageBufferAddresses
                                           %1 = ExtInstImport "GLSL.std.450"
                                                MemoryModel PhysicalStorageBuffer64 Vulkan
                                                EntryPoint GLCompute %simplified_nbody "simplified_nbody" %simplified_nbody.vulkan_uniform. %simplified_nbody.vulkan_uniform..1 %simplified_nbody.vulkan_uniform..2 %simplified_nbody.vulkan_uniform..3 %simplified_nbody.vulkan_builtin_input. %simplified_nbody.vulkan_builtin_input..4 %simplified_nbody.vulkan_builtin_input..5 %simplified_nbody.vulkan_builtin_input..6 %simplified_nbody.vulkan_builtin_input..7 %simplified_nbody.vulkan_builtin_input..8 %vulkan.immutable_sampler_0 %vulkan.immutable_sampler_1 %vulkan.immutable_sampler_2 %vulkan.immutable_sampler_3 %vulkan.immutable_sampler_4 %vulkan.immutable_sampler_5 %vulkan.immutable_sampler_6 %vulkan.immutable_sampler_7 %vulkan.immutable_sampler_8 %vulkan.immutable_sampler_9 %vulkan.immutable_sampler_10 %vulkan.immutable_sampler_11 %vulkan.immutable_sampler_12 %vulkan.immutable_sampler_13 %vulkan.immutable_sampler_14 %vulkan.immutable_sampler_15 %vulkan.immutable_sampler_16 %vulkan.immutable_sampler_17 %vulkan.immutable_sampler_18 %vulkan.immutable_sampler_19 %vulkan.immutable_sampler_20 %vulkan.immutable_sampler_21 %vulkan.immutable_sampler_22 %vulkan.immutable_sampler_23 %vulkan.immutable_sampler_24 %vulkan.immutable_sampler_25 %vulkan.immutable_sampler_26 %vulkan.immutable_sampler_27 %vulkan.immutable_sampler_28 %vulkan.immutable_sampler_29 %vulkan.immutable_sampler_30 %vulkan.immutable_sampler_31 %vulkan.immutable_sampler_32 %vulkan.immutable_sampler_33 %vulkan.immutable_sampler_34 %vulkan.immutable_sampler_35 %vulkan.immutable_sampler_36 %vulkan.immutable_sampler_37 %vulkan.immutable_sampler_38 %vulkan.immutable_sampler_39 %vulkan.immutable_sampler_40 %vulkan.immutable_sampler_41 %vulkan.immutable_sampler_42 %vulkan.immutable_sampler_43 %vulkan.immutable_sampler_44 %vulkan.immutable_sampler_45 %vulkan.immutable_sampler_46 %vulkan.immutable_sampler_47 %_ZZ16simplified_nbodyE20local_body_positions
                                                ExecutionMode %simplified_nbody LocalSize 256 1 1
                                                SourceExtension "vk_capability_int16"
                                                SourceExtension "vk_capability_int64"
                                                SourceExtension "vk_capability_multiview"
                                                Source GLSL 450
                                                Decorate %vulkan.immutable_sampler_0 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_0 Binding 0
                                                Decorate %vulkan.immutable_sampler_1 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_1 Binding 1
                                                Decorate %vulkan.immutable_sampler_2 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_2 Binding 2
                                                Decorate %vulkan.immutable_sampler_3 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_3 Binding 3
                                                Decorate %vulkan.immutable_sampler_4 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_4 Binding 4
                                                Decorate %vulkan.immutable_sampler_5 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_5 Binding 5
                                                Decorate %vulkan.immutable_sampler_6 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_6 Binding 6
                                                Decorate %vulkan.immutable_sampler_7 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_7 Binding 7
                                                Decorate %vulkan.immutable_sampler_8 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_8 Binding 8
                                                Decorate %vulkan.immutable_sampler_9 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_9 Binding 9
                                                Decorate %vulkan.immutable_sampler_10 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_10 Binding 10
                                                Decorate %vulkan.immutable_sampler_11 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_11 Binding 11
                                                Decorate %vulkan.immutable_sampler_12 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_12 Binding 12
                                                Decorate %vulkan.immutable_sampler_13 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_13 Binding 13
                                                Decorate %vulkan.immutable_sampler_14 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_14 Binding 14
                                                Decorate %vulkan.immutable_sampler_15 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_15 Binding 15
                                                Decorate %vulkan.immutable_sampler_16 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_16 Binding 16
                                                Decorate %vulkan.immutable_sampler_17 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_17 Binding 17
                                                Decorate %vulkan.immutable_sampler_18 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_18 Binding 18
                                                Decorate %vulkan.immutable_sampler_19 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_19 Binding 19
                                                Decorate %vulkan.immutable_sampler_20 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_20 Binding 20
                                                Decorate %vulkan.immutable_sampler_21 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_21 Binding 21
                                                Decorate %vulkan.immutable_sampler_22 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_22 Binding 22
                                                Decorate %vulkan.immutable_sampler_23 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_23 Binding 23
                                                Decorate %vulkan.immutable_sampler_24 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_24 Binding 24
                                                Decorate %vulkan.immutable_sampler_25 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_25 Binding 25
                                                Decorate %vulkan.immutable_sampler_26 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_26 Binding 26
                                                Decorate %vulkan.immutable_sampler_27 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_27 Binding 27
                                                Decorate %vulkan.immutable_sampler_28 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_28 Binding 28
                                                Decorate %vulkan.immutable_sampler_29 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_29 Binding 29
                                                Decorate %vulkan.immutable_sampler_30 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_30 Binding 30
                                                Decorate %vulkan.immutable_sampler_31 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_31 Binding 31
                                                Decorate %vulkan.immutable_sampler_32 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_32 Binding 32
                                                Decorate %vulkan.immutable_sampler_33 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_33 Binding 33
                                                Decorate %vulkan.immutable_sampler_34 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_34 Binding 34
                                                Decorate %vulkan.immutable_sampler_35 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_35 Binding 35
                                                Decorate %vulkan.immutable_sampler_36 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_36 Binding 36
                                                Decorate %vulkan.immutable_sampler_37 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_37 Binding 37
                                                Decorate %vulkan.immutable_sampler_38 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_38 Binding 38
                                                Decorate %vulkan.immutable_sampler_39 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_39 Binding 39
                                                Decorate %vulkan.immutable_sampler_40 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_40 Binding 40
                                                Decorate %vulkan.immutable_sampler_41 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_41 Binding 41
                                                Decorate %vulkan.immutable_sampler_42 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_42 Binding 42
                                                Decorate %vulkan.immutable_sampler_43 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_43 Binding 43
                                                Decorate %vulkan.immutable_sampler_44 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_44 Binding 44
                                                Decorate %vulkan.immutable_sampler_45 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_45 Binding 45
                                                Decorate %vulkan.immutable_sampler_46 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_46 Binding 46
                                                Decorate %vulkan.immutable_sampler_47 DescriptorSet 0
                                                Decorate %vulkan.immutable_sampler_47 Binding 47
                                                Decorate %class.vector4[256l] ArrayStride 16
                                                MemberDecorate %class.vector4 0 Offset 0
                                                MemberDecorate %union.anon 0 Offset 0
                                                MemberDecorate %struct.anon 0 Offset 0
                                                MemberDecorate %struct.anon 1 Offset 4
                                                MemberDecorate %struct.anon 2 Offset 8
                                                MemberDecorate %struct.anon 3 Offset 12
                                                Decorate %enclose.class.vector4 Block
                                                MemberDecorate %enclose.class.vector4 0 Offset 0
                                                Decorate %class.vector4[] ArrayStride 16
                                                Decorate %(StorageBuffer)enclose.class.vector4* ArrayStride 16
                                                Decorate %simplified_nbody.vulkan_uniform. NonWritable
                                                Decorate %simplified_nbody.vulkan_uniform. DescriptorSet 1
                                                Decorate %simplified_nbody.vulkan_uniform. Binding 0
                                                Decorate %enclose.class.vector4_0 Block
                                                MemberDecorate %enclose.class.vector4_0 0 Offset 0
                                                Decorate %class.vector4[]_0 ArrayStride 16
                                                Decorate %(StorageBuffer)enclose.class.vector4_0* ArrayStride 16
                                                Decorate %simplified_nbody.vulkan_uniform..1 DescriptorSet 1
                                                Decorate %simplified_nbody.vulkan_uniform..1 Binding 1
                                                Decorate %enclose.class.vector3 Block
                                                MemberDecorate %enclose.class.vector3 0 Offset 0
                                                Decorate %class.vector3[] ArrayStride 12
                                                Decorate %(StorageBuffer)enclose.class.vector3* ArrayStride 12
                                                MemberDecorate %class.vector3 0 Offset 0
                                                MemberDecorate %union.anon.8 0 Offset 0
                                                MemberDecorate %struct.anon.9 0 Offset 0
                                                MemberDecorate %struct.anon.9 1 Offset 4
                                                MemberDecorate %struct.anon.9 2 Offset 8
                                                Decorate %simplified_nbody.vulkan_uniform..2 DescriptorSet 1
                                                Decorate %simplified_nbody.vulkan_uniform..2 Binding 2
                                                Decorate %enclose. Block
                                                MemberDecorate %enclose. 0 Offset 0
                                                Decorate %simplified_nbody.vulkan_uniform..3 NonWritable
                                                Decorate %simplified_nbody.vulkan_uniform..3 Uniform
                                                Decorate %simplified_nbody.vulkan_uniform..3 DescriptorSet 1
                                                Decorate %simplified_nbody.vulkan_uniform..3 Binding 3
                                                Decorate %simplified_nbody.vulkan_builtin_input. BuiltIn WorkgroupId
                                                Decorate %simplified_nbody.vulkan_builtin_input..4 BuiltIn NumWorkgroups
                                                Decorate %simplified_nbody.vulkan_builtin_input..5 BuiltIn SubgroupId
                                                Decorate %simplified_nbody.vulkan_builtin_input..6 BuiltIn SubgroupLocalInvocationId
                                                Decorate %simplified_nbody.vulkan_builtin_input..7 BuiltIn SubgroupSize
                                                Decorate %simplified_nbody.vulkan_builtin_input..8 BuiltIn NumSubgroups
                                                Decorate %(Workgroup)class.vector4[256l]* ArrayStride 4096
                                                Decorate %155 NoSignedWrap
                                                Decorate %155 NoUnsignedWrap
                                       %ilong = TypeInt 64 1
                                        %iint = TypeInt 32 1
                                        %256l = Constant %ilong 256
                                          %8i = Constant %iint 8
                                          %0i = Constant %iint 0
                                          %1i = Constant %iint 1
                                          %2i = Constant %iint 2
                                          %3i = Constant %iint 3
                                       %2504i = Constant %iint 2504
                                          %0l = Constant %ilong 0
                                          %1l = Constant %ilong 1
                                        %256i = Constant %iint 256
                                     %Sampler = TypeSampler
                   %(UniformConstant)Sampler* = TypePointer UniformConstant %Sampler
                                       %float = TypeFloat 32
                                 %struct.anon = TypeStruct %float %float %float %float
                                  %union.anon = TypeStruct %struct.anon
                               %class.vector4 = TypeStruct %union.anon
                         %class.vector4[256l] = TypeArray %class.vector4 %256l
             %(Workgroup)class.vector4[256l]* = TypePointer Workgroup %class.vector4[256l]
                                        %void = TypeVoid
                                      %void() = TypeFunction %void
                             %class.vector4[] = TypeRuntimeArray %class.vector4
                       %enclose.class.vector4 = TypeStruct %class.vector4[]
       %(StorageBuffer)enclose.class.vector4* = TypePointer StorageBuffer %enclose.class.vector4
                           %class.vector4[]_0 = TypeRuntimeArray %class.vector4
                     %enclose.class.vector4_0 = TypeStruct %class.vector4[]_0
     %(StorageBuffer)enclose.class.vector4_0* = TypePointer StorageBuffer %enclose.class.vector4_0
                               %struct.anon.9 = TypeStruct %float %float %float
                                %union.anon.8 = TypeStruct %struct.anon.9
                               %class.vector3 = TypeStruct %union.anon.8
                             %class.vector3[] = TypeRuntimeArray %class.vector3
                       %enclose.class.vector3 = TypeStruct %class.vector3[]
       %(StorageBuffer)enclose.class.vector3* = TypePointer StorageBuffer %enclose.class.vector3
                                    %enclose. = TypeStruct %float
                          %(Uniform)enclose.* = TypePointer Uniform %enclose.
                                    %<3xiint> = TypeVector %iint 3
                            %(Input)<3xiint>* = TypePointer Input %<3xiint>
                                %(Input)iint* = TypePointer Input %iint
                       %(StorageBuffer)float* = TypePointer StorageBuffer %float
                           %(Workgroup)float* = TypePointer Workgroup %float
                                        %bool = TypeBool
                             %(Uniform)float* = TypePointer Uniform %float
                  %vulkan.immutable_sampler_0 = Variable %(UniformConstant)Sampler* UniformConstant
                  %vulkan.immutable_sampler_1 = Variable %(UniformConstant)Sampler* UniformConstant
                  %vulkan.immutable_sampler_2 = Variable %(UniformConstant)Sampler* UniformConstant
                  %vulkan.immutable_sampler_3 = Variable %(UniformConstant)Sampler* UniformConstant
                  %vulkan.immutable_sampler_4 = Variable %(UniformConstant)Sampler* UniformConstant
                  %vulkan.immutable_sampler_5 = Variable %(UniformConstant)Sampler* UniformConstant
                  %vulkan.immutable_sampler_6 = Variable %(UniformConstant)Sampler* UniformConstant
                  %vulkan.immutable_sampler_7 = Variable %(UniformConstant)Sampler* UniformConstant
                  %vulkan.immutable_sampler_8 = Variable %(UniformConstant)Sampler* UniformConstant
                  %vulkan.immutable_sampler_9 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_10 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_11 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_12 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_13 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_14 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_15 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_16 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_17 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_18 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_19 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_20 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_21 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_22 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_23 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_24 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_25 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_26 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_27 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_28 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_29 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_30 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_31 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_32 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_33 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_34 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_35 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_36 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_37 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_38 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_39 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_40 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_41 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_42 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_43 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_44 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_45 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_46 = Variable %(UniformConstant)Sampler* UniformConstant
                 %vulkan.immutable_sampler_47 = Variable %(UniformConstant)Sampler* UniformConstant
%_ZZ16simplified_nbodyE20local_body_positions = Variable %(Workgroup)class.vector4[256l]* Workgroup
            %simplified_nbody.vulkan_uniform. = Variable %(StorageBuffer)enclose.class.vector4* StorageBuffer
          %simplified_nbody.vulkan_uniform..1 = Variable %(StorageBuffer)enclose.class.vector4_0* StorageBuffer
          %simplified_nbody.vulkan_uniform..2 = Variable %(StorageBuffer)enclose.class.vector3* StorageBuffer
          %simplified_nbody.vulkan_uniform..3 = Variable %(Uniform)enclose.* Uniform
      %simplified_nbody.vulkan_builtin_input. = Variable %(Input)<3xiint>* Input
    %simplified_nbody.vulkan_builtin_input..4 = Variable %(Input)<3xiint>* Input
    %simplified_nbody.vulkan_builtin_input..5 = Variable %(Input)iint* Input
    %simplified_nbody.vulkan_builtin_input..6 = Variable %(Input)iint* Input
    %simplified_nbody.vulkan_builtin_input..7 = Variable %(Input)iint* Input
    %simplified_nbody.vulkan_builtin_input..8 = Variable %(Input)iint* Input
                                        %0.0f = Constant %float 0
                             %9.99999975e-05f = Constant %float 9.99999975e-05
                                %0.999000013f = Constant %float 0.999000013

function void simplified_nbody ( %void() ) {
92:
                                          %98 = Load %<3xiint> %simplified_nbody.vulkan_builtin_input. Aligned 16
                                          %99 = CompositeExtract %iint %98 0
                                         %101 = ShiftLeftLogical %iint %99 %8i
                                         %102 = Load %iint %simplified_nbody.vulkan_builtin_input..6 Aligned 4
                                         %103 = Load %iint %simplified_nbody.vulkan_builtin_input..5 Aligned 4
                                         %104 = Load %iint %simplified_nbody.vulkan_builtin_input..7 Aligned 4
                                         %105 = IMul %iint %103 %104
                                         %106 = IAdd %iint %105 %102
                                         %107 = IAdd %iint %101 %106
                                         %108 = Load %<3xiint> %simplified_nbody.vulkan_builtin_input..4 Aligned 16
                                         %109 = CompositeExtract %iint %108 0
                                         %110 = ShiftLeftLogical %iint %109 %8i
                                         %113 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %107 %0i %0i %0i
                                         %115 = Load %float %113 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                         %116 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %107 %0i %0i %1i
                                         %117 = Load %float %116 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                         %119 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %107 %0i %0i %2i
                                         %120 = Load %float %119 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                         %121 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..2 %0i %0i %107 %0i %0i %0i
                                         %122 = Load %float %121 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                         %123 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..2 %0i %0i %107 %0i %0i %1i
                                         %124 = Load %float %123 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                         %125 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..2 %0i %0i %107 %0i %0i %2i
                                         %126 = Load %float %125 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                         %128 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %106 %0i %0i %0i
                                         %129 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %106 %0i %0i %1i
                                         %130 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %106 %0i %0i %2i
                                         %132 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %106 %0i %0i %3i
                                                Branch %93

93:
                                         %134 = Phi %iint ( %133 <- %96, %0i <- %92 )
                                         %136 = Phi %iint ( %135 <- %96, %0i <- %92 )
                                         %139 = Phi %float ( %0.0f <- %92, %138 <- %96 )
                                         %141 = Phi %float ( %0.0f <- %92, %140 <- %96 )
                                         %143 = Phi %float ( %0.0f <- %92, %142 <- %96 )
                                         %144 = ShiftLeftLogical %iint %136 %8i
                                         %145 = IAdd %iint %106 %144
                                         %146 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %145 %0i %0i %0i
                                         %147 = Load %float %146 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                         %148 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %145 %0i %0i %1i
                                         %149 = Load %float %148 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                         %150 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %145 %0i %0i %2i
                                         %151 = Load %float %150 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                         %152 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform. %0i %0i %145 %0i %0i %3i
                                         %153 = Load %float %152 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                                Store %128 %147 Aligned 4
                                                Store %129 %149 Aligned 4
                                                Store %130 %151 Aligned 4
                                                Store %132 %153 Aligned 4
                                                ControlBarrier %2i %2i %2504i
                                                LoopMerge %97 %96 None
                                                Branch %94

94:
                                         %157 = Phi %ilong ( %155 <- %94, %0l <- %93 )
                                         %158 = Phi %float ( %139 <- %93, %138 <- %94 )
                                         %159 = Phi %float ( %141 <- %93, %140 <- %94 )
                                         %160 = Phi %float ( %143 <- %93, %142 <- %94 )
                                         %161 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %157 %0i %0i %0i
                                         %162 = Load %float %161 Aligned 4
                                         %163 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %157 %0i %0i %1i
                                         %164 = Load %float %163 Aligned 4
                                         %165 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %157 %0i %0i %2i
                                         %166 = Load %float %165 Aligned 4
                                         %167 = FSub %float %162 %115
                                         %168 = FSub %float %164 %117
                                         %169 = FSub %float %166 %120
                                         %171 = ExtInst %float %1 Fma %167 %167 %9.99999975e-05f
                                         %172 = ExtInst %float %1 Fma %168 %168 %171
                                         %173 = ExtInst %float %1 Fma %169 %169 %172
                                         %174 = ExtInst %float %1 InverseSqrt %173
                                         %175 = PtrAccessChain %(Workgroup)float* %_ZZ16simplified_nbodyE20local_body_positions %0i %157 %0i %0i %3i
                                         %176 = Load %float %175 Aligned 4
                                         %177 = FMul %float %174 %174
                                         %178 = FMul %float %177 %174
                                         %179 = FMul %float %178 %176
                                         %142 = ExtInst %float %1 Fma %179 %167 %160
                                         %140 = ExtInst %float %1 Fma %179 %168 %159
                                         %138 = ExtInst %float %1 Fma %179 %169 %158
                                         %155 = IAdd %ilong %157 %1l
                                         %186 = IEqual %bool %155 %256l
                                                LoopMerge %95 %94 None
                                                BranchConditional %186 %95 %94

95:
                                                Branch %96

96:
                                                ControlBarrier %2i %2i %2504i
                                         %133 = IAdd %iint %134 %256i
                                         %135 = IAdd %iint %136 %1i
                                         %190 = ULessThan %bool %133 %110
                                                BranchConditional %190 %93 %97

97:
                                         %192 = InBoundsAccessChain %(Uniform)float* %simplified_nbody.vulkan_uniform..3 %0i
                                         %193 = Load %float %192 Aligned 4
                                         %194 = ExtInst %float %1 Fma %193 %142 %122
                                         %195 = ExtInst %float %1 Fma %193 %140 %124
                                         %196 = ExtInst %float %1 Fma %193 %138 %126
                                         %198 = FMul %float %194 %0.999000013f
                                         %199 = FMul %float %195 %0.999000013f
                                         %200 = FMul %float %196 %0.999000013f
                                         %201 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..1 %0i %0i %107 %0i %0i %0i
                                         %202 = Load %float %201 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                         %203 = ExtInst %float %1 Fma %198 %193 %202
                                                Store %201 %203 Aligned|MakePointerAvailable|NonPrivatePointer 4 %1i
                                         %204 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..1 %0i %0i %107 %0i %0i %1i
                                         %205 = Load %float %204 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                         %206 = ExtInst %float %1 Fma %199 %193 %205
                                                Store %204 %206 Aligned|MakePointerAvailable|NonPrivatePointer 4 %1i
                                         %207 = PtrAccessChain %(StorageBuffer)float* %simplified_nbody.vulkan_uniform..1 %0i %0i %107 %0i %0i %2i
                                         %208 = Load %float %207 Aligned|MakePointerVisible|NonPrivatePointer 4 %1i
                                         %209 = ExtInst %float %1 Fma %200 %193 %208
                                                Store %207 %209 Aligned|MakePointerAvailable|NonPrivatePointer 4 %1i
                                                Store %121 %198 Aligned|MakePointerAvailable|NonPrivatePointer 4 %1i
                                                Store %123 %199 Aligned|MakePointerAvailable|NonPrivatePointer 4 %1i
                                                Store %125 %200 Aligned|MakePointerAvailable|NonPrivatePointer 4 %1i
                                                Return
}

Requirements

OS:
- only AMD64/Intel64/ARM64 are supported
- Windows: NT 6.1+
- macOS: 13.0+
- iOS: 16.0+
- Linux: any current x64 distribution
- other Unix: if other requirements are met
compiler/toolchain:
- Generic: Clang / LLVM / libc++ 16.0+
- macOS/iOS: Xcode 15.0+
- Windows (VS): VS2022 with provided clang/LLVM
- Windows (MinGW): MSYS2 with Clang/LLVM/libc++ 16.0+
libraries and optional requirements:
- SDL3 3.1.1+
- (opt) OpenCL: requires OpenCL 1.2+ SDK and CPU/GPU drivers (Intel, AMD)
- (opt) CUDA: requires sm_50+/Maxwell+ GPU and CUDA 12.0+ drivers (CUDA SDK not required!)
- (opt) Metal: requires iOS 16.0+ or macOS 13.0+, and a Metal 3.0 capable GPU
- (opt) Host-Compute: requires just the compiler/toolchain that is stated above
- (opt) Vulkan: requires 1.3.231+ ICD loader / headers / SDK, volk included as submodule
- (opt) OpenVR: requires OpenVR
- (opt) OpenXR: requires OpenXR

Build Instructions

ensure git submodules are cloned and up-to-date: git submodule update --init --recursive

General / CLI

run ./build.sh (use ./build.sh help to get a list of all options)
configuration of optional parts:
- to disable OpenCL: define FLOOR_NO_OPENCL or ./build.sh no-opencl
- to disable CUDA: define FLOOR_NO_CUDA or ./build.sh no-cuda
- to disable Metal (only affects macOS/iOS builds): define FLOOR_NO_METAL or ./build.sh no-metal
- to disable Host Compute: define FLOOR_NO_HOST_COMPUTE or ./build.sh no-host-compute
- to disable Vulkan: define FLOOR_NO_VULKAN or ./build.sh no-vulkan
- to disable OpenVR: define FLOOR_NO_OPENVR or ./build.sh no-openvr
- to disable OpenXR: define FLOOR_NO_OPENXR or ./build.sh no-openxr
- to build with libstdc++ (GCC 13.0+) instead of libc++: ./build.sh libstdc++

CMake / ninja / CLI

this is provided as an alternative to build.sh and Xcode
create a build folder and cd into it
run cmake -G "Ninja" -S "<path-to-libfloor>" <options>
options:
- to build a static library instead of a shared/dynamic one: -DBUILD_SHARED_LIBS=OFF
- to explicitly use libc++: -DWITH_LIBCXX=ON
- to build with address sanitizer: -DWITH_ASAN=ON
run ninja

Xcode (macOS / iOS)

open floor.xcodeproj and build
some notes:
- almost all optional parts of floor are enabled here and you’ll have to install all dependencies or disable them manually
- Homebrew is the recommended way to install additional dependencies:
  /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
- (opt) download OpenVR and manually install it:
  - mkdir -p {/usr/local/include/openvr,/usr/local/lib}
  - cp openvr/headers/* /usr/local/include/openvr/
  - cp openvr/bin/osx32/libopenvr_api.dylib /usr/local/lib/
- command line tools might be necessary, install them with: xcode-select --install
- on iOS, copy dependencies either into your iPhoneOS and iPhoneSimulator SDKs, or into floor/ios/deps/{include,lib}
- iOS linker flags for a depending project: -lSDL3 -lfloor

Visual Studio (Windows / CMake / vcpkg)

install Visual Studio 2022
in "Workloads" select "Desktop development with C++", in "Individual components" search for and select all clang packages
install and wait
install Vulkan SDK
install vcpkg (somewhere, not within libfloor):
- git clone https://github.com/Microsoft/vcpkg.git
- cd vcpkg
- .\bootstrap-vcpkg.bat -disableMetrics
- .\vcpkg integrate install
install vcpkg packages:
- .\vcpkg --triplet x64-windows install sdl3 OpenCL vulkan openvr openxr-loader
add a user (or system) environment variable VCPKG_ROOT that points to the vcpkg folder
in Visual Studio: Tools → Options → search for vcpkg and set the custom vcpkg.exe path
in Visual Studio: open folder floor (wait a little until build files are generated)
select Debug or Release configuration and build
NOTE: all dependencies (optional parts) are enabled here
NOTE: having other build environments/systems in PATH (e.g. MSYS2/MinGW) may result in install/build issues

Installation

Installation (Unix / macOS)

sudo mkdir -p /opt/floor/include
sudo ln -sf /path/to/floor /opt/floor/include/floor
sudo ln -sf /path/to/floor/bin /opt/floor/lib
alternatively: copy these files/folders there

Installation (Windows)

create a %ProgramFiles%/floor folder (C:/Program Files/floor)
inside this folder:
- create a lib folder
- VS2022:
  - copy everything from bin/ in there (dlls/lib/exp)
- MinGW/MSYS2:
  - copy libfloor_static.a/libfloord_static.a there
- create an include folder and copy the original floor folder in there (containing all floor source code)

Compute/Graphics Toolchain

automated builds for Linux, macOS and Windows can be found at: https://libfloor.org/builds/toolchain
NOTE: compiling the toolchain yourself requires a Unix environment with all LLVM build dependencies installed - use MSYS2 on Windows
NOTE: the absolute build path must not contain spaces
compile the toolchain:
- cd floor/etc/llvm140/ && ./build.sh
- if successful, package it (in addition to a .zip file, this also creates a folder with all necessary binaries and include files): ./pkg.sh
install the toolchain:
- Unix:
  - automatic:
    
    development: run ./deploy_dev.sh from the floor/etc/llvm140/ folder (this will create symlinks to everything in floor and floor/etc/llvm140)
    
    release: run ./deploy_pkg.sh from inside the toolchain package folder (floor/etc/llvm140/toolchain_140006_*; this will copy everything)
  - manual:
    
    copy the toolchain folder as toolchain to /opt/floor/ (should then be /opt/floor/toolchain/{bin,clang,libcxx})
    
    inside /opt/floor/toolchain, add a symlink to the floor include folder: sudo ln -sf ../include floor
- Windows:
  - copy the toolchain folder as toolchain to %ProgramFiles%/floor (should then be %ProgramFiles%/floor/toolchain/{bin,clang,libcxx})
  - inside %ProgramFiles%/floor/toolchain, copy the floor folder from the include folder above it into this folder
NOTE: this is the expected default setup - paths can be changed inside config.json (toolchain.generic.paths)

Misc Hints

when using X11 forwarding, set this env variable:
- export SDL_VIDEO_X11_NODIRECTCOLOR=yes
depending on how your Linux distribution handles OpenCL headers and library, you might need to manually install OpenCL 1.2+ compatible ones
Host-Compute device execution requires locked/pinned memory, which may be very limited in default Linux configurations (usually 64KiB)
- libfloor will try to increase the limit to 32MiB per logical CPU core, but this may fail if the max limit is too low
- to increase the max limit, /etc/security/limits.conf must be modified
- as a simple workaround, add the following line to it (replace user_name with your user name) and relog:
  - user_name hard memlock unlimited
- NOTE: when using ssh, PAM must be enabled for this to apply
depending on your Vulkan implementation, you may also need to increase the maximum number of open files (usual default is 1024 files)
- libfloor will try to increase the limit to 256 files per logical CPU core, but this may fail if the max limit is too low
- to increase the max limit, /etc/security/limits.conf must be modified
- as a simple workaround, add the following line to it (replace user_name with your user name) and relog:
  - user_name hard nofile unlimited
- NOTE: when using ssh, PAM must be enabled for this to apply

Projects and Examples using libfloor

floor_examples (dnn, nbody, warp, hlbvh, path tracer, other)
libwarp (image-space warping library)
obsolete: oclraster (Flexible Rasterizer in OpenCL)
obsolete: a2elight (Albion 2 Engine)
obsolete: unibot (IRC bot)

a2flo / floor