| Loop Id: 98 | Module: exec | Source: timestep.c:107-116 | Coverage: 0.01% |
|---|
| Loop Id: 98 | Module: exec | Source: timestep.c:107-116 | Coverage: 0.01% |
|---|
0x411d00 MOV %RCX,%RBX |
0x411d03 NOPW %CS:(%RAX,%RAX,1) |
0x411d10 ADD $0x40,%ESI |
0x411d13 CMP %R12,%RDI |
0x411d16 LEA 0x1(%RDI),%RDI |
0x411d1a JE 411c0b |
0x411d20 LEA (%RDI,%RAX,1),%RDX |
0x411d24 MOV (%RBX,%RDX,4),%R14D |
0x411d28 TEST %R14D,%R14D |
0x411d2b JLE 411d10 |
0x411d2d LEA (%RAX,%RDI,1),%R10D |
0x411d31 SAL $0x6,%R10D |
0x411d35 MOV -0x58(%RBP),%R8 |
0x411d39 MOV 0x20(%R8),%RDX |
0x411d3d MOV 0x28(%R8),%R8 |
0x411d41 MOV 0x10(%RDX),%R11 |
0x411d45 MOV 0x20(%RDX),%R9 |
0x411d49 MOV %R14D,%R13D |
0x411d4c VPBROADCASTQ %R10,%ZMM10 |
0x411d52 AND $-0x8,%R13D |
0x411d56 JE 411e80 |
0x411d5c MOV %RBX,%RCX |
0x411d5f MOV %ESI,%R15D |
0x411d62 SAL $0x2,%R15 |
0x411d66 LEA -0x1(%R13),%EBX |
0x411d6a VPADDQ %ZMM4,%ZMM10,%ZMM12 |
0x411d70 ADD %R11,%R15 |
0x411d73 VXORPD %XMM11,%XMM11,%XMM11 |
0x411d78 XOR %EDX,%EDX |
0x411d7a NOPW (%RAX,%RAX,1) |
(99) 0x411d80 VPBROADCASTQ %RDX,%ZMM13 |
(99) 0x411d86 VPADDQ %ZMM13,%ZMM12,%ZMM13 |
(99) 0x411d8c VPSLLQ $0x3,%ZMM13,%ZMM14 |
(99) 0x411d93 VPSLLQ $0x4,%ZMM13,%ZMM13 |
(99) 0x411d9a VPADDQ %ZMM14,%ZMM13,%ZMM13 |
(99) 0x411da0 KXNORW %K0,%K0,%K1 |
(99) 0x411da4 VPXOR %XMM14,%XMM14,%XMM14 |
(99) 0x411da9 VGATHERQPD (%R9,%ZMM13,1),%ZMM14{%K1} |
(99) 0x411db0 KXNORW %K0,%K0,%K1 |
(99) 0x411db4 VXORPD %XMM15,%XMM15,%XMM15 |
(99) 0x411db9 VGATHERQPD 0x8(%R9,%ZMM13,1),%ZMM15{%K1} |
(99) 0x411dc1 KXNORW %K0,%K0,%K1 |
(99) 0x411dc5 VXORPD %XMM16,%XMM16,%XMM16 |
(99) 0x411dcb VGATHERQPD 0x10(%R9,%ZMM13,1),%ZMM16{%K1} |
(99) 0x411dd3 VPMOVSXDQ (%R15,%RDX,4),%ZMM13 |
(99) 0x411dda VMULPD %ZMM14,%ZMM14,%ZMM14 |
(99) 0x411de0 VFMADD213PD %ZMM14,%ZMM15,%ZMM15 |
(99) 0x411de6 VFMADD213PD %ZMM15,%ZMM16,%ZMM16 |
(99) 0x411dec VMULPD %ZMM3,%ZMM16,%ZMM14 |
(99) 0x411df2 VPSLLQ $0x4,%ZMM13,%ZMM13 |
(99) 0x411df9 KXNORW %K0,%K0,%K1 |
(99) 0x411dfd VXORPD %XMM15,%XMM15,%XMM15 |
(99) 0x411e02 VGATHERQPD 0x8(%R8,%ZMM13,1),%ZMM15{%K1} |
(99) 0x411e0a VDIVPD %ZMM15,%ZMM14,%ZMM13 |
(99) 0x411e10 VADDPD %ZMM11,%ZMM13,%ZMM11 |
(99) 0x411e16 ADD $0x8,%RDX |
(99) 0x411e1a CMP %EBX,%EDX |
(99) 0x411e1c JLE 411d80 |
0x411e22 VEXTRACTF64X4 $0x1,%ZMM11,%YMM12 |
0x411e29 VADDPD %ZMM12,%ZMM11,%ZMM11 |
0x411e2f VMOVAPD %XMM11,%XMM12 |
0x411e34 VEXTRACTF128 $0x1,%YMM11,%XMM11 |
0x411e3a VADDPD %XMM11,%XMM12,%XMM11 |
0x411e3f VSHUFPD $0x1,%XMM11,%XMM11,%XMM12 |
0x411e45 VADDSD %XMM12,%XMM11,%XMM11 |
0x411e4a VADDSD %XMM0,%XMM11,%XMM0 |
0x411e4e CMP %R13D,%R14D |
0x411e51 JE 411d00 |
0x411e57 VPBROADCASTD %R14D,%YMM11 |
0x411e5d MOV %RCX,%RBX |
0x411e60 JMP 411e89 |
0x411e80 VPBROADCASTD %R14D,%YMM11 |
0x411e86 XOR %R13D,%R13D |
0x411e89 VPBROADCASTD %R13D,%YMM12 |
0x411e8f VPSUBD %YMM12,%YMM11,%YMM11 |
0x411e94 VPCMPNLEUD %YMM1,%YMM11,%K1 |
0x411e9b KORTESTB %K1,%K1 |
0x411e9f JE 411f80 |
0x411ea5 VPCMPNLEUD %YMM2,%YMM11,%K2 |
0x411eac VPADDD %YMM2,%YMM12,%YMM11 |
0x411eb0 VPMOVSXDQ %YMM11,%ZMM11 |
0x411eb6 VPADDQ %ZMM11,%ZMM10,%ZMM10 |
0x411ebc VPSLLQ $0x3,%ZMM10,%ZMM11 |
0x411ec3 VPSLLQ $0x4,%ZMM10,%ZMM10 |
0x411eca VPADDQ %ZMM11,%ZMM10,%ZMM10 |
0x411ed0 VPXOR %XMM11,%XMM11,%XMM11 |
0x411ed5 KMOVQ %K2,%K3 |
0x411eda VGATHERQPD (%R9,%ZMM10,1),%ZMM11{%K3} |
0x411ee1 MOVSXD %R13D,%RDX |
0x411ee4 ADD %RDX,%R10 |
0x411ee7 VMOVDQU32 (%R11,%R10,4),%YMM12{%K2}{z} |
0x411eee VMOVDQA32 %YMM12,%YMM9{%K2} |
0x411ef4 VMOVAPD %ZMM11,%ZMM8{%K2} |
0x411efa VXORPD %XMM11,%XMM11,%XMM11 |
0x411eff KMOVQ %K2,%K3 |
0x411f04 VGATHERQPD 0x8(%R9,%ZMM10,1),%ZMM11{%K3} |
0x411f0c VPXOR %XMM12,%XMM12,%XMM12 |
0x411f11 KMOVQ %K2,%K3 |
0x411f16 VGATHERQPD 0x10(%R9,%ZMM10,1),%ZMM12{%K3} |
0x411f1e VMULPD %ZMM8,%ZMM8,%ZMM10 |
0x411f24 VMOVAPD %ZMM11,%ZMM7{%K2} |
0x411f2a VFMADD231PD %ZMM7,%ZMM7,%ZMM10 |
0x411f30 VMOVAPD %ZMM12,%ZMM6{%K2} |
0x411f36 VPMOVSXDQ %YMM9,%ZMM11 |
0x411f3c VPSLLQ $0x4,%ZMM11,%ZMM11 |
0x411f43 VXORPD %XMM12,%XMM12,%XMM12 |
0x411f48 KMOVQ %K2,%K3 |
0x411f4d VGATHERQPD 0x8(%R8,%ZMM11,1),%ZMM12{%K3} |
0x411f55 VFMADD231PD %ZMM6,%ZMM6,%ZMM10 |
0x411f5b VMULPD %ZMM3,%ZMM10,%ZMM10 |
0x411f61 VMOVAPD %ZMM12,%ZMM5{%K2} |
0x411f67 VDIVPD %ZMM5,%ZMM10,%ZMM10 |
0x411f6d JMP 411f85 |
0x411f80 VPXOR %XMM10,%XMM10,%XMM10 |
0x411f85 VMOVAPD %ZMM10,%ZMM10{%K1}{z} |
0x411f8b VEXTRACTF64X4 $0x1,%ZMM10,%YMM11 |
0x411f92 VADDPD %ZMM11,%ZMM10,%ZMM10 |
0x411f98 VMOVAPD %XMM10,%XMM11 |
0x411f9d VEXTRACTF128 $0x1,%YMM10,%XMM10 |
0x411fa3 VADDPD %XMM10,%XMM11,%XMM10 |
0x411fa8 VSHUFPD $0x1,%XMM10,%XMM10,%XMM11 |
0x411fae VADDSD %XMM11,%XMM10,%XMM10 |
0x411fb3 VADDSD %XMM0,%XMM10,%XMM0 |
0x411fb7 JMP 411d10 |
/home/eoseret/qaas_runs_CPU_9468/171-148-3214/intel/CoMD/build/CoMD/CoMD/src-openmp/timestep.c: 107 - 116 |
-------------------------------------------------------------------------------- |
107: #pragma omp parallel for reduction(+:kenergy) |
108: for (int iBox=0; iBox<s->boxes->nLocalBoxes; iBox++) |
109: { |
110: for (int iOff=MAXATOMS*iBox,ii=0; ii<s->boxes->nAtoms[iBox]; ii++,iOff++) |
111: { |
112: int iSpecies = s->atoms->iSpecies[iOff]; |
113: real_t invMass = 0.5/s->species[iSpecies].mass; |
114: kenergy += ( s->atoms->p[iOff][0] * s->atoms->p[iOff][0] + |
115: s->atoms->p[iOff][1] * s->atoms->p[iOff][1] + |
116: s->atoms->p[iOff][2] * s->atoms->p[iOff][2] )*invMass; |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.14 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 1.17 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.12 |
| Bottlenecks | P0, P5, |
| Function | kineticEnergy.extracted |
| Source | timestep.c:107-116 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 21.00 |
| CQA cycles if no scalar integer | 18.50 |
| CQA cycles if FP arith vectorized | 21.00 |
| CQA cycles if fully vectorized | 18.00 |
| Front-end cycles | 18.67 |
| DIV/SQRT cycles | 16.00 |
| P0 cycles | 21.00 |
| P1 cycles | 10.00 |
| P2 cycles | 13.00 |
| P3 cycles | 13.00 |
| P4 cycles | 0.00 |
| P5 cycles | 21.00 |
| P6 cycles | 8.60 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 0.00 |
| P10 cycles | 8.40 |
| P11 cycles | 13.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | 23.10 - 48.78 |
| Stall cycles (UFS) | 4.95 - 30.60 |
| Nb insns | 96.00 |
| Nb uops | 112.00 |
| Nb loads | 11.00 |
| Nb stores | 0.00 |
| Nb stack references | 1.00 |
| FLOP/cycle | 3.81 |
| Nb FLOP add-sub | 24.00 |
| Nb FLOP mul | 16.00 |
| Nb FLOP fma | 16.00 |
| Nb FLOP div | 8.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 15.81 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 332.00 |
| Bytes stored | 0.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 73.02 |
| Vectorization ratio load | 83.33 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | 100.00 |
| Vectorization ratio add_sub | 60.00 |
| Vectorization ratio fma | 100.00 |
| Vectorization ratio div_sqrt | 100.00 |
| Vectorization ratio other | 75.61 |
| Vector-efficiency ratio all | 50.99 |
| Vector-efficiency ratio load | 76.04 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | 100.00 |
| Vector-efficiency ratio add_sub | 47.92 |
| Vector-efficiency ratio fma | 100.00 |
| Vector-efficiency ratio div_sqrt | 100.00 |
| Vector-efficiency ratio other | 47.26 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.14 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 1.17 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.12 |
| Bottlenecks | P0, P5, |
| Function | kineticEnergy.extracted |
| Source | timestep.c:107-116 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 21.00 |
| CQA cycles if no scalar integer | 18.50 |
| CQA cycles if FP arith vectorized | 21.00 |
| CQA cycles if fully vectorized | 18.00 |
| Front-end cycles | 18.67 |
| DIV/SQRT cycles | 16.00 |
| P0 cycles | 21.00 |
| P1 cycles | 10.00 |
| P2 cycles | 13.00 |
| P3 cycles | 13.00 |
| P4 cycles | 0.00 |
| P5 cycles | 21.00 |
| P6 cycles | 8.60 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 0.00 |
| P10 cycles | 8.40 |
| P11 cycles | 13.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | 23.10 - 48.78 |
| Stall cycles (UFS) | 4.95 - 30.60 |
| Nb insns | 96.00 |
| Nb uops | 112.00 |
| Nb loads | 11.00 |
| Nb stores | 0.00 |
| Nb stack references | 1.00 |
| FLOP/cycle | 3.81 |
| Nb FLOP add-sub | 24.00 |
| Nb FLOP mul | 16.00 |
| Nb FLOP fma | 16.00 |
| Nb FLOP div | 8.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 15.81 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 332.00 |
| Bytes stored | 0.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 73.02 |
| Vectorization ratio load | 83.33 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | 100.00 |
| Vectorization ratio add_sub | 60.00 |
| Vectorization ratio fma | 100.00 |
| Vectorization ratio div_sqrt | 100.00 |
| Vectorization ratio other | 75.61 |
| Vector-efficiency ratio all | 50.99 |
| Vector-efficiency ratio load | 76.04 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | 100.00 |
| Vector-efficiency ratio add_sub | 47.92 |
| Vector-efficiency ratio fma | 100.00 |
| Vector-efficiency ratio div_sqrt | 100.00 |
| Vector-efficiency ratio other | 47.26 |
| Path / |
| Function | kineticEnergy.extracted |
| Source file and lines | timestep.c:107-116 |
| Module | exec |
| nb instructions | 96 |
| nb uops | 112 |
| loop length | 491 |
| used x86 registers | 15 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 6 |
| used zmm registers | 9 |
| nb stack references | 1 |
| ADD-SUB / MUL ratio | 4.00 |
| micro-operation queue | 18.67 cycles |
| front end | 18.67 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 21.00 | 10.00 | 13.00 | 13.00 | 0.00 | 21.00 | 8.60 | 0.00 | 0.00 | 0.00 | 8.40 | 13.00 |
| cycles | 21.00 | 10.00 | 13.00 | 13.00 | 0.00 | 21.00 | 8.60 | 0.00 | 0.00 | 0.00 | 8.40 | 13.00 |
| Cycles executing div or sqrt instructions | 16.00 |
| FE+BE cycles | 23.10-48.78 |
| Stall cycles | 4.94-30.60 |
| ROB full (events) | 5.47-32.82 |
| Front-end | 18.67 |
| Dispatch | 21.00 |
| DIV/SQRT | 16.00 |
| Overall L1 | 21.00 |
| all | 56% |
| load | 50% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 71% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 52% |
| all | 87% |
| load | 100% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 100% |
| add-sub | 50% |
| fma | 100% |
| div/sqrt | 100% |
| other | 100% |
| all | 73% |
| load | 83% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 100% |
| add-sub | 60% |
| fma | 100% |
| div/sqrt | 100% |
| other | 75% |
| all | 39% |
| load | 28% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 59% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 33% |
| all | 61% |
| load | 100% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 100% |
| add-sub | 37% |
| fma | 100% |
| div/sqrt | 100% |
| other | 61% |
| all | 50% |
| load | 76% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 100% |
| add-sub | 47% |
| fma | 100% |
| div/sqrt | 100% |
| other | 47% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV %RCX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| ADD $0x40,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| CMP %R12,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| LEA 0x1(%RDI),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| JE 411c0b <kineticEnergy.extracted+0x7b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| LEA (%RDI,%RAX,1),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| MOV (%RBX,%RDX,4),%R14D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| TEST %R14D,%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
| JLE 411d10 <kineticEnergy.extracted+0x180> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| LEA (%RAX,%RDI,1),%R10D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| SAL $0x6,%R10D | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
| MOV -0x58(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x20(%R8),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x28(%R8),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x10(%RDX),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x20(%RDX),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %R14D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| VPBROADCASTQ %R10,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| AND $-0x8,%R13D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| JE 411e80 <kineticEnergy.extracted+0x2f0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV %RBX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOV %ESI,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| SAL $0x2,%R15 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
| LEA -0x1(%R13),%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| VPADDQ %ZMM4,%ZMM10,%ZMM12 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| ADD %R11,%R15 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VXORPD %XMM11,%XMM11,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| VEXTRACTF64X4 $0x1,%ZMM11,%YMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VADDPD %ZMM12,%ZMM11,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VMOVAPD %XMM11,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VEXTRACTF128 $0x1,%YMM11,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VADDPD %XMM11,%XMM12,%XMM11 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VSHUFPD $0x1,%XMM11,%XMM11,%XMM12 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VADDSD %XMM12,%XMM11,%XMM11 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VADDSD %XMM0,%XMM11,%XMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| CMP %R13D,%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| JE 411d00 <kineticEnergy.extracted+0x170> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| VPBROADCASTD %R14D,%YMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %RCX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| JMP 411e89 <kineticEnergy.extracted+0x2f9> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
| VPBROADCASTD %R14D,%YMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| XOR %R13D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| VPBROADCASTD %R13D,%YMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPSUBD %YMM12,%YMM11,%YMM11 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
| VPCMPNLEUD %YMM1,%YMM11,%K1 | |||||||||||||||
| KORTESTB %K1,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| JE 411f80 <kineticEnergy.extracted+0x3f0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| VPCMPNLEUD %YMM2,%YMM11,%K2 | |||||||||||||||
| VPADDD %YMM2,%YMM12,%YMM11 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VPMOVSXDQ %YMM11,%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPADDQ %ZMM11,%ZMM10,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VPSLLQ $0x3,%ZMM10,%ZMM11 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
| VPSLLQ $0x4,%ZMM10,%ZMM10 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
| VPADDQ %ZMM11,%ZMM10,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VPXOR %XMM11,%XMM11,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| KMOVQ %K2,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| VGATHERQPD (%R9,%ZMM10,1),%ZMM11{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
| MOVSXD %R13D,%RDX | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| ADD %RDX,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VMOVDQU32 (%R11,%R10,4),%YMM12{%K2}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
| VMOVDQA32 %YMM12,%YMM9{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VMOVAPD %ZMM11,%ZMM8{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VXORPD %XMM11,%XMM11,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| KMOVQ %K2,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| VGATHERQPD 0x8(%R9,%ZMM10,1),%ZMM11{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
| VPXOR %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| KMOVQ %K2,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| VGATHERQPD 0x10(%R9,%ZMM10,1),%ZMM12{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
| VMULPD %ZMM8,%ZMM8,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMOVAPD %ZMM11,%ZMM7{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VFMADD231PD %ZMM7,%ZMM7,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMOVAPD %ZMM12,%ZMM6{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VPMOVSXDQ %YMM9,%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPSLLQ $0x4,%ZMM11,%ZMM11 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
| VXORPD %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| KMOVQ %K2,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| VGATHERQPD 0x8(%R8,%ZMM11,1),%ZMM12{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
| VFMADD231PD %ZMM6,%ZMM6,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMULPD %ZMM3,%ZMM10,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMOVAPD %ZMM12,%ZMM5{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VDIVPD %ZMM5,%ZMM10,%ZMM10 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
| JMP 411f85 <kineticEnergy.extracted+0x3f5> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
| VPXOR %XMM10,%XMM10,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| VMOVAPD %ZMM10,%ZMM10{%K1}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VEXTRACTF64X4 $0x1,%ZMM10,%YMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VADDPD %ZMM11,%ZMM10,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VMOVAPD %XMM10,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VEXTRACTF128 $0x1,%YMM10,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VADDPD %XMM10,%XMM11,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VSHUFPD $0x1,%XMM10,%XMM10,%XMM11 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VADDSD %XMM11,%XMM10,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VADDSD %XMM0,%XMM10,%XMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| JMP 411d10 <kineticEnergy.extracted+0x180> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
| Function | kineticEnergy.extracted |
| Source file and lines | timestep.c:107-116 |
| Module | exec |
| nb instructions | 96 |
| nb uops | 112 |
| loop length | 491 |
| used x86 registers | 15 |
| used mmx registers | 0 |
| used xmm registers | 4 |
| used ymm registers | 6 |
| used zmm registers | 9 |
| nb stack references | 1 |
| ADD-SUB / MUL ratio | 4.00 |
| micro-operation queue | 18.67 cycles |
| front end | 18.67 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 21.00 | 10.00 | 13.00 | 13.00 | 0.00 | 21.00 | 8.60 | 0.00 | 0.00 | 0.00 | 8.40 | 13.00 |
| cycles | 21.00 | 10.00 | 13.00 | 13.00 | 0.00 | 21.00 | 8.60 | 0.00 | 0.00 | 0.00 | 8.40 | 13.00 |
| Cycles executing div or sqrt instructions | 16.00 |
| FE+BE cycles | 23.10-48.78 |
| Stall cycles | 4.94-30.60 |
| ROB full (events) | 5.47-32.82 |
| Front-end | 18.67 |
| Dispatch | 21.00 |
| DIV/SQRT | 16.00 |
| Overall L1 | 21.00 |
| all | 56% |
| load | 50% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 71% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 52% |
| all | 87% |
| load | 100% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 100% |
| add-sub | 50% |
| fma | 100% |
| div/sqrt | 100% |
| other | 100% |
| all | 73% |
| load | 83% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 100% |
| add-sub | 60% |
| fma | 100% |
| div/sqrt | 100% |
| other | 75% |
| all | 39% |
| load | 28% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 59% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 33% |
| all | 61% |
| load | 100% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 100% |
| add-sub | 37% |
| fma | 100% |
| div/sqrt | 100% |
| other | 61% |
| all | 50% |
| load | 76% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 100% |
| add-sub | 47% |
| fma | 100% |
| div/sqrt | 100% |
| other | 47% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV %RCX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| ADD $0x40,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| CMP %R12,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| LEA 0x1(%RDI),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| JE 411c0b <kineticEnergy.extracted+0x7b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| LEA (%RDI,%RAX,1),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| MOV (%RBX,%RDX,4),%R14D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| TEST %R14D,%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
| JLE 411d10 <kineticEnergy.extracted+0x180> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| LEA (%RAX,%RDI,1),%R10D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| SAL $0x6,%R10D | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
| MOV -0x58(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x20(%R8),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x28(%R8),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x10(%RDX),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x20(%RDX),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %R14D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| VPBROADCASTQ %R10,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| AND $-0x8,%R13D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| JE 411e80 <kineticEnergy.extracted+0x2f0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV %RBX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOV %ESI,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| SAL $0x2,%R15 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
| LEA -0x1(%R13),%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| VPADDQ %ZMM4,%ZMM10,%ZMM12 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| ADD %R11,%R15 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VXORPD %XMM11,%XMM11,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| VEXTRACTF64X4 $0x1,%ZMM11,%YMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VADDPD %ZMM12,%ZMM11,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VMOVAPD %XMM11,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VEXTRACTF128 $0x1,%YMM11,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VADDPD %XMM11,%XMM12,%XMM11 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VSHUFPD $0x1,%XMM11,%XMM11,%XMM12 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VADDSD %XMM12,%XMM11,%XMM11 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VADDSD %XMM0,%XMM11,%XMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| CMP %R13D,%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| JE 411d00 <kineticEnergy.extracted+0x170> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| VPBROADCASTD %R14D,%YMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %RCX,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| JMP 411e89 <kineticEnergy.extracted+0x2f9> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
| VPBROADCASTD %R14D,%YMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| XOR %R13D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| VPBROADCASTD %R13D,%YMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPSUBD %YMM12,%YMM11,%YMM11 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
| VPCMPNLEUD %YMM1,%YMM11,%K1 | |||||||||||||||
| KORTESTB %K1,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| JE 411f80 <kineticEnergy.extracted+0x3f0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| VPCMPNLEUD %YMM2,%YMM11,%K2 | |||||||||||||||
| VPADDD %YMM2,%YMM12,%YMM11 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VPMOVSXDQ %YMM11,%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPADDQ %ZMM11,%ZMM10,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VPSLLQ $0x3,%ZMM10,%ZMM11 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
| VPSLLQ $0x4,%ZMM10,%ZMM10 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
| VPADDQ %ZMM11,%ZMM10,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VPXOR %XMM11,%XMM11,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| KMOVQ %K2,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| VGATHERQPD (%R9,%ZMM10,1),%ZMM11{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
| MOVSXD %R13D,%RDX | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| ADD %RDX,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VMOVDQU32 (%R11,%R10,4),%YMM12{%K2}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
| VMOVDQA32 %YMM12,%YMM9{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VMOVAPD %ZMM11,%ZMM8{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VXORPD %XMM11,%XMM11,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| KMOVQ %K2,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| VGATHERQPD 0x8(%R9,%ZMM10,1),%ZMM11{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
| VPXOR %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| KMOVQ %K2,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| VGATHERQPD 0x10(%R9,%ZMM10,1),%ZMM12{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
| VMULPD %ZMM8,%ZMM8,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMOVAPD %ZMM11,%ZMM7{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VFMADD231PD %ZMM7,%ZMM7,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMOVAPD %ZMM12,%ZMM6{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VPMOVSXDQ %YMM9,%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPSLLQ $0x4,%ZMM11,%ZMM11 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 1 |
| VXORPD %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| KMOVQ %K2,%K3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| VGATHERQPD 0x8(%R8,%ZMM11,1),%ZMM12{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
| VFMADD231PD %ZMM6,%ZMM6,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMULPD %ZMM3,%ZMM10,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMOVAPD %ZMM12,%ZMM5{%K2} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VDIVPD %ZMM5,%ZMM10,%ZMM10 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
| JMP 411f85 <kineticEnergy.extracted+0x3f5> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
| VPXOR %XMM10,%XMM10,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| VMOVAPD %ZMM10,%ZMM10{%K1}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VEXTRACTF64X4 $0x1,%ZMM10,%YMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VADDPD %ZMM11,%ZMM10,%ZMM10 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VMOVAPD %XMM10,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VEXTRACTF128 $0x1,%YMM10,%XMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VADDPD %XMM10,%XMM11,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VSHUFPD $0x1,%XMM10,%XMM10,%XMM11 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VADDSD %XMM11,%XMM10,%XMM10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VADDSD %XMM0,%XMM10,%XMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| JMP 411d10 <kineticEnergy.extracted+0x180> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
