| Loop Id: 850 | Module: libkripke.so | Source: forall.hpp:59-59 [...] | Coverage: 0.01% |
|---|
| Loop Id: 850 | Module: libkripke.so | Source: forall.hpp:59-59 [...] | Coverage: 0.01% |
|---|
0x4b250 ADD -0x38(%RBP),%R14 |
0x4b254 CMP -0x128(%RBP),%RDI |
0x4b25b LEA 0x1(%RDI),%RDI |
0x4b25f JE 4b170 |
0x4b265 MOV %RSI,%RCX |
0x4b268 IMUL %RDI,%RCX |
0x4b26c ADD %RBX,%RCX |
0x4b26f ADD -0x80(%RBP),%RCX |
0x4b273 VMOVSD (%R13,%RCX,8),%XMM0 |
0x4b27a MOV %R8,%R9 |
0x4b27d AND $-0x4,%R9 |
0x4b281 JE 4b2c0 |
0x4b283 LEA -0x1(%R9),%RCX |
0x4b287 VBROADCASTSD %XMM0,%YMM1 |
0x4b28c XOR %ESI,%ESI |
0x4b28e XCHG %AX,%AX |
(851) 0x4b290 VMOVUPD (%R14,%RSI,8),%YMM2 |
(851) 0x4b296 VFMADD213PD (%RDX,%RSI,8),%YMM1,%YMM2 |
(851) 0x4b29c VMOVUPD %YMM2,(%RDX,%RSI,8) |
(851) 0x4b2a1 ADD $0x4,%RSI |
(851) 0x4b2a5 CMP %RCX,%RSI |
(851) 0x4b2a8 JLE 4b290 |
0x4b2aa CMP %R9,%R8 |
0x4b2ad MOV -0x40(%RBP),%RSI |
0x4b2b1 MOV -0x48(%RBP),%RCX |
0x4b2b5 JNE 4b2d0 |
0x4b2b7 JMP 4b250 |
0x4b2c0 XOR %R9D,%R9D |
0x4b2c3 MOV -0x48(%RBP),%RCX |
0x4b2c7 NOPW (%RAX,%RAX,1) |
(849) 0x4b2d0 VMOVSD (%R14,%R9,8),%XMM1 |
(849) 0x4b2d6 VFMADD213SD (%RDX,%R9,8),%XMM0,%XMM1 |
(849) 0x4b2dc VMOVSD %XMM1,(%RDX,%R9,8) |
(849) 0x4b2e2 INC %R9 |
(849) 0x4b2e5 CMP %R9,%RCX |
(849) 0x4b2e8 JNE 4b2d0 |
0x4b2ea JMP 4b250 |
/scratch_na/users/xoserete/qaas_runs/171-319-6990/intel/Kripke/build/Kripke/tpl/raja/include/RAJA/policy/openmp/kernel/Collapse.hpp: 81 - 81 |
-------------------------------------------------------------------------------- |
81: #pragma omp parallel for private(i0, i1) firstprivate(privatizer) \ |
/scratch_na/users/xoserete/qaas_runs/171-319-6990/intel/Kripke/build/Kripke/src/Kripke/Kernel/LTimes.cpp: 62 - 62 |
-------------------------------------------------------------------------------- |
62: phi(nm,g,z) += ell(nm, d) * psi(d, g, z); |
/scratch_na/users/xoserete/qaas_runs/171-319-6990/intel/Kripke/build/Kripke/tpl/raja/include/RAJA/policy/loop/forall.hpp: 59 - 59 |
-------------------------------------------------------------------------------- |
59: for (decltype(distance_it) i = 0; i < distance_it; ++i) { |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.20 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 12.89 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.46 |
| Bottlenecks | |
| Function | void Kripke::DispatchHelper |
| Source | Collapse.hpp:81-81,LTimes.cpp:62-62,forall.hpp:59-59 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 3.22 |
| CQA cycles if no scalar integer | 1.47 |
| CQA cycles if FP arith vectorized | 3.22 |
| CQA cycles if fully vectorized | 0.25 |
| Front-end cycles | 3.22 |
| DIV/SQRT cycles | 2.07 |
| P0 cycles | 2.13 |
| P1 cycles | 1.89 |
| P2 cycles | 1.89 |
| P3 cycles | 0.00 |
| P4 cycles | 2.00 |
| P5 cycles | 1.93 |
| P6 cycles | 0.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 2.00 |
| P10 cycles | 1.89 |
| P11 cycles | 0.00 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | 3.40 - 3.39 |
| Stall cycles (UFS) | 0.00 |
| Nb insns | 19.33 |
| Nb uops | 19.33 |
| Nb loads | 5.67 |
| Nb stores | 0.00 |
| Nb stack references | 4.67 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 14.14 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 45.33 |
| Bytes stored | 0.00 |
| Stride 0 | 1.00 |
| Stride 1 | 1.33 |
| Stride n | 0.00 |
| Stride unknown | 2.33 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 11.46 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 12.50 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 10.59 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.90 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 12.80 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.33 |
| Bottlenecks | micro-operation queue |
| Function | void Kripke::DispatchHelper |
| Source | Collapse.hpp:81-81,LTimes.cpp:62-62,forall.hpp:59-59 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 2.67 |
| CQA cycles if no scalar integer | 1.40 |
| CQA cycles if FP arith vectorized | 2.67 |
| CQA cycles if fully vectorized | 0.21 |
| Front-end cycles | 2.67 |
| DIV/SQRT cycles | 1.60 |
| P0 cycles | 2.00 |
| P1 cycles | 1.67 |
| P2 cycles | 1.67 |
| P3 cycles | 0.00 |
| P4 cycles | 1.60 |
| P5 cycles | 1.60 |
| P6 cycles | 0.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 1.60 |
| P10 cycles | 1.67 |
| P11 cycles | 0.00 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | 2.81 |
| Stall cycles (UFS) | 0.00 |
| Nb insns | 16.00 |
| Nb uops | 16.00 |
| Nb loads | 5.00 |
| Nb stores | 0.00 |
| Nb stack references | 4.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 15.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 40.00 |
| Bytes stored | 0.00 |
| Stride 0 | 1.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 3.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 11.46 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 12.50 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 10.94 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.33 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 12.92 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.52 |
| Bottlenecks | micro-operation queue |
| Function | void Kripke::DispatchHelper |
| Source | Collapse.hpp:81-81,LTimes.cpp:62-62,forall.hpp:59-59 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 3.50 |
| CQA cycles if no scalar integer | 1.50 |
| CQA cycles if FP arith vectorized | 3.50 |
| CQA cycles if fully vectorized | 0.27 |
| Front-end cycles | 3.50 |
| DIV/SQRT cycles | 2.30 |
| P0 cycles | 2.20 |
| P1 cycles | 2.00 |
| P2 cycles | 2.00 |
| P3 cycles | 0.00 |
| P4 cycles | 2.20 |
| P5 cycles | 2.10 |
| P6 cycles | 0.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 2.20 |
| P10 cycles | 2.00 |
| P11 cycles | 0.00 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | 3.70 - 3.68 |
| Stall cycles (UFS) | 0.00 |
| Nb insns | 21.00 |
| Nb uops | 21.00 |
| Nb loads | 6.00 |
| Nb stores | 0.00 |
| Nb stack references | 5.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 13.71 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 48.00 |
| Bytes stored | 0.00 |
| Stride 0 | 1.00 |
| Stride 1 | 2.00 |
| Stride n | 0.00 |
| Stride unknown | 1.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 11.46 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 12.50 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 10.42 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.33 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 12.92 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.52 |
| Bottlenecks | micro-operation queue |
| Function | void Kripke::DispatchHelper |
| Source | Collapse.hpp:81-81,LTimes.cpp:62-62,forall.hpp:59-59 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 3.50 |
| CQA cycles if no scalar integer | 1.50 |
| CQA cycles if FP arith vectorized | 3.50 |
| CQA cycles if fully vectorized | 0.27 |
| Front-end cycles | 3.50 |
| DIV/SQRT cycles | 2.30 |
| P0 cycles | 2.20 |
| P1 cycles | 2.00 |
| P2 cycles | 2.00 |
| P3 cycles | 0.00 |
| P4 cycles | 2.20 |
| P5 cycles | 2.10 |
| P6 cycles | 0.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 2.20 |
| P10 cycles | 2.00 |
| P11 cycles | 0.00 |
| Inter-iter dependencies cycles | 1 |
| FE+BE cycles (UFS) | 3.70 - 3.68 |
| Stall cycles (UFS) | 0.00 |
| Nb insns | 21.00 |
| Nb uops | 21.00 |
| Nb loads | 6.00 |
| Nb stores | 0.00 |
| Nb stack references | 5.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 13.71 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 48.00 |
| Bytes stored | 0.00 |
| Stride 0 | 1.00 |
| Stride 1 | 2.00 |
| Stride n | 0.00 |
| Stride unknown | 3.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 11.46 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 12.50 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 10.42 |
| Path / |
| nb instructions | 19.33 |
| nb uops | 19.33 |
| loop length | 77.67 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 1 |
| used ymm registers | 0.67 |
| used zmm registers | 0 |
| nb stack references | 4.67 |
| micro-operation queue | 3.22 cycles |
| front end | 3.22 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 2.07 | 2.00 | 1.89 | 1.89 | 0.00 | 2.00 | 1.93 | 0.00 | 0.00 | 0.00 | 2.00 | 1.89 |
| cycles | 2.07 | 2.13 | 1.89 | 1.89 | 0.00 | 2.00 | 1.93 | 0.00 | 0.00 | 0.00 | 2.00 | 1.89 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 1.00 |
| FE+BE cycles | 3.40-3.39 |
| Stall cycles | 0.00 |
| Front-end | 3.22 |
| Dispatch | 2.20 |
| Data deps. | 1.00 |
| Overall L1 | 3.22 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 11% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 9% |
| all | 12% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 12% |
| all | 11% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 10% |
| nb instructions | 16 |
| nb uops | 16 |
| loop length | 72 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 1 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 4 |
| micro-operation queue | 2.67 cycles |
| front end | 2.67 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.60 | 1.60 | 1.67 | 1.67 | 0.00 | 1.60 | 1.60 | 0.00 | 0.00 | 0.00 | 1.60 | 1.67 |
| cycles | 1.60 | 2.00 | 1.67 | 1.67 | 0.00 | 1.60 | 1.60 | 0.00 | 0.00 | 0.00 | 1.60 | 1.67 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 1.00 |
| FE+BE cycles | 2.81 |
| Stall cycles | 0.00 |
| Front-end | 2.67 |
| Dispatch | 2.00 |
| Data deps. | 1.00 |
| Overall L1 | 2.67 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | NA (no other vectorizable/vectorized instructions) |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 11% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 10% |
| all | 12% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | NA (no other vectorizable/vectorized instructions) |
| all | 11% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 10% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ADD -0x38(%RBP),%R14 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| CMP -0x128(%RBP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| LEA 0x1(%RDI),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| JE 4b170 <_ZNK6Kripke14DispatchHelperINS_12ArchT_OpenMPEEclINS_11LayoutT_DGZE10LTimesSdomJRNS_6SdomIdERKNS_4Core3SetESB_SB_SB_RNS8_5FieldIdJNS_9DirectionENS_5GroupENS_4ZoneEEEERNSC_IdJNS_6MomentESE_SF_EEERNSC_IdJSI_SD_EEEEEEvT_RKT0_DpOT1_.extracted+0x680> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV %RSI,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| IMUL %RDI,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD %RBX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD -0x80(%RBP),%RCX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| VMOVSD (%R13,%RCX,8),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %R8,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| AND $-0x4,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| JE 4b2c0 <_ZNK6Kripke14DispatchHelperINS_12ArchT_OpenMPEEclINS_11LayoutT_DGZE10LTimesSdomJRNS_6SdomIdERKNS_4Core3SetESB_SB_SB_RNS8_5FieldIdJNS_9DirectionENS_5GroupENS_4ZoneEEEERNSC_IdJNS_6MomentESE_SF_EEERNSC_IdJSI_SD_EEEEEEvT_RKT0_DpOT1_.extracted+0x7d0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| MOV -0x48(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| JMP 4b250 <_ZNK6Kripke14DispatchHelperINS_12ArchT_OpenMPEEclINS_11LayoutT_DGZE10LTimesSdomJRNS_6SdomIdERKNS_4Core3SetESB_SB_SB_RNS8_5FieldIdJNS_9DirectionENS_5GroupENS_4ZoneEEEERNSC_IdJNS_6MomentESE_SF_EEERNSC_IdJSI_SD_EEEEEEvT_RKT0_DpOT1_.extracted+0x760> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
| nb instructions | 21 |
| nb uops | 21 |
| loop length | 79 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 1 |
| used ymm registers | 1 |
| used zmm registers | 0 |
| nb stack references | 5 |
| micro-operation queue | 3.50 cycles |
| front end | 3.50 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 2.30 | 2.20 | 2.00 | 2.00 | 0.00 | 2.20 | 2.10 | 0.00 | 0.00 | 0.00 | 2.20 | 2.00 |
| cycles | 2.30 | 2.20 | 2.00 | 2.00 | 0.00 | 2.20 | 2.10 | 0.00 | 0.00 | 0.00 | 2.20 | 2.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 1.00 |
| FE+BE cycles | 3.70-3.68 |
| Stall cycles | 0.00 |
| Front-end | 3.50 |
| Dispatch | 2.30 |
| Data deps. | 1.00 |
| Overall L1 | 3.50 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 10% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 9% |
| all | 12% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 12% |
| all | 11% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 10% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ADD -0x38(%RBP),%R14 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| CMP -0x128(%RBP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| LEA 0x1(%RDI),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| JE 4b170 <_ZNK6Kripke14DispatchHelperINS_12ArchT_OpenMPEEclINS_11LayoutT_DGZE10LTimesSdomJRNS_6SdomIdERKNS_4Core3SetESB_SB_SB_RNS8_5FieldIdJNS_9DirectionENS_5GroupENS_4ZoneEEEERNSC_IdJNS_6MomentESE_SF_EEERNSC_IdJSI_SD_EEEEEEvT_RKT0_DpOT1_.extracted+0x680> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV %RSI,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| IMUL %RDI,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD %RBX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD -0x80(%RBP),%RCX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| VMOVSD (%R13,%RCX,8),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %R8,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| AND $-0x4,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| JE 4b2c0 <_ZNK6Kripke14DispatchHelperINS_12ArchT_OpenMPEEclINS_11LayoutT_DGZE10LTimesSdomJRNS_6SdomIdERKNS_4Core3SetESB_SB_SB_RNS8_5FieldIdJNS_9DirectionENS_5GroupENS_4ZoneEEEERNSC_IdJNS_6MomentESE_SF_EEERNSC_IdJSI_SD_EEEEEEvT_RKT0_DpOT1_.extracted+0x7d0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| LEA -0x1(%R9),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| VBROADCASTSD %XMM0,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| CMP %R9,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV -0x40(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV -0x48(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| JNE 4b2d0 <_ZNK6Kripke14DispatchHelperINS_12ArchT_OpenMPEEclINS_11LayoutT_DGZE10LTimesSdomJRNS_6SdomIdERKNS_4Core3SetESB_SB_SB_RNS8_5FieldIdJNS_9DirectionENS_5GroupENS_4ZoneEEEERNSC_IdJNS_6MomentESE_SF_EEERNSC_IdJSI_SD_EEEEEEvT_RKT0_DpOT1_.extracted+0x7e0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| JMP 4b250 <_ZNK6Kripke14DispatchHelperINS_12ArchT_OpenMPEEclINS_11LayoutT_DGZE10LTimesSdomJRNS_6SdomIdERKNS_4Core3SetESB_SB_SB_RNS8_5FieldIdJNS_9DirectionENS_5GroupENS_4ZoneEEEERNSC_IdJNS_6MomentESE_SF_EEERNSC_IdJSI_SD_EEEEEEvT_RKT0_DpOT1_.extracted+0x760> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
| nb instructions | 21 |
| nb uops | 21 |
| loop length | 82 |
| used x86 registers | 10 |
| used mmx registers | 0 |
| used xmm registers | 1 |
| used ymm registers | 1 |
| used zmm registers | 0 |
| nb stack references | 5 |
| micro-operation queue | 3.50 cycles |
| front end | 3.50 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 2.30 | 2.20 | 2.00 | 2.00 | 0.00 | 2.20 | 2.10 | 0.00 | 0.00 | 0.00 | 2.20 | 2.00 |
| cycles | 2.30 | 2.20 | 2.00 | 2.00 | 0.00 | 2.20 | 2.10 | 0.00 | 0.00 | 0.00 | 2.20 | 2.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 1.00 |
| FE+BE cycles | 3.70-3.68 |
| Stall cycles | 0.00 |
| Front-end | 3.50 |
| Dispatch | 2.30 |
| Data deps. | 1.00 |
| Overall L1 | 3.50 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 10% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 9% |
| all | 12% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 12% |
| all | 11% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 10% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ADD -0x38(%RBP),%R14 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| CMP -0x128(%RBP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| LEA 0x1(%RDI),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| JE 4b170 <_ZNK6Kripke14DispatchHelperINS_12ArchT_OpenMPEEclINS_11LayoutT_DGZE10LTimesSdomJRNS_6SdomIdERKNS_4Core3SetESB_SB_SB_RNS8_5FieldIdJNS_9DirectionENS_5GroupENS_4ZoneEEEERNSC_IdJNS_6MomentESE_SF_EEERNSC_IdJSI_SD_EEEEEEvT_RKT0_DpOT1_.extracted+0x680> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV %RSI,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| IMUL %RDI,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD %RBX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD -0x80(%RBP),%RCX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| VMOVSD (%R13,%RCX,8),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %R8,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| AND $-0x4,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| JE 4b2c0 <_ZNK6Kripke14DispatchHelperINS_12ArchT_OpenMPEEclINS_11LayoutT_DGZE10LTimesSdomJRNS_6SdomIdERKNS_4Core3SetESB_SB_SB_RNS8_5FieldIdJNS_9DirectionENS_5GroupENS_4ZoneEEEERNSC_IdJNS_6MomentESE_SF_EEERNSC_IdJSI_SD_EEEEEEvT_RKT0_DpOT1_.extracted+0x7d0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| LEA -0x1(%R9),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| VBROADCASTSD %XMM0,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| XOR %ESI,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| CMP %R9,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV -0x40(%RBP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV -0x48(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| JNE 4b2d0 <_ZNK6Kripke14DispatchHelperINS_12ArchT_OpenMPEEclINS_11LayoutT_DGZE10LTimesSdomJRNS_6SdomIdERKNS_4Core3SetESB_SB_SB_RNS8_5FieldIdJNS_9DirectionENS_5GroupENS_4ZoneEEEERNSC_IdJNS_6MomentESE_SF_EEERNSC_IdJSI_SD_EEEEEEvT_RKT0_DpOT1_.extracted+0x7e0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| JMP 4b250 <_ZNK6Kripke14DispatchHelperINS_12ArchT_OpenMPEEclINS_11LayoutT_DGZE10LTimesSdomJRNS_6SdomIdERKNS_4Core3SetESB_SB_SB_RNS8_5FieldIdJNS_9DirectionENS_5GroupENS_4ZoneEEEERNSC_IdJNS_6MomentESE_SF_EEERNSC_IdJSI_SD_EEEEEEvT_RKT0_DpOT1_.extracted+0x760> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
