| Loop Id: 176 | Module: exec | Source: advec_mom_kernel.f90:81-248 [...] | Coverage: 0.01% |
|---|
| Loop Id: 176 | Module: exec | Source: advec_mom_kernel.f90:81-248 [...] | Coverage: 0.01% |
|---|
0x433b80 CLTQ |
0x433b82 MOV 0x40(%RSP),%RCX |
0x433b87 LEA 0x1(%RAX,%RCX,1),%RDX |
0x433b8c SUB 0x2c0(%RSP),%RAX |
0x433b94 IMUL %RAX,%RDI |
0x433b98 IMUL %RAX,%RBX |
0x433b9c IMUL %R12,%RDX |
0x433ba0 IMUL %RAX,%R12 |
0x433ba4 IMUL %RAX,%R10 |
0x433ba8 MOV %R10,%R9 |
0x433bab MOV %RBX,%R11 |
0x433bae MOV %RDI,%RAX |
0x433bb1 XOR %EDI,%EDI |
0x433bb3 VPBROADCASTQ %RDI,%ZMM7 |
0x433bb9 VPSUBQ %ZMM7,%ZMM0,%ZMM7 |
0x433bbf VPCMPNLEUQ %ZMM1,%ZMM7,%K1 |
0x433bc6 ADD 0xe0(%RSP),%RAX |
0x433bce ADD 0x18(%RSP),%RDI |
0x433bd3 SUB 0x98(%RSP),%RDI |
0x433bdb VMOVUPD (%RAX,%RDI,8),%ZMM7{%K1}{z} |
0x433be2 VMOVAPD %ZMM7,%ZMM6{%K1} |
0x433be8 ADD 0x130(%RSP),%R11 |
0x433bf0 VMOVUPD (%R11,%RDI,8),%ZMM7{%K1}{z} |
0x433bf7 VMOVAPD %ZMM7,%ZMM5{%K1} |
0x433bfd MOV 0x148(%RSP),%RAX |
0x433c05 ADD %RAX,%RDX |
0x433c08 VMOVUPD (%RDX,%RDI,8),%ZMM7{%K1}{z} |
0x433c0f VMOVAPD %ZMM7,%ZMM4{%K1} |
0x433c15 VMOVAPD %ZMM5,%ZMM7 |
0x433c1b ADD %RAX,%R12 |
0x433c1e VMOVUPD (%R12,%RDI,8),%ZMM8{%K1}{z} |
0x433c25 VFMADD213PD %ZMM4,%ZMM6,%ZMM7 |
0x433c2b VMOVAPD %ZMM8,%ZMM3{%K1} |
0x433c31 VSUBPD %ZMM3,%ZMM7,%ZMM7 |
0x433c37 ADD 0x108(%RSP),%R9 |
0x433c3f VMOVUPD (%R9,%RDI,8),%ZMM8{%K1}{z} |
0x433c46 VMOVAPD %ZMM8,%ZMM2{%K1} |
0x433c4c VDIVPD %ZMM2,%ZMM7,%ZMM7 |
0x433c52 VMOVUPD %ZMM7,(%R11,%RDI,8){%K1} |
0x433c59 JMP 433c60 |
0x433c60 LEA 0x1(%R15),%EAX |
0x433c64 INC %R13D |
0x433c67 CMP %R14D,%R15D |
0x433c6a MOV %EAX,%R15D |
0x433c6d JE 4339d9 |
0x433c73 CMPL $0,0x80(%RSP) |
0x433c7b JS 433c60 |
0x433c7d MOV 0x78(%RSP),%RAX |
0x433c82 ADD %R15D,%EAX |
0x433c85 MOV 0x220(%RSP),%RCX |
0x433c8d MOV (%RCX),%RBX |
0x433c90 MOV 0x160(%RSP),%RCX |
0x433c98 MOV (%RCX),%RDI |
0x433c9b MOV 0x218(%RSP),%RCX |
0x433ca3 MOV (%RCX),%R12 |
0x433ca6 MOV 0x228(%RSP),%RCX |
0x433cae MOV (%RCX),%R10 |
0x433cb1 TEST %R8,%R8 |
0x433cb4 JE 433b80 |
0x433cba MOVSXD %R13D,%R14 |
0x433cbd MOV 0x20(%RSP),%RCX |
0x433cc2 ADD %R14,%RCX |
0x433cc5 ADD 0x38(%RSP),%R14 |
0x433cca MOVSXD %EAX,%R9 |
0x433ccd MOV 0x40(%RSP),%RAX |
0x433cd2 LEA 0x1(%R9,%RAX,1),%RDX |
0x433cd7 SUB 0x2c0(%RSP),%R9 |
0x433cdf MOV %RDI,%RAX |
0x433ce2 IMUL %R9,%RAX |
0x433ce6 MOV %RBX,%R11 |
0x433ce9 IMUL %R9,%R11 |
0x433ced IMUL %R12,%RDX |
0x433cf1 MOV %R12,%RSI |
0x433cf4 IMUL %R9,%RSI |
0x433cf8 MOV %RSI,0x90(%RSP) |
0x433d00 IMUL %R10,%R9 |
0x433d04 IMUL %RCX,%R10 |
0x433d08 ADD 0x28(%RSP),%R10 |
0x433d0d IMUL %R12,%R14 |
0x433d11 IMUL %RCX,%R12 |
0x433d15 MOV 0x50(%RSP),%RSI |
0x433d1a ADD %RSI,%R12 |
0x433d1d ADD %RSI,%R14 |
0x433d20 IMUL %RCX,%RBX |
0x433d24 ADD 0x30(%RSP),%RBX |
0x433d29 IMUL %RCX,%RDI |
0x433d2d ADD 0x168(%RSP),%RDI |
0x433d35 XOR %ECX,%ECX |
0x433d37 NOPW (%RAX,%RAX,1) |
(177) 0x433d40 VMOVUPD (%RDI,%RCX,8),%ZMM7 |
(177) 0x433d47 VMOVUPD (%RBX,%RCX,8),%ZMM8 |
(177) 0x433d4e VFMADD213PD (%R14,%RCX,8),%ZMM7,%ZMM8 |
(177) 0x433d55 VSUBPD (%R12,%RCX,8),%ZMM8,%ZMM7 |
(177) 0x433d5c VDIVPD (%R10,%RCX,8),%ZMM7,%ZMM7 |
(177) 0x433d63 VMOVUPD %ZMM7,(%RBX,%RCX,8) |
(177) 0x433d6a ADD $0x8,%RCX |
(177) 0x433d6e CMP %R8,%RCX |
(177) 0x433d71 JB 433d40 |
0x433d73 MOV %R8,%RDI |
0x433d76 CMP 0x1e0(%RSP),%R8 |
0x433d7e MOV 0x58(%RSP),%R14D |
0x433d83 MOV 0x90(%RSP),%R12 |
0x433d8b JNE 433bb3 |
0x433d91 JMP 433c60 |
/scratch_na/users/xoserete/qaas_runs/171-415-7919/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/advec_mom_kernel.f90: 81 - 248 |
-------------------------------------------------------------------------------- |
81: IF(mom_sweep.EQ.1)THEN ! x 1 |
[...] |
245: DO k=y_min,y_max+1 |
246: !$OMP SIMD |
247: DO j=x_min,x_max+1 |
248: vel1 (j,k)=(vel1(j,k)*node_mass_pre(j,k)+mom_flux(j,k-1)-mom_flux(j,k))/node_mass_post(j,k) |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | __kmp_invoke_microtask | libiomp5.so | |
| ○ | __kmp_invoke_task_func | libiomp5.so |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 1.43 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.78 |
| Bottlenecks | |
| Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
| Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:245-248 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 11.98 |
| CQA cycles if no scalar integer | 11.97 |
| CQA cycles if FP arith vectorized | 11.98 |
| CQA cycles if fully vectorized | 8.39 |
| Front-end cycles | 8.46 |
| DIV/SQRT cycles | 5.30 |
| P0 cycles | 10.26 |
| P1 cycles | 6.42 |
| P2 cycles | 6.42 |
| P3 cycles | 0.50 |
| P4 cycles | 5.33 |
| P5 cycles | 5.28 |
| P6 cycles | 0.50 |
| P7 cycles | 0.50 |
| P8 cycles | 0.50 |
| P9 cycles | 5.30 |
| P10 cycles | 6.42 |
| P11 cycles | 8.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | 11.39 - 11.41 |
| Stall cycles (UFS) | 2.48 - 2.49 |
| Nb insns | 50.50 |
| Nb uops | 50.75 |
| Nb loads | 19.25 |
| Nb stores | 1.00 |
| Nb stack references | 13.75 |
| FLOP/cycle | 1.34 |
| Nb FLOP add-sub | 4.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 4.00 |
| Nb FLOP div | 4.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 21.22 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 288.00 |
| Bytes stored | 36.00 |
| Stride 0 | 1.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 5.75 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 27.83 |
| Vectorization ratio load | 33.65 |
| Vectorization ratio store | 50.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 23.81 |
| Vectorization ratio fma | 100.00 |
| Vectorization ratio div_sqrt | 100.00 |
| Vectorization ratio other | 29.37 |
| Vector-efficiency ratio all | 34.73 |
| Vector-efficiency ratio load | 41.02 |
| Vector-efficiency ratio store | 56.25 |
| Vector-efficiency ratio mul | 12.50 |
| Vector-efficiency ratio add_sub | 30.80 |
| Vector-efficiency ratio fma | 100.00 |
| Vector-efficiency ratio div_sqrt | 100.00 |
| Vector-efficiency ratio other | 35.67 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 16.00 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.03 |
| Bottlenecks | P0, P1, P5, P6, P10, |
| Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
| Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:245-248 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 1.20 |
| CQA cycles if no scalar integer | 1.20 |
| CQA cycles if FP arith vectorized | 1.20 |
| CQA cycles if fully vectorized | 0.07 |
| Front-end cycles | 1.17 |
| DIV/SQRT cycles | 1.20 |
| P0 cycles | 1.20 |
| P1 cycles | 0.33 |
| P2 cycles | 0.33 |
| P3 cycles | 0.00 |
| P4 cycles | 1.20 |
| P5 cycles | 1.20 |
| P6 cycles | 0.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 1.20 |
| P10 cycles | 0.33 |
| P11 cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | 1.32 - 1.34 |
| Stall cycles (UFS) | 0.00 |
| Nb insns | 7.00 |
| Nb uops | 7.00 |
| Nb loads | 1.00 |
| Nb stores | 0.00 |
| Nb stack references | 1.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 3.33 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 4.00 |
| Bytes stored | 0.00 |
| Stride 0 | 1.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 0.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | NA |
| Vectorization ratio store | NA |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 6.25 |
| Vector-efficiency ratio load | NA |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 6.25 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 6.25 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 1.00 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.63 |
| Bottlenecks | P0, |
| Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
| Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:245-248 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 16.00 |
| CQA cycles if no scalar integer | 16.00 |
| CQA cycles if FP arith vectorized | 16.00 |
| CQA cycles if fully vectorized | 16.00 |
| Front-end cycles | 9.83 |
| DIV/SQRT cycles | 6.00 |
| P0 cycles | 9.10 |
| P1 cycles | 7.67 |
| P2 cycles | 7.67 |
| P3 cycles | 0.50 |
| P4 cycles | 6.00 |
| P5 cycles | 6.00 |
| P6 cycles | 0.50 |
| P7 cycles | 0.50 |
| P8 cycles | 0.50 |
| P9 cycles | 6.00 |
| P10 cycles | 7.67 |
| P11 cycles | 16.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | 16.53 - 16.57 |
| Stall cycles (UFS) | 6.05 - 6.09 |
| Nb insns | 59.00 |
| Nb uops | 59.00 |
| Nb loads | 23.00 |
| Nb stores | 1.00 |
| Nb stack references | 14.00 |
| FLOP/cycle | 2.00 |
| Nb FLOP add-sub | 8.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 8.00 |
| Nb FLOP div | 8.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 32.75 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 460.00 |
| Bytes stored | 64.00 |
| Stride 0 | 1.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 6.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 65.38 |
| Vectorization ratio load | 62.50 |
| Vectorization ratio store | 100.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 66.67 |
| Vectorization ratio fma | 100.00 |
| Vectorization ratio div_sqrt | 100.00 |
| Vectorization ratio other | 63.64 |
| Vector-efficiency ratio all | 68.99 |
| Vector-efficiency ratio load | 66.41 |
| Vector-efficiency ratio store | 100.00 |
| Vector-efficiency ratio mul | 12.50 |
| Vector-efficiency ratio add_sub | 68.75 |
| Vector-efficiency ratio fma | 100.00 |
| Vector-efficiency ratio div_sqrt | 100.00 |
| Vector-efficiency ratio other | 67.05 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 9.75 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.60 |
| Bottlenecks | P1, |
| Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
| Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:245-248 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 14.67 |
| CQA cycles if no scalar integer | 14.67 |
| CQA cycles if FP arith vectorized | 14.67 |
| CQA cycles if fully vectorized | 1.50 |
| Front-end cycles | 9.17 |
| DIV/SQRT cycles | 5.20 |
| P0 cycles | 14.67 |
| P1 cycles | 7.00 |
| P2 cycles | 7.00 |
| P3 cycles | 0.50 |
| P4 cycles | 5.40 |
| P5 cycles | 5.20 |
| P6 cycles | 0.50 |
| P7 cycles | 0.50 |
| P8 cycles | 0.50 |
| P9 cycles | 5.20 |
| P10 cycles | 7.00 |
| P11 cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | 11.18 |
| Stall cycles (UFS) | 1.52 |
| Nb insns | 55.00 |
| Nb uops | 55.00 |
| Nb loads | 21.00 |
| Nb stores | 1.00 |
| Nb stack references | 17.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 11.45 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 160.00 |
| Bytes stored | 8.00 |
| Stride 0 | 1.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 6.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 11.67 |
| Vector-efficiency ratio load | 11.46 |
| Vector-efficiency ratio store | 12.50 |
| Vector-efficiency ratio mul | 12.50 |
| Vector-efficiency ratio add_sub | 11.61 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 10.71 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 1.00 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | NA |
| Bottlenecks | P1, |
| Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
| Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:245-248 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 16.07 |
| CQA cycles if no scalar integer | 16.00 |
| CQA cycles if FP arith vectorized | 16.07 |
| CQA cycles if fully vectorized | 16.00 |
| Front-end cycles | 13.67 |
| DIV/SQRT cycles | 8.80 |
| P0 cycles | 16.07 |
| P1 cycles | 10.67 |
| P2 cycles | 10.67 |
| P3 cycles | 1.00 |
| P4 cycles | 8.70 |
| P5 cycles | 8.70 |
| P6 cycles | 1.00 |
| P7 cycles | 1.00 |
| P8 cycles | 1.00 |
| P9 cycles | 8.80 |
| P10 cycles | 10.67 |
| P11 cycles | 16.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | 16.54 - 16.56 |
| Stall cycles (UFS) | 2.33 - 2.35 |
| Nb insns | 81.00 |
| Nb uops | 82.00 |
| Nb loads | 32.00 |
| Nb stores | 2.00 |
| Nb stack references | 23.00 |
| FLOP/cycle | 1.99 |
| Nb FLOP add-sub | 8.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 8.00 |
| Nb FLOP div | 8.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 37.34 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 528.00 |
| Bytes stored | 72.00 |
| Stride 0 | 1.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 11.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 45.95 |
| Vectorization ratio load | 38.46 |
| Vectorization ratio store | 50.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 28.57 |
| Vectorization ratio fma | 100.00 |
| Vectorization ratio div_sqrt | 100.00 |
| Vectorization ratio other | 53.85 |
| Vector-efficiency ratio all | 52.03 |
| Vector-efficiency ratio load | 45.19 |
| Vector-efficiency ratio store | 56.25 |
| Vector-efficiency ratio mul | 12.50 |
| Vector-efficiency ratio add_sub | 36.61 |
| Vector-efficiency ratio fma | 100.00 |
| Vector-efficiency ratio div_sqrt | 100.00 |
| Vector-efficiency ratio other | 58.65 |
| Path / |
| Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
| Source file and lines | advec_mom_kernel.f90:81-248 |
| Module | exec |
| nb instructions | 50.50 |
| nb uops | 50.75 |
| loop length | 254.75 |
| used x86 registers | 12.25 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 4.50 |
| nb stack references | 13.75 |
| micro-operation queue | 8.46 cycles |
| front end | 8.46 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 5.30 | 7.30 | 6.42 | 6.42 | 0.50 | 5.33 | 5.28 | 0.50 | 0.50 | 0.50 | 5.30 | 6.42 |
| cycles | 5.30 | 10.26 | 6.42 | 6.42 | 0.50 | 5.33 | 5.28 | 0.50 | 0.50 | 0.50 | 5.30 | 6.42 |
| Cycles executing div or sqrt instructions | 8.00 |
| Longest recurrence chain latency (RecMII) | 0.00 |
| FE+BE cycles | 11.39-11.41 |
| Stall cycles | 2.48-2.49 |
| LM full (events) | 0.01-0.00 |
| PRF_INT full (events) | 3.26-3.27 |
| Front-end | 8.46 |
| Dispatch | 10.26 |
| DIV/SQRT | 8.00 |
| Data deps. | 0.00 |
| Overall L1 | 11.98 |
| all | 6% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 16% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 8% |
| all | 100% |
| load | 100% |
| store | 100% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 100% |
| fma | 100% |
| div/sqrt | 100% |
| other | 100% |
| all | 27% |
| load | 33% |
| store | 50% |
| mul | 0% |
| add-sub | 23% |
| fma | 100% |
| div/sqrt | 100% |
| other | 29% |
| all | 15% |
| load | 10% |
| store | 12% |
| mul | 12% |
| add-sub | 24% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 16% |
| all | 100% |
| load | 100% |
| store | 100% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 100% |
| fma | 100% |
| div/sqrt | 100% |
| other | 100% |
| all | 34% |
| load | 41% |
| store | 56% |
| mul | 12% |
| add-sub | 30% |
| fma | 100% |
| div/sqrt | 100% |
| other | 35% |
| Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
| Source file and lines | advec_mom_kernel.f90:81-248 |
| Module | exec |
| nb instructions | 7 |
| nb uops | 7 |
| loop length | 29 |
| used x86 registers | 5 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 1 |
| micro-operation queue | 1.17 cycles |
| front end | 1.17 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.20 | 1.20 | 0.33 | 0.33 | 0.00 | 1.20 | 1.20 | 0.00 | 0.00 | 0.00 | 1.20 | 0.33 |
| cycles | 1.20 | 1.20 | 0.33 | 0.33 | 0.00 | 1.20 | 1.20 | 0.00 | 0.00 | 0.00 | 1.20 | 0.33 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| FE+BE cycles | 1.32-1.34 |
| Stall cycles | 0.00 |
| Front-end | 1.17 |
| Dispatch | 1.20 |
| Data deps. | 0.00 |
| Overall L1 | 1.20 |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 6% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 6% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LEA 0x1(%R15),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| INC %R13D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| CMP %R14D,%R15D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV %EAX,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| JE 4339d9 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1cf9> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| CMPL $0,0x80(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| JS 433c60 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1f80> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
| Source file and lines | advec_mom_kernel.f90:81-248 |
| Module | exec |
| nb instructions | 59 |
| nb uops | 59 |
| loop length | 309 |
| used x86 registers | 14 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 9 |
| nb stack references | 14 |
| micro-operation queue | 9.83 cycles |
| front end | 9.83 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 6.00 | 6.00 | 7.67 | 7.67 | 0.50 | 6.00 | 6.00 | 0.50 | 0.50 | 0.50 | 6.00 | 7.67 |
| cycles | 6.00 | 9.10 | 7.67 | 7.67 | 0.50 | 6.00 | 6.00 | 0.50 | 0.50 | 0.50 | 6.00 | 7.67 |
| Cycles executing div or sqrt instructions | 16.00 |
| Longest recurrence chain latency (RecMII) | 0.00 |
| FE+BE cycles | 16.53-16.57 |
| Stall cycles | 6.05-6.09 |
| LM full (events) | 0.02-0.01 |
| PRF_INT full (events) | 6.38-6.42 |
| Front-end | 9.83 |
| Dispatch | 9.10 |
| DIV/SQRT | 16.00 |
| Data deps. | 0.00 |
| Overall L1 | 16.00 |
| all | 18% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | 50% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 20% |
| all | 100% |
| load | 100% |
| store | 100% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 100% |
| fma | 100% |
| div/sqrt | 100% |
| other | 100% |
| all | 65% |
| load | 62% |
| store | 100% |
| mul | 0% |
| add-sub | 66% |
| fma | 100% |
| div/sqrt | 100% |
| other | 63% |
| all | 26% |
| load | 10% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 12% |
| add-sub | 53% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 27% |
| all | 100% |
| load | 100% |
| store | 100% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 100% |
| fma | 100% |
| div/sqrt | 100% |
| other | 100% |
| all | 68% |
| load | 66% |
| store | 100% |
| mul | 12% |
| add-sub | 68% |
| fma | 100% |
| div/sqrt | 100% |
| other | 67% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CLTQ | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| MOV 0x40(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| LEA 0x1(%RAX,%RCX,1),%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| SUB 0x2c0(%RSP),%RAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| IMUL %RAX,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %RAX,%RBX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %R12,%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %RAX,%R12 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %RAX,%R10 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %R10,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOV %RBX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| XOR %EDI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| VPBROADCASTQ %RDI,%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPSUBQ %ZMM7,%ZMM0,%ZMM7 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
| VPCMPNLEUQ %ZMM1,%ZMM7,%K1 | |||||||||||||||
| ADD 0xe0(%RSP),%RAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| ADD 0x18(%RSP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| SUB 0x98(%RSP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| VMOVUPD (%RAX,%RDI,8),%ZMM7{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
| VMOVAPD %ZMM7,%ZMM6{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| ADD 0x130(%RSP),%R11 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| VMOVUPD (%R11,%RDI,8),%ZMM7{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
| VMOVAPD %ZMM7,%ZMM5{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| MOV 0x148(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| ADD %RAX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VMOVUPD (%RDX,%RDI,8),%ZMM7{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
| VMOVAPD %ZMM7,%ZMM4{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VMOVAPD %ZMM5,%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| ADD %RAX,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VMOVUPD (%R12,%RDI,8),%ZMM8{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
| VFMADD213PD %ZMM4,%ZMM6,%ZMM7 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMOVAPD %ZMM8,%ZMM3{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VSUBPD %ZMM3,%ZMM7,%ZMM7 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| ADD 0x108(%RSP),%R9 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| VMOVUPD (%R9,%RDI,8),%ZMM8{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
| VMOVAPD %ZMM8,%ZMM2{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VDIVPD %ZMM2,%ZMM7,%ZMM7 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
| VMOVUPD %ZMM7,(%R11,%RDI,8){%K1} | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
| JMP 433c60 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1f80> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
| LEA 0x1(%R15),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| INC %R13D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| CMP %R14D,%R15D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV %EAX,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| JE 4339d9 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1cf9> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| CMPL $0,0x80(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| JS 433c60 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1f80> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV 0x78(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| ADD %R15D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV 0x220(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RCX),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x160(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RCX),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x218(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RCX),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x228(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RCX),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| TEST %R8,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
| JE 433b80 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1ea0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
| Source file and lines | advec_mom_kernel.f90:81-248 |
| Module | exec |
| nb instructions | 55 |
| nb uops | 55 |
| loop length | 259 |
| used x86 registers | 15 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 17 |
| micro-operation queue | 9.17 cycles |
| front end | 9.17 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 5.20 | 11.00 | 7.00 | 7.00 | 0.50 | 5.40 | 5.20 | 0.50 | 0.50 | 0.50 | 5.20 | 7.00 |
| cycles | 5.20 | 14.67 | 7.00 | 7.00 | 0.50 | 5.40 | 5.20 | 0.50 | 0.50 | 0.50 | 5.20 | 7.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| FE+BE cycles | 11.18 |
| Stall cycles | 1.52 |
| PRF_INT full (events) | 4.15 |
| Front-end | 9.17 |
| Dispatch | 14.67 |
| Data deps. | 0.00 |
| Overall L1 | 14.67 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 11% |
| load | 11% |
| store | 12% |
| mul | 12% |
| add-sub | 11% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 10% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LEA 0x1(%R15),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| INC %R13D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| CMP %R14D,%R15D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV %EAX,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| JE 4339d9 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1cf9> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| CMPL $0,0x80(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| JS 433c60 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1f80> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV 0x78(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| ADD %R15D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV 0x220(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RCX),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x160(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RCX),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x218(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RCX),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x228(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RCX),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| TEST %R8,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
| JE 433b80 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1ea0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOVSXD %R13D,%R14 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| MOV 0x20(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| ADD %R14,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD 0x38(%RSP),%R14 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| MOVSXD %EAX,%R9 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| MOV 0x40(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| LEA 0x1(%R9,%RAX,1),%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| SUB 0x2c0(%RSP),%R9 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| IMUL %R9,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %RBX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| IMUL %R9,%R11 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %R12,%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %R12,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| IMUL %R9,%RSI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %RSI,0x90(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
| IMUL %R10,%R9 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %RCX,%R10 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD 0x28(%RSP),%R10 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| IMUL %R12,%R14 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %RCX,%R12 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV 0x50(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| ADD %RSI,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD %RSI,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| IMUL %RCX,%RBX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD 0x30(%RSP),%RBX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| IMUL %RCX,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD 0x168(%RSP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| MOV %R8,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| CMP 0x1e0(%RSP),%R8 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| MOV 0x58(%RSP),%R14D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x90(%RSP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| JNE 433bb3 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1ed3> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| JMP 433c60 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1f80> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
| Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
| Source file and lines | advec_mom_kernel.f90:81-248 |
| Module | exec |
| nb instructions | 81 |
| nb uops | 82 |
| loop length | 422 |
| used x86 registers | 15 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 9 |
| nb stack references | 23 |
| micro-operation queue | 13.67 cycles |
| front end | 13.67 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 8.80 | 11.00 | 10.67 | 10.67 | 1.00 | 8.70 | 8.70 | 1.00 | 1.00 | 1.00 | 8.80 | 10.67 |
| cycles | 8.80 | 16.07 | 10.67 | 10.67 | 1.00 | 8.70 | 8.70 | 1.00 | 1.00 | 1.00 | 8.80 | 10.67 |
| Cycles executing div or sqrt instructions | 16.00 |
| Longest recurrence chain latency (RecMII) | 0.00 |
| FE+BE cycles | 16.54-16.56 |
| Stall cycles | 2.33-2.35 |
| PRF_INT full (events) | 2.50-2.52 |
| Front-end | 13.67 |
| Dispatch | 16.07 |
| DIV/SQRT | 16.00 |
| Data deps. | 0.00 |
| Overall L1 | 16.07 |
| all | 9% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 16% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 14% |
| all | 100% |
| load | 100% |
| store | 100% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 100% |
| fma | 100% |
| div/sqrt | 100% |
| other | 100% |
| all | 45% |
| load | 38% |
| store | 50% |
| mul | 0% |
| add-sub | 28% |
| fma | 100% |
| div/sqrt | 100% |
| other | 53% |
| all | 19% |
| load | 10% |
| store | 12% |
| mul | 12% |
| add-sub | 26% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 23% |
| all | 100% |
| load | 100% |
| store | 100% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 100% |
| fma | 100% |
| div/sqrt | 100% |
| other | 100% |
| all | 52% |
| load | 45% |
| store | 56% |
| mul | 12% |
| add-sub | 36% |
| fma | 100% |
| div/sqrt | 100% |
| other | 58% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VPBROADCASTQ %RDI,%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPSUBQ %ZMM7,%ZMM0,%ZMM7 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
| VPCMPNLEUQ %ZMM1,%ZMM7,%K1 | |||||||||||||||
| ADD 0xe0(%RSP),%RAX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| ADD 0x18(%RSP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| SUB 0x98(%RSP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| VMOVUPD (%RAX,%RDI,8),%ZMM7{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
| VMOVAPD %ZMM7,%ZMM6{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| ADD 0x130(%RSP),%R11 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| VMOVUPD (%R11,%RDI,8),%ZMM7{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
| VMOVAPD %ZMM7,%ZMM5{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| MOV 0x148(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| ADD %RAX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VMOVUPD (%RDX,%RDI,8),%ZMM7{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
| VMOVAPD %ZMM7,%ZMM4{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VMOVAPD %ZMM5,%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| ADD %RAX,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VMOVUPD (%R12,%RDI,8),%ZMM8{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
| VFMADD213PD %ZMM4,%ZMM6,%ZMM7 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMOVAPD %ZMM8,%ZMM3{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VSUBPD %ZMM3,%ZMM7,%ZMM7 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| ADD 0x108(%RSP),%R9 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| VMOVUPD (%R9,%RDI,8),%ZMM8{%K1}{z} | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
| VMOVAPD %ZMM8,%ZMM2{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VDIVPD %ZMM2,%ZMM7,%ZMM7 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
| VMOVUPD %ZMM7,(%R11,%RDI,8){%K1} | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
| JMP 433c60 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1f80> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
| LEA 0x1(%R15),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| INC %R13D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| CMP %R14D,%R15D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV %EAX,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| JE 4339d9 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1cf9> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| CMPL $0,0x80(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| JS 433c60 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1f80> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV 0x78(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| ADD %R15D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV 0x220(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RCX),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x160(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RCX),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x218(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RCX),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x228(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RCX),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| TEST %R8,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
| JE 433b80 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1ea0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOVSXD %R13D,%R14 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| MOV 0x20(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| ADD %R14,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD 0x38(%RSP),%R14 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| MOVSXD %EAX,%R9 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| MOV 0x40(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| LEA 0x1(%R9,%RAX,1),%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| SUB 0x2c0(%RSP),%R9 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| IMUL %R9,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %RBX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| IMUL %R9,%R11 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %R12,%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %R12,%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| IMUL %R9,%RSI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %RSI,0x90(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
| IMUL %R10,%R9 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %RCX,%R10 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD 0x28(%RSP),%R10 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| IMUL %R12,%R14 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %RCX,%R12 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV 0x50(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| ADD %RSI,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD %RSI,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| IMUL %RCX,%RBX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD 0x30(%RSP),%RBX | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| IMUL %RCX,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD 0x168(%RSP),%RDI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| MOV %R8,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| CMP 0x1e0(%RSP),%R8 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
| MOV 0x58(%RSP),%R14D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x90(%RSP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| JNE 433bb3 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x1ed3> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
