| Loop Id: 134 | Module: exec | Source: advec_mom_kernel.f90:172-240 [...] | Coverage: 3.77% |
|---|
| Loop Id: 134 | Module: exec | Source: advec_mom_kernel.f90:172-240 [...] | Coverage: 3.77% |
|---|
0x433e70 MOVSXD 0x1c8(%RSP),%RBX [10] |
0x433e78 MOV 0x1d8(%RSP),%RDX [10] |
0x433e80 MOV %R12,%RDI |
0x433e83 MOV %EBX,%EAX |
0x433e85 MOV 0x1f8(%RSP),%R14 [10] |
0x433e8d MOV 0x1e8(%RSP),%RSI [10] |
0x433e95 CLTQ |
0x433e97 IMUL %R11,%RDI |
0x433e9b IMUL %R11,%RAX |
0x433e9f VMOVSD (%R15),%XMM3 [5] |
0x433ea4 IMUL %RDX,%R14 |
0x433ea8 ADD %RCX,%RSI |
0x433eab IMUL %R11,%RDX |
0x433eaf ADD %R14,%RSI |
0x433eb2 MOV 0x1d0(%RSP),%R14 [10] |
0x433eba VMOVSD (%R14,%RSI,8),%XMM6 [8] |
0x433ec0 MOV 0x1e0(%RSP),%RSI [10] |
0x433ec8 ADD %RCX,%RSI |
0x433ecb ADD %RSI,%RDX |
0x433ece ADD %RSI,%RAX |
0x433ed1 ADD %RDI,%RSI |
0x433ed4 VMOVSD (%R10,%RDX,8),%XMM1 [4] |
0x433eda VMOVSD (%R10,%RSI,8),%XMM7 [9] |
0x433ee0 VSUBSD (%R10,%RAX,8),%XMM1,%XMM2 [1] |
0x433ee6 VSUBSD %XMM1,%XMM7,%XMM0 |
0x433eea VMULSD %XMM2,%XMM0,%XMM14 |
0x433eee VCOMISD %XMM15,%XMM14 |
0x433ef3 JBE 433f7d |
0x433ef9 VCOMISD %XMM0,%XMM15 |
0x433efd JAE 435ad8 |
0x433f03 VMOVSD %XMM10,%XMM10,%XMM5 |
0x433f07 VMOVSD %XMM10,%XMM10,%XMM8 |
0x433f0c VUNPCKLPD %XMM2,%XMM0,%XMM7 |
0x433f10 VANDPD %XMM12,%XMM4,%XMM14 |
0x433f15 VMOVDDUP 0x65b33(%RIP),%XMM2 [3] |
0x433f1d MOV 0x1b0(%RSP),%RDX [10] |
0x433f25 MOV 0x1c0(%RSP),%RDI [10] |
0x433f2d VANDPD %XMM2,%XMM7,%XMM0 |
0x433f31 VDIVSD %XMM6,%XMM14,%XMM7 |
0x433f35 ADD %RDX,%RBX |
0x433f38 VMOVDDUP %XMM7,%XMM6 |
0x433f3c VSUBSD %XMM7,%XMM5,%XMM5 |
0x433f40 VADDSUBPD %XMM6,%XMM9,%XMM2 |
0x433f44 VMOVHPD (%RDI,%RBX,8),%XMM3,%XMM6 [6] |
0x433f49 VMULSD %XMM11,%XMM3,%XMM3 |
0x433f4e VMULSD %XMM8,%XMM5,%XMM8 |
0x433f53 VMULPD %XMM0,%XMM2,%XMM14 |
0x433f57 VDIVPD %XMM6,%XMM14,%XMM2 |
0x433f5b VUNPCKHPD %XMM2,%XMM2,%XMM14 |
0x433f5f VADDPD %XMM2,%XMM14,%XMM6 |
0x433f63 VMOVSD %XMM0,%XMM0,%XMM2 |
0x433f67 VUNPCKHPD %XMM0,%XMM0,%XMM0 |
0x433f6b VMINSD %XMM0,%XMM2,%XMM14 |
0x433f6f VMULSD %XMM6,%XMM3,%XMM7 |
0x433f73 VMINSD %XMM14,%XMM7,%XMM6 |
0x433f78 VFMADD231SD %XMM6,%XMM8,%XMM1 |
0x433f7d VMULSD %XMM4,%XMM1,%XMM4 |
0x433f81 MOV 0x1f0(%RSP),%RBX [10] |
0x433f89 VMOVSD %XMM4,(%R9,%RCX,8) [7] |
0x433f8f MOV %R8,%RCX |
0x433f92 CMP %RBX,%R8 |
0x433f95 JE 433fd0 |
0x433f97 INC %R8 |
0x433f9a VMOVSD (%R13,%RCX,8),%XMM4 [2] |
0x433fa1 VCOMISD %XMM4,%XMM15 |
0x433fa5 JBE 433e70 |
0x433fab MOV 0x1a0(%RSP),%EAX [10] |
0x433fb2 MOV 0x1d8(%RSP),%RDI [10] |
0x433fba MOV %R12,%RDX |
0x433fbd MOVSXD 0x1b8(%RSP),%RBX [10] |
0x433fc5 INC %EAX |
0x433fc7 JMP 433e85 |
0x435ad8 MOV 0x645c9(%RIP),%RAX [3] |
0x435adf VMOVSD %XMM13,%XMM13,%XMM8 |
0x435ae4 VMOVQ %RAX,%XMM5 |
0x435ae9 JMP 433f0c |
/home/eoseret/qaas_runs_CPU_9468/171-152-3172/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/advec_mom_kernel.f90: 172 - 240 |
-------------------------------------------------------------------------------- |
172: IF(vdiffdw.LE.0.0) wind=-1.0_8 |
[...] |
214: DO j=x_min,x_max+1 |
215: IF(node_flux(j,k).LT.0.0)THEN |
216: upwind=k+2 |
217: donor=k+1 |
218: downwind=k |
219: dif=donor |
220: ELSE |
221: upwind=k-1 |
[...] |
227: sigma=ABS(node_flux(j,k))/(node_mass_pre(j,donor)) |
228: width=celldy(k) |
229: vdiffuw=vel1(j,donor)-vel1(j,upwind) |
230: vdiffdw=vel1(j,downwind)-vel1(j,donor) |
231: limiter=0.0 |
232: IF(vdiffuw*vdiffdw.GT.0.0)THEN |
233: auw=ABS(vdiffuw) |
234: adw=ABS(vdiffdw) |
235: wind=1.0_8 |
236: IF(vdiffdw.LE.0.0) wind=-1.0_8 |
237: limiter=wind*MIN(width*((2.0_8-sigma)*adw/width+(1.0_8+sigma)*auw/celldy(dif))/6.0_8,auw,adw) |
238: ENDIF |
239: advec_vel_s=vel1(j,donor)+(1.0_8-sigma)*limiter |
240: mom_flux(j,k)=advec_vel_s*node_flux(j,k) |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.24 |
| CQA speedup if FP arith vectorized | 2.17 |
| CQA speedup if fully vectorized | 2.17 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.08 |
| Bottlenecks | micro-operation queue |
| Function | advec_mom_kernel._omp_fn.0 |
| Source | advec_mom_kernel.f90:172-172,advec_mom_kernel.f90:214-221,advec_mom_kernel.f90:227-240 |
| Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
| Source loop unroll confidence level | max |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 13.00 |
| CQA cycles if no scalar integer | 10.50 |
| CQA cycles if FP arith vectorized | 6.00 |
| CQA cycles if fully vectorized | 6.00 |
| Front-end cycles | 13.00 |
| DIV/SQRT cycles | 12.00 |
| P0 cycles | 12.00 |
| P1 cycles | 7.00 |
| P2 cycles | 7.00 |
| P3 cycles | 0.50 |
| P4 cycles | 12.00 |
| P5 cycles | 6.60 |
| P6 cycles | 0.50 |
| P7 cycles | 0.50 |
| P8 cycles | 0.50 |
| P9 cycles | 6.40 |
| P10 cycles | 7.00 |
| P11 cycles | 8.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | 13.75 - 13.90 |
| Stall cycles (UFS) | 0.03 - 0.13 |
| Nb insns | 76.00 |
| Nb uops | 76.00 |
| Nb loads | 21.00 |
| Nb stores | 1.00 |
| Nb stack references | 11.00 |
| FLOP/cycle | 1.46 |
| Nb FLOP add-sub | 7.00 |
| Nb FLOP mul | 7.00 |
| Nb FLOP fma | 1.00 |
| Nb FLOP div | 3.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 12.62 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 156.00 |
| Bytes stored | 8.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 14.63 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 16.67 |
| Vectorization ratio add_sub | 33.33 |
| Vectorization ratio fma | 0.00 |
| Vectorization ratio div_sqrt | 50.00 |
| Vectorization ratio other | 11.11 |
| Vector-efficiency ratio all | 14.33 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | 12.50 |
| Vector-efficiency ratio mul | 14.58 |
| Vector-efficiency ratio add_sub | 16.67 |
| Vector-efficiency ratio fma | 12.50 |
| Vector-efficiency ratio div_sqrt | 18.75 |
| Vector-efficiency ratio other | 13.89 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.24 |
| CQA speedup if FP arith vectorized | 2.17 |
| CQA speedup if fully vectorized | 2.17 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.08 |
| Bottlenecks | micro-operation queue |
| Function | advec_mom_kernel._omp_fn.0 |
| Source | advec_mom_kernel.f90:172-172,advec_mom_kernel.f90:214-221,advec_mom_kernel.f90:227-240 |
| Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
| Source loop unroll confidence level | max |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 13.00 |
| CQA cycles if no scalar integer | 10.50 |
| CQA cycles if FP arith vectorized | 6.00 |
| CQA cycles if fully vectorized | 6.00 |
| Front-end cycles | 13.00 |
| DIV/SQRT cycles | 12.00 |
| P0 cycles | 12.00 |
| P1 cycles | 7.00 |
| P2 cycles | 7.00 |
| P3 cycles | 0.50 |
| P4 cycles | 12.00 |
| P5 cycles | 6.60 |
| P6 cycles | 0.50 |
| P7 cycles | 0.50 |
| P8 cycles | 0.50 |
| P9 cycles | 6.40 |
| P10 cycles | 7.00 |
| P11 cycles | 8.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | 13.75 - 13.90 |
| Stall cycles (UFS) | 0.03 - 0.13 |
| Nb insns | 76.00 |
| Nb uops | 76.00 |
| Nb loads | 21.00 |
| Nb stores | 1.00 |
| Nb stack references | 11.00 |
| FLOP/cycle | 1.46 |
| Nb FLOP add-sub | 7.00 |
| Nb FLOP mul | 7.00 |
| Nb FLOP fma | 1.00 |
| Nb FLOP div | 3.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 12.62 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 156.00 |
| Bytes stored | 8.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 14.63 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 16.67 |
| Vectorization ratio add_sub | 33.33 |
| Vectorization ratio fma | 0.00 |
| Vectorization ratio div_sqrt | 50.00 |
| Vectorization ratio other | 11.11 |
| Vector-efficiency ratio all | 14.33 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | 12.50 |
| Vector-efficiency ratio mul | 14.58 |
| Vector-efficiency ratio add_sub | 16.67 |
| Vector-efficiency ratio fma | 12.50 |
| Vector-efficiency ratio div_sqrt | 18.75 |
| Vector-efficiency ratio other | 13.89 |
| Path / |
| Function | advec_mom_kernel._omp_fn.0 |
| Source file and lines | advec_mom_kernel.f90:172-240 |
| Module | exec |
| nb instructions | 76 |
| nb uops | 76 |
| loop length | 370 |
| used x86 registers | 15 |
| used mmx registers | 0 |
| used xmm registers | 16 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 11 |
| ADD-SUB / MUL ratio | 0.83 |
| micro-operation queue | 13.00 cycles |
| front end | 13.00 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 12.00 | 12.00 | 7.00 | 7.00 | 0.50 | 12.00 | 6.60 | 0.50 | 0.50 | 0.50 | 6.40 | 7.00 |
| cycles | 12.00 | 12.00 | 7.00 | 7.00 | 0.50 | 12.00 | 6.60 | 0.50 | 0.50 | 0.50 | 6.40 | 7.00 |
| Cycles executing div or sqrt instructions | 8.00 |
| FE+BE cycles | 13.75-13.90 |
| Stall cycles | 0.03-0.13 |
| ROB full (events) | 0.03-0.14 |
| Front-end | 13.00 |
| Dispatch | 12.00 |
| DIV/SQRT | 8.00 |
| Overall L1 | 13.00 |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 16% |
| load | 0% |
| store | 0% |
| mul | 16% |
| add-sub | 40% |
| fma | 0% |
| div/sqrt | 50% |
| other | 13% |
| all | 14% |
| load | 0% |
| store | 0% |
| mul | 16% |
| add-sub | 33% |
| fma | 0% |
| div/sqrt | 50% |
| other | 11% |
| all | 12% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 12% |
| all | 14% |
| load | 12% |
| store | 12% |
| mul | 14% |
| add-sub | 17% |
| fma | 12% |
| div/sqrt | 18% |
| other | 14% |
| all | 14% |
| load | 12% |
| store | 12% |
| mul | 14% |
| add-sub | 16% |
| fma | 12% |
| div/sqrt | 18% |
| other | 13% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOVSXD 0x1c8(%RSP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x1d8(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %R12,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOV %EBX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOV 0x1f8(%RSP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x1e8(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| CLTQ | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| IMUL %R11,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %R11,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VMOVSD (%R15),%XMM3 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| IMUL %RDX,%R14 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD %RCX,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| IMUL %R11,%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD %R14,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV 0x1d0(%RSP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VMOVSD (%R14,%RSI,8),%XMM6 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x1e0(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| ADD %RCX,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD %RSI,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD %RSI,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD %RDI,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VMOVSD (%R10,%RDX,8),%XMM1 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VMOVSD (%R10,%RSI,8),%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VSUBSD (%R10,%RAX,8),%XMM1,%XMM2 | 1 | 0 | 0.50 | 0.33 | 0.33 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.50 |
| VSUBSD %XMM1,%XMM7,%XMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VMULSD %XMM2,%XMM0,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VCOMISD %XMM15,%XMM14 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| JBE 433f7d <__advec_mom_kernel_mod_MOD_advec_mom_kernel._omp_fn.0+0x1d5d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| VCOMISD %XMM0,%XMM15 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| JAE 435ad8 <__advec_mom_kernel_mod_MOD_advec_mom_kernel._omp_fn.0+0x38b8> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| VMOVSD %XMM10,%XMM10,%XMM5 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VMOVSD %XMM10,%XMM10,%XMM8 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VUNPCKLPD %XMM2,%XMM0,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| VANDPD %XMM12,%XMM4,%XMM14 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VMOVDDUP 0x65b33(%RIP),%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x1b0(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x1c0(%RSP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VANDPD %XMM2,%XMM7,%XMM0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VDIVSD %XMM6,%XMM14,%XMM7 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-15 | 4 |
| ADD %RDX,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VMOVDDUP %XMM7,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| VSUBSD %XMM7,%XMM5,%XMM5 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VADDSUBPD %XMM6,%XMM9,%XMM2 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VMOVHPD (%RDI,%RBX,8),%XMM3,%XMM6 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
| VMULSD %XMM11,%XMM3,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMULSD %XMM8,%XMM5,%XMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMULPD %XMM0,%XMM2,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VDIVPD %XMM6,%XMM14,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-15 | 4 |
| VUNPCKHPD %XMM2,%XMM2,%XMM14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| VADDPD %XMM2,%XMM14,%XMM6 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VMOVSD %XMM0,%XMM0,%XMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VUNPCKHPD %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| VMINSD %XMM0,%XMM2,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMULSD %XMM6,%XMM3,%XMM7 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMINSD %XMM14,%XMM7,%XMM6 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VFMADD231SD %XMM6,%XMM8,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMULSD %XMM4,%XMM1,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| MOV 0x1f0(%RSP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VMOVSD %XMM4,(%R9,%RCX,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
| MOV %R8,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| CMP %RBX,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| JE 433fd0 <__advec_mom_kernel_mod_MOD_advec_mom_kernel._omp_fn.0+0x1db0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| INC %R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| VMOVSD (%R13,%RCX,8),%XMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VCOMISD %XMM4,%XMM15 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| JBE 433e70 <__advec_mom_kernel_mod_MOD_advec_mom_kernel._omp_fn.0+0x1c50> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV 0x1a0(%RSP),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x1d8(%RSP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %R12,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOVSXD 0x1b8(%RSP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| JMP 433e85 <__advec_mom_kernel_mod_MOD_advec_mom_kernel._omp_fn.0+0x1c65> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
| MOV 0x645c9(%RIP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VMOVSD %XMM13,%XMM13,%XMM8 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VMOVQ %RAX,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| JMP 433f0c <__advec_mom_kernel_mod_MOD_advec_mom_kernel._omp_fn.0+0x1cec> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
| Function | advec_mom_kernel._omp_fn.0 |
| Source file and lines | advec_mom_kernel.f90:172-240 |
| Module | exec |
| nb instructions | 76 |
| nb uops | 76 |
| loop length | 370 |
| used x86 registers | 15 |
| used mmx registers | 0 |
| used xmm registers | 16 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 11 |
| ADD-SUB / MUL ratio | 0.83 |
| micro-operation queue | 13.00 cycles |
| front end | 13.00 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 12.00 | 12.00 | 7.00 | 7.00 | 0.50 | 12.00 | 6.60 | 0.50 | 0.50 | 0.50 | 6.40 | 7.00 |
| cycles | 12.00 | 12.00 | 7.00 | 7.00 | 0.50 | 12.00 | 6.60 | 0.50 | 0.50 | 0.50 | 6.40 | 7.00 |
| Cycles executing div or sqrt instructions | 8.00 |
| FE+BE cycles | 13.75-13.90 |
| Stall cycles | 0.03-0.13 |
| ROB full (events) | 0.03-0.14 |
| Front-end | 13.00 |
| Dispatch | 12.00 |
| DIV/SQRT | 8.00 |
| Overall L1 | 13.00 |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 0% |
| all | 16% |
| load | 0% |
| store | 0% |
| mul | 16% |
| add-sub | 40% |
| fma | 0% |
| div/sqrt | 50% |
| other | 13% |
| all | 14% |
| load | 0% |
| store | 0% |
| mul | 16% |
| add-sub | 33% |
| fma | 0% |
| div/sqrt | 50% |
| other | 11% |
| all | 12% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 12% |
| all | 14% |
| load | 12% |
| store | 12% |
| mul | 14% |
| add-sub | 17% |
| fma | 12% |
| div/sqrt | 18% |
| other | 14% |
| all | 14% |
| load | 12% |
| store | 12% |
| mul | 14% |
| add-sub | 16% |
| fma | 12% |
| div/sqrt | 18% |
| other | 13% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOVSXD 0x1c8(%RSP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x1d8(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %R12,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOV %EBX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOV 0x1f8(%RSP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x1e8(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| CLTQ | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| IMUL %R11,%RDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %R11,%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VMOVSD (%R15),%XMM3 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| IMUL %RDX,%R14 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD %RCX,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| IMUL %R11,%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD %R14,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV 0x1d0(%RSP),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VMOVSD (%R14,%RSI,8),%XMM6 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x1e0(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| ADD %RCX,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD %RSI,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD %RSI,%RAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| ADD %RDI,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VMOVSD (%R10,%RDX,8),%XMM1 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VMOVSD (%R10,%RSI,8),%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VSUBSD (%R10,%RAX,8),%XMM1,%XMM2 | 1 | 0 | 0.50 | 0.33 | 0.33 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.50 |
| VSUBSD %XMM1,%XMM7,%XMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VMULSD %XMM2,%XMM0,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VCOMISD %XMM15,%XMM14 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| JBE 433f7d <__advec_mom_kernel_mod_MOD_advec_mom_kernel._omp_fn.0+0x1d5d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| VCOMISD %XMM0,%XMM15 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| JAE 435ad8 <__advec_mom_kernel_mod_MOD_advec_mom_kernel._omp_fn.0+0x38b8> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| VMOVSD %XMM10,%XMM10,%XMM5 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VMOVSD %XMM10,%XMM10,%XMM8 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VUNPCKLPD %XMM2,%XMM0,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| VANDPD %XMM12,%XMM4,%XMM14 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VMOVDDUP 0x65b33(%RIP),%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x1b0(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x1c0(%RSP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VANDPD %XMM2,%XMM7,%XMM0 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VDIVSD %XMM6,%XMM14,%XMM7 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-15 | 4 |
| ADD %RDX,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VMOVDDUP %XMM7,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| VSUBSD %XMM7,%XMM5,%XMM5 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VADDSUBPD %XMM6,%XMM9,%XMM2 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VMOVHPD (%RDI,%RBX,8),%XMM3,%XMM6 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
| VMULSD %XMM11,%XMM3,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMULSD %XMM8,%XMM5,%XMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMULPD %XMM0,%XMM2,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VDIVPD %XMM6,%XMM14,%XMM2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-15 | 4 |
| VUNPCKHPD %XMM2,%XMM2,%XMM14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| VADDPD %XMM2,%XMM14,%XMM6 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
| VMOVSD %XMM0,%XMM0,%XMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VUNPCKHPD %XMM0,%XMM0,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| VMINSD %XMM0,%XMM2,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMULSD %XMM6,%XMM3,%XMM7 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMINSD %XMM14,%XMM7,%XMM6 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VFMADD231SD %XMM6,%XMM8,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| VMULSD %XMM4,%XMM1,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
| MOV 0x1f0(%RSP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VMOVSD %XMM4,(%R9,%RCX,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
| MOV %R8,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| CMP %RBX,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| JE 433fd0 <__advec_mom_kernel_mod_MOD_advec_mom_kernel._omp_fn.0+0x1db0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| INC %R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| VMOVSD (%R13,%RCX,8),%XMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VCOMISD %XMM4,%XMM15 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| JBE 433e70 <__advec_mom_kernel_mod_MOD_advec_mom_kernel._omp_fn.0+0x1c50> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV 0x1a0(%RSP),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x1d8(%RSP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %R12,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOVSXD 0x1b8(%RSP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| JMP 433e85 <__advec_mom_kernel_mod_MOD_advec_mom_kernel._omp_fn.0+0x1c65> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
| MOV 0x645c9(%RIP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VMOVSD %XMM13,%XMM13,%XMM8 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| VMOVQ %RAX,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| JMP 433f0c <__advec_mom_kernel_mod_MOD_advec_mom_kernel._omp_fn.0+0x1cec> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
