| Loop Id: 438 | Module: exec | Source: pack_kernel.f90:109-113 | Coverage: 0.01% |
|---|
0x449140 MOV %ESI,%R10D |
0x449143 LEA 0x1(%R8),%EAX |
0x449147 INC %EDI |
0x449149 MOV %R10D,%ESI |
0x44914c CMP %EDX,%R8D |
0x44914f MOV %EAX,%R8D |
0x449152 JE 4490b6 |
0x449158 TEST %ESI,%ESI |
0x44915a JLE 449140 |
0x44915c MOV -0x68(%RBP),%RAX |
0x449160 LEA (%RAX,%R8,1),%R11D |
0x449164 MOV -0x70(%RBP),%RAX |
0x449168 MOVSXD (%RAX),%RCX |
0x44916b MOV -0x58(%RBP),%RAX |
0x44916f MOV (%RAX),%R10D |
0x449172 MOV 0x10aa1f(%RIP),%R9 |
0x449179 MOV 0x10aa50(%RIP),%R13 |
0x449180 MOV 0x10(%RBP),%RBX |
0x449184 MOV (%RBX),%R15 |
0x449187 MOV 0x38(%RBX),%RAX |
0x44918b MOV %RAX,-0x40(%RBP) |
0x44918f MOV 0x18(%RBP),%RDX |
0x449193 MOVSXD (%RDX),%RAX |
0x449196 MOV 0x50(%RBX),%RDX |
0x44919a MOV %ESI,%EBX |
0x44919c MOV %RBX,%R12 |
0x44919f MOVSXD %R11D,%R14 |
0x4491a2 MOV $-0x8,%ESI |
0x4491a7 AND %RSI,%R12 |
0x4491aa JE 449280 |
0x4491b0 LEA (%R10,%RDI,1),%ESI |
0x4491b4 IMUL %R10D,%ESI |
0x4491b8 MOVSXD %ESI,%RSI |
0x4491bb ADD %RCX,%RSI |
0x4491be LEA -0x1(%R11,%R10,1),%R11D |
0x4491c3 MOV %R10,-0x60(%RBP) |
0x4491c7 IMUL %R10D,%R11D |
0x4491cb MOVSXD %R11D,%R11 |
0x4491ce ADD %RCX,%R11 |
0x4491d1 VPBROADCASTQ %R13,%ZMM5 |
0x4491d7 MOV -0x40(%RBP),%RCX |
0x4491db VPBROADCASTQ %RCX,%ZMM4 |
0x4491e1 MOV %RAX,-0x40(%RBP) |
0x4491e5 XOR %ECX,%ECX |
0x4491e7 NOPW (%RAX,%RAX,1) |
(439) 0x4491f0 LEA (%RSI,%RCX,1),%R13 |
(439) 0x4491f4 VPBROADCASTQ %R13,%ZMM6 |
(439) 0x4491fa VPADDQ %ZMM2,%ZMM6,%ZMM6 |
(439) 0x449200 VPMULLQ %ZMM6,%ZMM5,%ZMM6 |
(439) 0x449206 VXORPD %XMM7,%XMM7,%XMM7 |
(439) 0x44920a KXNORW %K0,%K0,%K1 |
(439) 0x44920e VGATHERQPD (%R9,%ZMM6,1),%ZMM7{%K1} |
(439) 0x449215 LEA 0x1(%R14),%R13 |
(439) 0x449219 IMUL %RDX,%R13 |
(439) 0x44921d VPBROADCASTQ %RAX,%ZMM6 |
(439) 0x449223 VPADDQ %ZMM1,%ZMM6,%ZMM6 |
(439) 0x449229 VPMULLQ %ZMM6,%ZMM4,%ZMM6 |
(439) 0x44922f LEA (%R15,%R13,1),%R10 |
(439) 0x449233 KXNORW %K0,%K0,%K1 |
(439) 0x449237 VSCATTERQPD %ZMM7,(%R10,%ZMM6,1){%K1} |
(439) 0x44923e ADD $0x8,%RCX |
(439) 0x449242 ADD $-0x8,%RAX |
(439) 0x449246 CMP %R12,%RCX |
(439) 0x449249 JB 4491f0 |
0x44924b CMP %RBX,%R12 |
0x44924e JNE 449300 |
0x449254 MOV -0x2c(%RBP),%EDX |
0x449257 MOV -0x60(%RBP),%R10 |
0x44925b JMP 449143 |
0x449280 VPBROADCASTQ %RBX,%ZMM8 |
0x449286 VPBROADCASTQ %R9,%ZMM6 |
0x44928c LEA -0x1(%R11,%R10,1),%ESI |
0x449291 IMUL %R10D,%ESI |
0x449295 MOVSXD %ESI,%R11 |
0x449298 ADD %RCX,%R11 |
0x44929b VPBROADCASTQ %R13,%ZMM5 |
0x4492a1 VPBROADCASTQ %R15,%ZMM7 |
0x4492a7 INC %R14 |
0x4492aa IMUL %R14,%RDX |
0x4492ae VPBROADCASTQ %RDX,%ZMM9 |
0x4492b4 VPBROADCASTQ %RAX,%ZMM10 |
0x4492ba MOV -0x40(%RBP),%RAX |
0x4492be VPBROADCASTQ %RAX,%ZMM4 |
0x4492c4 XOR %R12D,%R12D |
0x4492c7 MOV -0x2c(%RBP),%EDX |
0x4492ca JMP 449329 |
0x449300 VPBROADCASTQ %R13,%ZMM9 |
0x449306 VPBROADCASTQ %R15,%ZMM7 |
0x44930c VPBROADCASTQ %R9,%ZMM6 |
0x449312 MOV -0x40(%RBP),%RAX |
0x449316 VPBROADCASTQ %RAX,%ZMM10 |
0x44931c VPBROADCASTQ %RBX,%ZMM8 |
0x449322 MOV -0x2c(%RBP),%EDX |
0x449325 MOV -0x60(%RBP),%R10 |
0x449329 VPBROADCASTQ %R12,%ZMM11 |
0x44932f VPSUBQ %ZMM11,%ZMM8,%ZMM8 |
0x449335 VPCMPNLEUQ %ZMM0,%ZMM8,%K1 |
0x44933c ADD %R12,%R11 |
0x44933f VPBROADCASTQ %R11,%ZMM8 |
0x449345 VPADDQ %ZMM0,%ZMM8,%ZMM8 |
0x44934b VPMULLQ %ZMM8,%ZMM5,%ZMM5 |
0x449351 VPADDQ %ZMM5,%ZMM6,%ZMM5 |
0x449357 KMOVQ %K1,%K2 |
0x44935c VPXOR %XMM6,%XMM6,%XMM6 |
0x449360 VGATHERQPD (,%ZMM5,1),%ZMM6{%K2} |
0x44936b VPADDQ %ZMM9,%ZMM7,%ZMM5 |
0x449371 VMOVAPD %ZMM6,%ZMM3{%K1} |
0x449377 VPSUBQ %ZMM11,%ZMM10,%ZMM6 |
0x44937d VPADDQ %ZMM1,%ZMM6,%ZMM6 |
0x449383 VPMULLQ %ZMM6,%ZMM4,%ZMM4 |
0x449389 VPADDQ %ZMM4,%ZMM5,%ZMM4 |
0x44938f VSCATTERQPD %ZMM3,(,%ZMM4,1){%K1} |
0x44939a JMP 449143 |
/home/eoseret/qaas_runs_CPU_9468/171-152-3172/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/pack_kernel.f90: 109 - 113 |
-------------------------------------------------------------------------------- |
109: DO k=y_min-depth,y_max+y_inc+depth |
110: !$OMP SIMD |
111: DO j=1,depth |
112: index= buffer_offset + j+(k+depth-1)*depth |
113: field(x_min-j,k)=left_rcv_buffer(index) |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.36 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 1.70 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.21 |
| Bottlenecks | |
| Function | clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99 |
| Source | pack_kernel.f90:109-113 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 10.63 |
| CQA cycles if no scalar integer | 7.79 |
| CQA cycles if FP arith vectorized | 10.63 |
| CQA cycles if fully vectorized | 6.26 |
| Front-end cycles | 10.63 |
| DIV/SQRT cycles | 8.40 |
| P0 cycles | 5.20 |
| P1 cycles | 5.33 |
| P2 cycles | 5.33 |
| P3 cycles | 2.88 |
| P4 cycles | 8.38 |
| P5 cycles | 4.85 |
| P6 cycles | 2.88 |
| P7 cycles | 2.88 |
| P8 cycles | 2.88 |
| P9 cycles | 4.82 |
| P10 cycles | 5.33 |
| P11 cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | 10.68 - 17.20 |
| Stall cycles (UFS) | 0.75 - 7.26 |
| Nb insns | 49.00 |
| Nb uops | 63.75 |
| Nb loads | 12.50 |
| Nb stores | 2.25 |
| Nb stack references | 5.75 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 11.91 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 116.00 |
| Bytes stored | 46.00 |
| Stride 0 | 1.50 |
| Stride 1 | 0.00 |
| Stride n | 0.50 |
| Stride unknown | 1.25 |
| Stride indirect | 2.50 |
| Vectorization ratio all | 18.26 |
| Vectorization ratio load | 12.22 |
| Vectorization ratio store | 25.00 |
| Vectorization ratio mul | 44.44 |
| Vectorization ratio add_sub | 41.32 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 10.32 |
| Vector-efficiency ratio all | 24.95 |
| Vector-efficiency ratio load | 23.19 |
| Vector-efficiency ratio store | 34.38 |
| Vector-efficiency ratio mul | 47.92 |
| Vector-efficiency ratio add_sub | 46.92 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 16.44 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 16.00 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.33 |
| Bottlenecks | micro-operation queue |
| Function | clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99 |
| Source | pack_kernel.f90:109-113 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 1.33 |
| CQA cycles if no scalar integer | 1.33 |
| CQA cycles if FP arith vectorized | 1.33 |
| CQA cycles if fully vectorized | 0.08 |
| Front-end cycles | 1.33 |
| DIV/SQRT cycles | 1.00 |
| P0 cycles | 1.00 |
| P1 cycles | 0.00 |
| P2 cycles | 0.00 |
| P3 cycles | 0.00 |
| P4 cycles | 1.00 |
| P5 cycles | 1.00 |
| P6 cycles | 0.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 1.00 |
| P10 cycles | 0.00 |
| P11 cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | 1.39 |
| Stall cycles (UFS) | 0.00 |
| Nb insns | 9.00 |
| Nb uops | 8.00 |
| Nb loads | 0.00 |
| Nb stores | 0.00 |
| Nb stack references | 0.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 0.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 0.00 |
| Stride 0 | 0.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 0.00 |
| Stride indirect | 0.00 |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | NA |
| Vectorization ratio store | NA |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 6.25 |
| Vector-efficiency ratio load | NA |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 6.25 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 6.25 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.17 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 1.35 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.13 |
| Bottlenecks | micro-operation queue |
| Function | clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99 |
| Source | pack_kernel.f90:109-113 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 15.83 |
| CQA cycles if no scalar integer | 13.50 |
| CQA cycles if FP arith vectorized | 15.83 |
| CQA cycles if fully vectorized | 11.72 |
| Front-end cycles | 15.83 |
| DIV/SQRT cycles | 14.00 |
| P0 cycles | 6.00 |
| P1 cycles | 7.67 |
| P2 cycles | 7.67 |
| P3 cycles | 4.50 |
| P4 cycles | 14.00 |
| P5 cycles | 6.00 |
| P6 cycles | 4.50 |
| P7 cycles | 4.50 |
| P8 cycles | 4.50 |
| P9 cycles | 6.00 |
| P10 cycles | 7.67 |
| P11 cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | 16.32 - 28.59 |
| Stall cycles (UFS) | 1.98 - 14.24 |
| Nb insns | 65.00 |
| Nb uops | 95.00 |
| Nb loads | 16.00 |
| Nb stores | 2.00 |
| Nb stack references | 7.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 15.16 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 168.00 |
| Bytes stored | 72.00 |
| Stride 0 | 2.00 |
| Stride 1 | 0.00 |
| Stride n | 0.00 |
| Stride unknown | 1.00 |
| Stride indirect | 4.00 |
| Vectorization ratio all | 38.89 |
| Vectorization ratio load | 20.00 |
| Vectorization ratio store | 50.00 |
| Vectorization ratio mul | 66.67 |
| Vectorization ratio add_sub | 77.78 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 22.22 |
| Vector-efficiency ratio all | 43.40 |
| Vector-efficiency ratio load | 30.00 |
| Vector-efficiency ratio store | 56.25 |
| Vector-efficiency ratio mul | 68.75 |
| Vector-efficiency ratio add_sub | 79.86 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 26.39 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 3.50 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 13.07 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.32 |
| Bottlenecks | micro-operation queue |
| Function | clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99 |
| Source | pack_kernel.f90:109-113 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 8.17 |
| CQA cycles if no scalar integer | 2.33 |
| CQA cycles if FP arith vectorized | 8.17 |
| CQA cycles if fully vectorized | 0.63 |
| Front-end cycles | 8.17 |
| DIV/SQRT cycles | 4.60 |
| P0 cycles | 6.20 |
| P1 cycles | 5.33 |
| P2 cycles | 5.33 |
| P3 cycles | 1.50 |
| P4 cycles | 4.53 |
| P5 cycles | 4.60 |
| P6 cycles | 1.50 |
| P7 cycles | 1.50 |
| P8 cycles | 1.50 |
| P9 cycles | 4.67 |
| P10 cycles | 5.33 |
| P11 cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | 8.32 |
| Stall cycles (UFS) | 0.00 |
| Nb insns | 49.00 |
| Nb uops | 49.00 |
| Nb loads | 16.00 |
| Nb stores | 3.00 |
| Nb stack references | 8.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 16.65 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 112.00 |
| Bytes stored | 24.00 |
| Stride 0 | 2.00 |
| Stride 1 | 0.00 |
| Stride n | 1.00 |
| Stride unknown | 2.00 |
| Stride indirect | 2.00 |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | 0.00 |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 10.53 |
| Vector-efficiency ratio load | 12.50 |
| Vector-efficiency ratio store | 12.50 |
| Vector-efficiency ratio mul | 6.25 |
| Vector-efficiency ratio add_sub | 12.50 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 9.03 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.23 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 1.36 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.23 |
| Bottlenecks | micro-operation queue |
| Function | clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99 |
| Source | pack_kernel.f90:109-113 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 17.17 |
| CQA cycles if no scalar integer | 14.00 |
| CQA cycles if FP arith vectorized | 17.17 |
| CQA cycles if fully vectorized | 12.60 |
| Front-end cycles | 17.17 |
| DIV/SQRT cycles | 14.00 |
| P0 cycles | 7.60 |
| P1 cycles | 8.33 |
| P2 cycles | 8.33 |
| P3 cycles | 5.50 |
| P4 cycles | 14.00 |
| P5 cycles | 7.80 |
| P6 cycles | 5.50 |
| P7 cycles | 5.50 |
| P8 cycles | 5.50 |
| P9 cycles | 7.60 |
| P10 cycles | 8.33 |
| P11 cycles | 0.00 |
| Inter-iter dependencies cycles | 0 |
| FE+BE cycles (UFS) | 16.69 - 30.48 |
| Stall cycles (UFS) | 1.02 - 14.80 |
| Nb insns | 73.00 |
| Nb uops | 103.00 |
| Nb loads | 18.00 |
| Nb stores | 4.00 |
| Nb stack references | 8.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 15.84 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 184.00 |
| Bytes stored | 88.00 |
| Stride 0 | 2.00 |
| Stride 1 | 0.00 |
| Stride n | 1.00 |
| Stride unknown | 2.00 |
| Stride indirect | 4.00 |
| Vectorization ratio all | 34.15 |
| Vectorization ratio load | 16.67 |
| Vectorization ratio store | 25.00 |
| Vectorization ratio mul | 66.67 |
| Vectorization ratio add_sub | 87.50 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 19.05 |
| Vector-efficiency ratio all | 39.63 |
| Vector-efficiency ratio load | 27.08 |
| Vector-efficiency ratio store | 34.38 |
| Vector-efficiency ratio mul | 68.75 |
| Vector-efficiency ratio add_sub | 89.06 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 24.11 |
| Path / |
| Function | clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99 |
| Source file and lines | pack_kernel.f90:109-113 |
| Module | exec |
| nb instructions | 49 |
| nb uops | 63.75 |
| loop length | 216.50 |
| used x86 registers | 12.75 |
| used mmx registers | 0 |
| used xmm registers | 0.50 |
| used ymm registers | 0 |
| used zmm registers | 6 |
| nb stack references | 5.75 |
| micro-operation queue | 10.63 cycles |
| front end | 10.63 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 8.40 | 4.80 | 5.33 | 5.33 | 2.88 | 8.38 | 4.85 | 2.88 | 2.88 | 2.88 | 4.82 | 5.33 |
| cycles | 8.40 | 5.20 | 5.33 | 5.33 | 2.88 | 8.38 | 4.85 | 2.88 | 2.88 | 2.88 | 4.82 | 5.33 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| FE+BE cycles | 10.68-17.20 |
| Stall cycles | 0.75-7.26 |
| ROB full (events) | 0.93-7.74 |
| Front-end | 10.63 |
| Dispatch | 8.80 |
| Data deps. | 0.00 |
| Overall L1 | 10.63 |
| all | 15% |
| load | 0% |
| store | 0% |
| mul | 44% |
| add-sub | 41% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 5% |
| all | 100% |
| load | 100% |
| store | 100% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 100% |
| all | 18% |
| load | 12% |
| store | 25% |
| mul | 44% |
| add-sub | 41% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 10% |
| all | 22% |
| load | 12% |
| store | 12% |
| mul | 47% |
| add-sub | 46% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 12% |
| all | 100% |
| load | 100% |
| store | 100% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 100% |
| all | 24% |
| load | 23% |
| store | 34% |
| mul | 47% |
| add-sub | 46% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 16% |
| Function | clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99 |
| Source file and lines | pack_kernel.f90:109-113 |
| Module | exec |
| nb instructions | 9 |
| nb uops | 8 |
| loop length | 28 |
| used x86 registers | 6 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 0 |
| micro-operation queue | 1.33 cycles |
| front end | 1.33 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.00 | 1.00 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.00 |
| cycles | 1.00 | 1.00 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| FE+BE cycles | 1.39 |
| Stall cycles | 0.00 |
| Front-end | 1.33 |
| Dispatch | 1.00 |
| Data deps. | 0.00 |
| Overall L1 | 1.33 |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 6% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 6% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 6% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MOV %ESI,%R10D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| LEA 0x1(%R8),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| INC %EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV %R10D,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| CMP %EDX,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV %EAX,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| JE 4490b6 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x76> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| TEST %ESI,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
| JLE 449140 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x100> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| Function | clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99 |
| Source file and lines | pack_kernel.f90:109-113 |
| Module | exec |
| nb instructions | 65 |
| nb uops | 95 |
| loop length | 303 |
| used x86 registers | 15 |
| used mmx registers | 0 |
| used xmm registers | 1 |
| used ymm registers | 0 |
| used zmm registers | 11 |
| nb stack references | 7 |
| micro-operation queue | 15.83 cycles |
| front end | 15.83 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 14.00 | 6.00 | 7.67 | 7.67 | 4.50 | 14.00 | 6.00 | 4.50 | 4.50 | 4.50 | 6.00 | 7.67 |
| cycles | 14.00 | 6.00 | 7.67 | 7.67 | 4.50 | 14.00 | 6.00 | 4.50 | 4.50 | 4.50 | 6.00 | 7.67 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| FE+BE cycles | 16.32-28.59 |
| Stall cycles | 1.98-14.24 |
| ROB full (events) | 2.39-15.01 |
| Front-end | 15.83 |
| Dispatch | 14.00 |
| Data deps. | 0.00 |
| Overall L1 | 15.83 |
| all | 33% |
| load | 0% |
| store | 0% |
| mul | 66% |
| add-sub | 77% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 12% |
| all | 100% |
| load | 100% |
| store | 100% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 100% |
| all | 38% |
| load | 20% |
| store | 50% |
| mul | 66% |
| add-sub | 77% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 22% |
| all | 38% |
| load | 12% |
| store | 12% |
| mul | 68% |
| add-sub | 79% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 17% |
| all | 100% |
| load | 100% |
| store | 100% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 100% |
| all | 43% |
| load | 30% |
| store | 56% |
| mul | 68% |
| add-sub | 79% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 26% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LEA 0x1(%R8),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| INC %EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV %R10D,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| CMP %EDX,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV %EAX,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| JE 4490b6 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x76> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| TEST %ESI,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
| JLE 449140 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x100> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV -0x68(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| LEA (%RAX,%R8,1),%R11D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| MOV -0x70(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOVSXD (%RAX),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV -0x58(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RAX),%R10D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x10aa1f(%RIP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x10aa50(%RIP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x10(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RBX),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x38(%RBX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
| MOV 0x18(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOVSXD (%RDX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x50(%RBX),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %ESI,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOV %RBX,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOVSXD %R11D,%R14 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| MOV $-0x8,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| AND %RSI,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| JE 449280 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x240> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| VPBROADCASTQ %RBX,%ZMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPBROADCASTQ %R9,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| LEA -0x1(%R11,%R10,1),%ESI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| IMUL %R10D,%ESI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOVSXD %ESI,%R11 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| ADD %RCX,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VPBROADCASTQ %R13,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPBROADCASTQ %R15,%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| INC %R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| IMUL %R14,%RDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPBROADCASTQ %RDX,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPBROADCASTQ %RAX,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV -0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VPBROADCASTQ %RAX,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| XOR %R12D,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| MOV -0x2c(%RBP),%EDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| JMP 449329 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x2e9> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
| VPBROADCASTQ %R12,%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPSUBQ %ZMM11,%ZMM8,%ZMM8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
| VPCMPNLEUQ %ZMM0,%ZMM8,%K1 | | | | | | | | | | | | | | | |
| ADD %R12,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VPBROADCASTQ %R11,%ZMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPADDQ %ZMM0,%ZMM8,%ZMM8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VPMULLQ %ZMM8,%ZMM5,%ZMM5 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
| VPADDQ %ZMM5,%ZMM6,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| KMOVQ %K1,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| VPXOR %XMM6,%XMM6,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| VGATHERQPD (,%ZMM5,1),%ZMM6{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
| VPADDQ %ZMM9,%ZMM7,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VMOVAPD %ZMM6,%ZMM3{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VPSUBQ %ZMM11,%ZMM10,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
| VPADDQ %ZMM1,%ZMM6,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VPMULLQ %ZMM6,%ZMM4,%ZMM4 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
| VPADDQ %ZMM4,%ZMM5,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VSCATTERQPD %ZMM3,(,%ZMM4,1){%K1} | 20 | 2.20 | 0.20 | 0 | 0 | 4 | 0.20 | 0.20 | 4 | 4 | 4 | 0.20 | 0 | 2-12 | 7 |
| JMP 449143 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x103> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
| Function | clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99 |
| Source file and lines | pack_kernel.f90:109-113 |
| Module | exec |
| nb instructions | 49 |
| nb uops | 49 |
| loop length | 194 |
| used x86 registers | 15 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 2 |
| nb stack references | 8 |
| micro-operation queue | 8.17 cycles |
| front end | 8.17 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 4.60 | 4.60 | 5.33 | 5.33 | 1.50 | 4.53 | 4.60 | 1.50 | 1.50 | 1.50 | 4.67 | 5.33 |
| cycles | 4.60 | 6.20 | 5.33 | 5.33 | 1.50 | 4.53 | 4.60 | 1.50 | 1.50 | 1.50 | 4.67 | 5.33 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| FE+BE cycles | 8.32 |
| Stall cycles | 0.00 |
| Front-end | 8.17 |
| Dispatch | 6.20 |
| Data deps. | 0.00 |
| Overall L1 | 8.17 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 10% |
| load | 12% |
| store | 12% |
| mul | 6% |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 9% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LEA 0x1(%R8),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| INC %EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV %R10D,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| CMP %EDX,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV %EAX,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| JE 4490b6 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x76> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| TEST %ESI,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
| JLE 449140 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x100> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV -0x68(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| LEA (%RAX,%R8,1),%R11D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| MOV -0x70(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOVSXD (%RAX),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV -0x58(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RAX),%R10D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x10aa1f(%RIP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x10aa50(%RIP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x10(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RBX),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x38(%RBX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
| MOV 0x18(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOVSXD (%RDX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x50(%RBX),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %ESI,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOV %RBX,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOVSXD %R11D,%R14 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| MOV $-0x8,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| AND %RSI,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| JE 449280 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x240> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| LEA (%R10,%RDI,1),%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| IMUL %R10D,%ESI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOVSXD %ESI,%RSI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| ADD %RCX,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| LEA -0x1(%R11,%R10,1),%R11D | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %R10,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
| IMUL %R10D,%R11D | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOVSXD %R11D,%R11 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| ADD %RCX,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VPBROADCASTQ %R13,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV -0x40(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VPBROADCASTQ %RCX,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| CMP %RBX,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| JNE 449300 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x2c0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV -0x2c(%RBP),%EDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV -0x60(%RBP),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| JMP 449143 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x103> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
| Function | clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99 |
| Source file and lines | pack_kernel.f90:109-113 |
| Module | exec |
| nb instructions | 73 |
| nb uops | 103 |
| loop length | 341 |
| used x86 registers | 15 |
| used mmx registers | 0 |
| used xmm registers | 1 |
| used ymm registers | 0 |
| used zmm registers | 11 |
| nb stack references | 8 |
| micro-operation queue | 17.17 cycles |
| front end | 17.17 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 14.00 | 7.60 | 8.33 | 8.33 | 5.50 | 14.00 | 7.80 | 5.50 | 5.50 | 5.50 | 7.60 | 8.33 |
| cycles | 14.00 | 7.60 | 8.33 | 8.33 | 5.50 | 14.00 | 7.80 | 5.50 | 5.50 | 5.50 | 7.60 | 8.33 |
| Cycles executing div or sqrt instructions | NA |
| Longest recurrence chain latency (RecMII) | 0.00 |
| FE+BE cycles | 16.69-30.48 |
| Stall cycles | 1.02-14.80 |
| ROB full (events) | 1.34-15.95 |
| Front-end | 17.17 |
| Dispatch | 14.00 |
| Data deps. | 0.00 |
| Overall L1 | 17.17 |
| all | 28% |
| load | 0% |
| store | 0% |
| mul | 66% |
| add-sub | 87% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 10% |
| all | 100% |
| load | 100% |
| store | 100% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 100% |
| all | 34% |
| load | 16% |
| store | 25% |
| mul | 66% |
| add-sub | 87% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 19% |
| all | 34% |
| load | 12% |
| store | 12% |
| mul | 68% |
| add-sub | 89% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 16% |
| all | 100% |
| load | 100% |
| store | 100% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 100% |
| all | 39% |
| load | 27% |
| store | 34% |
| mul | 68% |
| add-sub | 89% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 24% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LEA 0x1(%R8),%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| INC %EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV %R10D,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| CMP %EDX,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| MOV %EAX,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| JE 4490b6 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x76> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| TEST %ESI,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
| JLE 449140 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x100> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| MOV -0x68(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| LEA (%RAX,%R8,1),%R11D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| MOV -0x70(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOVSXD (%RAX),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV -0x58(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RAX),%R10D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x10aa1f(%RIP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x10aa50(%RIP),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x10(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV (%RBX),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x38(%RBX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
| MOV 0x18(%RBP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOVSXD (%RDX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV 0x50(%RBX),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV %ESI,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOV %RBX,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
| MOVSXD %R11D,%R14 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| MOV $-0x8,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| AND %RSI,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| JE 449280 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x240> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| LEA (%R10,%RDI,1),%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
| IMUL %R10D,%ESI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOVSXD %ESI,%RSI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| ADD %RCX,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| LEA -0x1(%R11,%R10,1),%R11D | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %R10,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
| IMUL %R10D,%R11D | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOVSXD %R11D,%R11 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
| ADD %RCX,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VPBROADCASTQ %R13,%ZMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV -0x40(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VPBROADCASTQ %RCX,%ZMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
| XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| CMP %RBX,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| JNE 449300 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x2c0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
| VPBROADCASTQ %R13,%ZMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPBROADCASTQ %R15,%ZMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPBROADCASTQ %R9,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV -0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VPBROADCASTQ %RAX,%ZMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPBROADCASTQ %RBX,%ZMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| MOV -0x2c(%RBP),%EDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| MOV -0x60(%RBP),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
| VPBROADCASTQ %R12,%ZMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPSUBQ %ZMM11,%ZMM8,%ZMM8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
| VPCMPNLEUQ %ZMM0,%ZMM8,%K1 | |||||||||||||||
| ADD %R12,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
| VPBROADCASTQ %R11,%ZMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| VPADDQ %ZMM0,%ZMM8,%ZMM8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VPMULLQ %ZMM8,%ZMM5,%ZMM5 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
| VPADDQ %ZMM5,%ZMM6,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| KMOVQ %K1,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| VPXOR %XMM6,%XMM6,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
| VGATHERQPD (,%ZMM5,1),%ZMM6{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
| VPADDQ %ZMM9,%ZMM7,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VMOVAPD %ZMM6,%ZMM3{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
| VPSUBQ %ZMM11,%ZMM10,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
| VPADDQ %ZMM1,%ZMM6,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VPMULLQ %ZMM6,%ZMM4,%ZMM4 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
| VPADDQ %ZMM4,%ZMM5,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| VSCATTERQPD %ZMM3,(,%ZMM4,1){%K1} | 20 | 2.20 | 0.20 | 0 | 0 | 4 | 0.20 | 0.20 | 4 | 4 | 4 | 0.20 | 0 | 2-12 | 7 |
| JMP 449143 <pack_kernel_module_mp_clover_unpack_message_left_.DIR.OMP.PARALLEL.LOOP.2.split99+0x103> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
