| Function: hypre_IJMatrixAssembleParCSR._omp_fn.1 | Module: exec | Source: IJMatrix_parcsr.c:2798-2812 | Coverage: 0.56% |
|---|
| Function: hypre_IJMatrixAssembleParCSR._omp_fn.1 | Module: exec | Source: IJMatrix_parcsr.c:2798-2812 | Coverage: 0.56% |
|---|
/home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/IJ_mv/IJMatrix_parcsr.c: 2798 - 2812 |
-------------------------------------------------------------------------------- |
2798: #pragma omp parallel for private (i,j,j0,temp) |
2799: #endif |
2800: for (i=0; i < num_rows; i++) |
2801: { |
2802: j0 = diag_i[i]; |
2803: for (j=j0; j < diag_i[i+1]; j++) |
2804: { |
2805: diag_j[j] -= col_0; |
2806: if (diag_j[j] == i) |
2807: { |
2808: temp = diag_data[j0]; |
2809: diag_data[j0] = diag_data[j]; |
2810: diag_data[j] = temp; |
2811: diag_j[j] = diag_j[j0]; |
2812: diag_j[j0] = i; |
0x57f7b0 PUSH %RBP |
0x57f7b1 MOV %RSP,%RBP |
0x57f7b4 PUSH %R13 |
0x57f7b6 MOV %RDI,%R13 |
0x57f7b9 PUSH %R12 |
0x57f7bb PUSH %RBX |
0x57f7bc SUB $0x8,%RSP |
0x57f7c0 CALL 40f0b0 <omp_get_num_threads@plt> |
0x57f7c5 MOV %EAX,%EBX |
0x57f7c7 CALL 40f1f0 <omp_get_thread_num@plt> |
0x57f7cc MOVSXD %EBX,%RSI |
0x57f7cf MOVSXD %EAX,%RCX |
0x57f7d2 MOV 0x18(%R13),%RAX |
0x57f7d6 CQTO |
0x57f7d8 IDIV %RSI |
0x57f7db CMP %RDX,%RCX |
0x57f7de JL 57f89e |
0x57f7e4 IMUL %RAX,%RCX |
0x57f7e8 ADD %RCX,%RDX |
0x57f7eb LEA (%RAX,%RDX,1),%R12 |
0x57f7ef CMP %R12,%RDX |
0x57f7f2 JGE 57f893 |
0x57f7f8 MOV (%R13),%R8 |
0x57f7fc LEA 0x1(%RDX),%RBX |
0x57f800 MOV 0x20(%R13),%R10 |
0x57f804 MOV 0x10(%R13),%R11 |
0x57f808 MOV 0x8(%R13),%RDI |
0x57f80c LEA (%R8,%RBX,8),%R13 |
(2823) 0x57f810 MOV -0x8(%R13),%RCX |
(2823) 0x57f814 LEA (,%RCX,8),%R9 |
(2823) 0x57f81c LEA (%R11,%R9,1),%RAX |
(2823) 0x57f820 ADD %RDI,%R9 |
(2823) 0x57f823 CMP (%R13),%RCX |
(2823) 0x57f827 JGE 57f849 |
(2823) 0x57f829 NOPL (%RAX) |
(2824) 0x57f830 MOV (%RDI,%RCX,8),%RSI |
(2824) 0x57f834 SUB %R10,%RSI |
(2824) 0x57f837 MOV %RSI,(%RDI,%RCX,8) |
(2824) 0x57f83b CMP %RSI,%RDX |
(2824) 0x57f83e JE 57f860 |
(2824) 0x57f840 INC %RCX |
(2824) 0x57f843 CMP %RCX,(%R13) |
(2824) 0x57f847 JG 57f830 |
(2823) 0x57f849 MOV %RBX,%RDX |
(2823) 0x57f84c ADD $0x8,%R13 |
(2823) 0x57f850 CMP %RBX,%R12 |
(2823) 0x57f853 JE 57f893 |
(2823) 0x57f855 INC %RBX |
(2823) 0x57f858 JMP 57f810 |
0x57f85a NOPW (%RAX,%RAX,1) |
(2824) 0x57f860 VMOVSD (%R11,%RCX,8),%XMM1 |
(2824) 0x57f866 VMOVSD (%RAX),%XMM0 |
(2824) 0x57f86a MOV (%R9),%R8 |
(2824) 0x57f86d VMOVSD %XMM1,(%RAX) |
(2824) 0x57f871 VMOVSD %XMM0,(%R11,%RCX,8) |
(2824) 0x57f877 MOV %R8,(%RDI,%RCX,8) |
(2824) 0x57f87b INC %RCX |
(2824) 0x57f87e MOV %RDX,(%R9) |
(2824) 0x57f881 CMP %RCX,(%R13) |
(2824) 0x57f885 JG 57f830 |
(2823) 0x57f887 MOV %RBX,%RDX |
(2823) 0x57f88a ADD $0x8,%R13 |
(2823) 0x57f88e CMP %RBX,%R12 |
(2823) 0x57f891 JNE 57f855 |
0x57f893 ADD $0x8,%RSP |
0x57f897 POP %RBX |
0x57f898 POP %R12 |
0x57f89a POP %R13 |
0x57f89c POP %RBP |
0x57f89d RET |
0x57f89e INC %RAX |
0x57f8a1 XOR %EDX,%EDX |
0x57f8a3 JMP 57f7e4 |
0x57f8a8 NOPL (%RAX,%RAX,1) |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ○100.00 | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
| Path / |
| Source file and lines | IJMatrix_parcsr.c:2798-2812 |
| Module | exec |
| nb instructions | 39 |
| nb uops | 97 |
| loop length | 131 |
| used x86 registers | 13 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 0 |
| micro-operation queue | 24.25 cycles |
| front end | 24.25 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
|---|---|---|---|---|---|---|---|---|
| uops | 19.00 | 19.00 | 5.50 | 5.17 | 6.00 | 19.00 | 19.00 | 5.33 |
| cycles | 19.00 | 19.00 | 5.50 | 5.17 | 6.00 | 19.00 | 19.00 | 5.33 |
| Cycles executing div or sqrt instructions | 24.00-90.00 |
| FE+BE cycles | 24.32-90.19 |
| Stall cycles | 13.51-79.38 |
| ROB full (events) | 14.74-80.63 |
| Front-end | 24.25 |
| Dispatch | 19.00 |
| DIV/SQRT | 24.00-90.00 |
| Overall L1 | 24.25-90.00 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 0% |
| all | 12% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 12% |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 12% |
| other | 12% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|
| PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
| MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
| PUSH %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
| MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
| PUSH %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
| PUSH %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
| SUB $0x8,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| CALL 40f0b0 | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
| MOV %EAX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
| CALL 40f1f0 | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
| MOVSXD %EBX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| MOVSXD %EAX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| MOV 0x18(%R13),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
| CQTO | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
| IDIV %RSI | 57 | 14.25 | 14.25 | 0 | 0 | 0 | 14.25 | 14.25 | 0 | 42-95 | 24-90 |
| CMP %RDX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| JL 57f89e | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
| IMUL %RAX,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| LEA (%RAX,%RDX,1),%R12 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
| CMP %R12,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| JGE 57f893 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
| MOV (%R13),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
| LEA 0x1(%RDX),%RBX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
| MOV 0x20(%R13),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
| MOV 0x10(%R13),%R11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
| MOV 0x8(%R13),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
| LEA (%R8,%RBX,8),%R13 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
| ADD $0x8,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| POP %RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
| POP %R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
| POP %R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
| POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
| RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 |
| INC %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
| JMP 57f7e4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
| NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
| Source file and lines | IJMatrix_parcsr.c:2798-2812 |
| Module | exec |
| nb instructions | 39 |
| nb uops | 97 |
| loop length | 131 |
| used x86 registers | 13 |
| used mmx registers | 0 |
| used xmm registers | 0 |
| used ymm registers | 0 |
| used zmm registers | 0 |
| nb stack references | 0 |
| micro-operation queue | 24.25 cycles |
| front end | 24.25 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
|---|---|---|---|---|---|---|---|---|
| uops | 19.00 | 19.00 | 5.50 | 5.17 | 6.00 | 19.00 | 19.00 | 5.33 |
| cycles | 19.00 | 19.00 | 5.50 | 5.17 | 6.00 | 19.00 | 19.00 | 5.33 |
| Cycles executing div or sqrt instructions | 24.00-90.00 |
| FE+BE cycles | 24.32-90.19 |
| Stall cycles | 13.51-79.38 |
| ROB full (events) | 14.74-80.63 |
| Front-end | 24.25 |
| Dispatch | 19.00 |
| DIV/SQRT | 24.00-90.00 |
| Overall L1 | 24.25-90.00 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 0% |
| other | 0% |
| all | 12% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 12% |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | 12% |
| other | 12% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|
| PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
| MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
| PUSH %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
| MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
| PUSH %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
| PUSH %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
| SUB $0x8,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| CALL 40f0b0 | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
| MOV %EAX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
| CALL 40f1f0 | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
| MOVSXD %EBX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| MOVSXD %EAX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| MOV 0x18(%R13),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
| CQTO | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
| IDIV %RSI | 57 | 14.25 | 14.25 | 0 | 0 | 0 | 14.25 | 14.25 | 0 | 42-95 | 24-90 |
| CMP %RDX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| JL 57f89e | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
| IMUL %RAX,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
| ADD %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| LEA (%RAX,%RDX,1),%R12 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
| CMP %R12,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| JGE 57f893 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
| MOV (%R13),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
| LEA 0x1(%RDX),%RBX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
| MOV 0x20(%R13),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
| MOV 0x10(%R13),%R11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
| MOV 0x8(%R13),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
| LEA (%R8,%RBX,8),%R13 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
| ADD $0x8,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| POP %RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
| POP %R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
| POP %R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
| POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
| RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 |
| INC %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
| XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
| JMP 57f7e4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
| NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼hypre_IJMatrixAssembleParCSR._omp_fn.1– | 0.56 | 0.2 |
| ▼Loop 2823 - IJMatrix_parcsr.c:2802-2812 - exec– | 0 | 0 |
| ○Loop 2824 - IJMatrix_parcsr.c:2803-2812 - exec | 0.56 | 0.2 |
