| Function: loadAtomsBuffer | Module: exec | Source: haloExchange.c:373-393 | Coverage: 0.05% |
|---|
| Function: loadAtomsBuffer | Module: exec | Source: haloExchange.c:373-393 | Coverage: 0.05% |
|---|
/home/hbollore/qaas-runs/170-265-5545/intel/CoMD/build/CoMD/CoMD/src-openmp/haloExchange.c: 373 - 393 |
-------------------------------------------------------------------------------- |
373: int nCells = parms->nCells[face]; |
374: int* cellList = parms->cellList[face]; |
375: int nBuf = 0; |
376: for (int iCell=0; iCell<nCells; ++iCell) |
377: { |
378: int iBox = cellList[iCell]; |
379: int iOff = iBox*MAXATOMS; |
380: for (int ii=iOff; ii<iOff+s->boxes->nAtoms[iBox]; ++ii) |
381: { |
382: buf[nBuf].gid = s->atoms->gid[ii]; |
383: buf[nBuf].type = s->atoms->iSpecies[ii]; |
384: buf[nBuf].rx = s->atoms->r[ii][0] + shift[0]; |
385: buf[nBuf].ry = s->atoms->r[ii][1] + shift[1]; |
386: buf[nBuf].rz = s->atoms->r[ii][2] + shift[2]; |
387: buf[nBuf].px = s->atoms->p[ii][0]; |
388: buf[nBuf].py = s->atoms->p[ii][1]; |
389: buf[nBuf].pz = s->atoms->p[ii][2]; |
390: ++nBuf; |
391: } |
392: } |
393: return nBuf*sizeof(AtomMsg); |
0x405940 LDR W9, [X0, X2,SXTW #2] |
0x405944 CMP W9, #1 |
0x405948 B.LT 405a70 |
0x40594c ADD X11, X0, W2,SXTW #3 |
0x405950 LDP X13, X14, [X1, #16] |
0x405954 ORR X10, XZR, XZR |
0x405958 LDR X12, [X11, #72] |
0x40595c LDR X11, [X11, #24] |
0x405960 ORR W8, WZR, WZR |
0x405964 LDP D2, D3, [X13, #72] |
0x405968 LDP D0, D1, [X12] |
0x40596c FMUL D1, D1, D3 |
0x405970 LDR D3, [X13, #88] |
0x405974 ADD X13, X3, #24 |
0x405978 FMUL D0, D0, D2 |
0x40597c LDR D2, [X12, #16] |
0x405980 LDR X12, [X14, #120] |
0x405984 MOVZ W14, #56 |
0x405988 FMUL D2, D2, D3 |
0x40598c B 4059ac |
0x405990 HINT #0 |
0x405994 HINT #0 |
0x405998 HINT #0 |
0x40599c HINT #0 |
(66) 0x4059a0 ADD X10, X10, #1 |
(66) 0x4059a4 CMP X10, X9 |
(66) 0x4059a8 B.EQ 405a74 |
(66) 0x4059ac LDRSW X15, [X11, X10,LSL #2] |
(66) 0x4059b0 LDR W16, [X12, X15,LSL #2] |
(66) 0x4059b4 CMP W16, #1 |
(66) 0x4059b8 B.LT 4059a0 |
(66) 0x4059bc LDR X0, [X1, #32] |
(66) 0x4059c0 UBFM W17, W15, #26, #25 |
(66) 0x4059c4 ORR X16, XZR, XZR |
(66) 0x4059c8 SBFM X18, X17, #0, #31 |
(66) 0x4059cc ADD X4, X18, W17,SXTW #1 |
(66) 0x4059d0 LDP X2, X3, [X0, #8] |
(66) 0x4059d4 LDP X5, X0, [X0, #24] |
(66) 0x4059d8 UBFM X4, X4, #61, #60 |
(66) 0x4059dc ADD X6, X0, X4 |
(66) 0x4059e0 ADD X5, X5, X4 |
(66) 0x4059e4 SBFM X4, X17, #62, #31 |
(66) 0x4059e8 ADD X0, X3, X4 |
(66) 0x4059ec ADD X2, X2, X4 |
(66) 0x4059f0 SMADDL X3, W8, W14, X13 |
(66) 0x4059f4 ADD X4, X6, #16 |
(66) 0x4059f8 ADD X5, X5, #16 |
(66) 0x4059fc HINT #0 |
(67) 0x405a00 LDUR D3, [X5, #496] |
(67) 0x405a04 LDR W6, [X2, X16,LSL #2] |
(67) 0x405a08 STUR W6, [X3, #488] |
(67) 0x405a0c LDR W6, [X0, X16,LSL #2] |
(67) 0x405a10 ADD X16, X16, #1 |
(67) 0x405a14 FADD D3, D0, D3 |
(67) 0x405a18 ADD X7, X18, X16 |
(67) 0x405a1c STUR D3, [X3, #496] |
(67) 0x405a20 LDUR D3, [X5, #504] |
(67) 0x405a24 STUR W6, [X3, #492] |
(67) 0x405a28 LDRSW X6, [X12, X15,LSL #2] |
(67) 0x405a2c ADD X6, X6, W17,SXTW |
(67) 0x405a30 FADD D3, D1, D3 |
(67) 0x405a34 CMP X7, X6 |
(67) 0x405a38 STUR D3, [X3, #504] |
(67) 0x405a3c LDR D3, [X5], #24 |
(67) 0x405a40 FADD D3, D2, D3 |
(67) 0x405a44 STR D3, [X3] |
(67) 0x405a48 LDUR D3, [X4, #496] |
(67) 0x405a4c STR D3, [X3, #8] |
(67) 0x405a50 LDUR D3, [X4, #504] |
(67) 0x405a54 STR D3, [X3, #16] |
(67) 0x405a58 LDR D3, [X4], #24 |
(67) 0x405a5c STR D3, [X3, #24] |
(67) 0x405a60 ADD X3, X3, #56 |
(67) 0x405a64 B.LT 405a00 |
(66) 0x405a68 ADD W8, W8, W16 |
(66) 0x405a6c B 4059a0 |
0x405a70 ORR W8, WZR, WZR |
0x405a74 UBFM W9, W8, #26, #25 |
0x405a78 SUB W0, W9, W8,LSL #3 |
0x405a7c RET |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►99.16+ | timestep | timestep.c:150 | exec |
| ○ | main | CoMD.c:125 | exec |
| ○ | __libc_start_main | | libc-2.31.so |
| ○ | _start | | exec |
| Path / |
| Source file and lines | haloExchange.c:373-393 |
| Module | exec |
| nb instructions | 28 |
| loop length | 112 |
| nb stack references | 0 |
| front end | 3.00 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 2.25 | 2.25 | 2.25 | 2.25 | 0.75 | 0.75 | 0.75 | 0.75 | 3.00 | 3.00 | 3.00 | 0.00 | 0.00 |
| cycles | 1.50 | 1.50 | 2.25 | 2.25 | 2.25 | 2.25 | 0.75 | 0.75 | 0.75 | 0.75 | 3.00 | 3.00 | 3.00 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 3.00 |
| Overall L1 | 3.00 |
| all | 33% |
| load | 50% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LDR W9, [X0, X2,SXTW #2] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
| CMP W9, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| B.LT 405a70 <loadAtomsBuffer+0x130> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| ADD X11, X0, W2,SXTW #3 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
| LDP X13, X14, [X1, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
| ORR X10, XZR, XZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| LDR X12, [X11, #72] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
| LDR X11, [X11, #24] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
| ORR W8, WZR, WZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| LDP D2, D3, [X13, #72] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
| LDP D0, D1, [X12] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
| FMUL D1, D1, D3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 |
| LDR D3, [X13, #88] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
| ADD X13, X3, #24 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| FMUL D0, D0, D2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 |
| LDR D2, [X12, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
| LDR X12, [X14, #120] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
| MOVZ W14, #56 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| FMUL D2, D2, D3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 |
| B 4059ac <loadAtomsBuffer+0x6c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| HINT #0 | ||||||||||||||||||
| HINT #0 | ||||||||||||||||||
| HINT #0 | ||||||||||||||||||
| HINT #0 | ||||||||||||||||||
| ORR W8, WZR, WZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| UBFM W9, W8, #26, #25 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| SUB W0, W9, W8,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| Source file and lines | haloExchange.c:373-393 |
| Module | exec |
| nb instructions | 28 |
| loop length | 112 |
| nb stack references | 0 |
| front end | 3.00 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 2.25 | 2.25 | 2.25 | 2.25 | 0.75 | 0.75 | 0.75 | 0.75 | 3.00 | 3.00 | 3.00 | 0.00 | 0.00 |
| cycles | 1.50 | 1.50 | 2.25 | 2.25 | 2.25 | 2.25 | 0.75 | 0.75 | 0.75 | 0.75 | 3.00 | 3.00 | 3.00 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 3.00 |
| Overall L1 | 3.00 |
| all | 33% |
| load | 50% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LDR W9, [X0, X2,SXTW #2] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
| CMP W9, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
| B.LT 405a70 <loadAtomsBuffer+0x130> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| ADD X11, X0, W2,SXTW #3 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
| LDP X13, X14, [X1, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
| ORR X10, XZR, XZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| LDR X12, [X11, #72] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
| LDR X11, [X11, #24] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
| ORR W8, WZR, WZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| LDP D2, D3, [X13, #72] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
| LDP D0, D1, [X12] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
| FMUL D1, D1, D3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 |
| LDR D3, [X13, #88] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
| ADD X13, X3, #24 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| FMUL D0, D0, D2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 |
| LDR D2, [X12, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
| LDR X12, [X14, #120] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
| MOVZ W14, #56 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| FMUL D2, D2, D3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 |
| B 4059ac <loadAtomsBuffer+0x6c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| HINT #0 | ||||||||||||||||||
| HINT #0 | ||||||||||||||||||
| HINT #0 | ||||||||||||||||||
| HINT #0 | ||||||||||||||||||
| ORR W8, WZR, WZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| UBFM W9, W8, #26, #25 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| SUB W0, W9, W8,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼loadAtomsBuffer– | 0.05 | 0.01 |
| ▼Loop 66 - haloExchange.c:376-389 - exec– | 0 | 0.03 |
| ○Loop 67 - haloExchange.c:380-389 - exec | 0.05 | 0.57 |
