GCC rychle kopirovani pameti
Jaroslav Buchta
jaroslav.buchta na hascomp.cz
Pátek Červen 8 12:01:10 CEST 2018
Zdravim, dnes se zase nestiham divit, nevim, na cem je to jak zavisle
ale bezna funkce memcpy vede jak v optimalizovanem tak v
neoptimalizovanem prekladu k hloupemu kopirovani po 1B
Zkusil jsem si napsat takovouto fci
void fastcpy(void *dst, void *src, int len)
{
while (len >= 8)
{
*(uint64_t*)dst = *(uint64_t*)src;
dst += 8;
src += 8;
len -= 8;
}
while (len >= 4)
{
*(uint32_t*)dst = *(uint32_t*)src;
dst += 4;
src += 4;
len -= 4;
}
if (len >= 2)
{
*(uint16_t*)dst = *(uint16_t*)src;
dst += 2;
src += 2;
len -= 2;
}
if (len != 0)
{
*(uint8_t*)dst = *(uint8_t*)src;
}
}
ktera vede ke kyzenemu vysledku, je to vyrazne rychlejsi, ovsem POUZE,
KDYZ NENI NASTAVENA OPTIMALIZACE O3, tam nejak prekladac usoudi, ze je
to kopirovani bloku a vola memcpy... Je to vubec mozne, nebo delam
nejakou chybu???
Jak mam vynutit volani te funkce? (krome obchuzky, ze pro ni zakazu
optimalizaci...)
////////////////////////////////
Optimalizovany kod:
100 mccState.iB = (float)(mccState.bufAdc1[2]);
080004c2: vstr s14, [r4, #68] ; 0x44
080004c6: vstr s15, [r4, #72] ; 0x48
102 if (!mccStateCopy.f_LockCopy)
080004ca: bmi.n 0x8000400 <HAL_ADC_ConvCpltCallback+24>
080004cc: movs r2, #72 ; 0x48
080004ce: mov r1, r4
080004d0: add.w r0, r4, #76 ; 0x4c
080004d4: bl 0x8005e00 <memcpy>
72 *(uint32_t*)dst = *(uint32_t*)src;
Neoptimalizovany kod:
101 mccState.iC = (float)(mccState.bufAdc1[3]);
08000528: vmov s15, r1
0800052c: vcvt.f32.u32 s15, s15
08000530: vstr s15, [r3, #72] ; 0x48
102 if (!mccStateCopy.f_LockCopy)
08000534: ldrb.w r3, [r3, #76] ; 0x4c
08000538: tst.w r3, #2
0800053c: bne.n 0x8000470 <HAL_ADC_ConvCpltCallback+24>
104 fastcpy((void *)&mccStateCopy, (void *)&mccState,
sizeof(MCCOMSTATE));
0800053e: ldr r0, [pc, #16] ; (0x8000550
<HAL_ADC_ConvCpltCallback+248>)
08000540: movs r2, #76 ; 0x4c
08000542: mov r1, r0
08000544: add r0, r2
08000546: bl 0x80003f4 <fastcpy>
fastcpy:
080003f4: cmp r2, #7
080003f6: ble.n 0x800043a <fastcpy+70>
62 {
080003f8: push {r4, r5}
65 *(uint64_t*)dst = *(uint64_t*)src;
080003fa: ldrd r4, r5, [r1], #8
080003fe: strd r4, r5, [r0], #8
68 len -= 8;
08000402: subs r2, #8
63 while (len >= 8)
08000404: cmp r2, #7
08000406: bgt.n 0x80003fa <fastcpy+6>
08000408: b.n 0x8000414 <fastcpy+32>
72 *(uint32_t*)dst = *(uint32_t*)src;
0800040a: ldr.w r3, [r1], #4
0800040e: str.w r3, [r0], #4
75 len -= 4;
08000412: subs r2, #4
70 while (len >= 4)
08000414: cmp r2, #3
08000416: bgt.n 0x800040a <fastcpy+22>
77 if (len >= 2)
08000418: cmp r2, #1
0800041a: ble.n 0x8000426 <fastcpy+50>
79 *(uint16_t*)dst = *(uint16_t*)src;
0800041c: ldrh.w r3, [r1], #2
08000420: strh.w r3, [r0], #2
82 len -= 2;
08000424: subs r2, #2
84 if (len != 0)
08000426: cbz r2, 0x800042c <fastcpy+56>
86 *(uint8_t*)dst = *(uint8_t*)src;
08000428: ldrb r3, [r1, #0]
0800042a: strb r3, [r0, #0]
88 }
0800042c: pop {r4, r5}
0800042e: bx lr
08000430: ldr.w r3, [r1], #4
08000434: str.w r3, [r0], #4
75 len -= 4;
08000438: subs r2, #4
0800043a: cmp r2, #3
0800043c: bgt.n 0x8000430 <fastcpy+60>
0800043e: cmp r2, #1
08000440: ble.n 0x800044c <fastcpy+88>
08000442: ldrh.w r3, [r1], #2
08000446: strh.w r3, [r0], #2
82 len -= 2;
0800044a: subs r2, #2
84 if (len != 0)
0800044c: cmp r2, #0
0800044e: beq.n 0x800042e <fastcpy+58>
86 *(uint8_t*)dst = *(uint8_t*)src;
08000450: ldrb r3, [r1, #0]
08000452: strb r3, [r0, #0]
08000454: bx lr
Další informace o konferenci Hw-list