-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARM32 prefetch triggers infinite loop #1139
Comments
Currently, a prefetch instruction will result in an infinite sleep where the CPU never wakes back up as it is expecting a dcache response even though this will not be delivered for prefetches. Work around this by overriding the _status field for prefetches in completeIfetch(). This allows my test workload from gem5#1139 to continue running beyond strlen() when using TimingSimpleCpu (but not other CPUs such as MinorCPU). Partially fixes: gem5#1139 Change-Id: Ic44bdb87f4099b11a7f9c6c99768a12fbef5842e
I have reduced my test case down to the following C code, which I would like to add to the gem5 tests, but I'm not quite sure how to do this.
//
// This test checks that software prefetches do not result in an infinite
// loop/panic. This includes prefetches with caches/MMU enabled+disabled.
//
// Two-level stringification: __XSTRING() is used below to paste the values of
// other macros into assembler source text.
#define __STRING(x) #x /* stringify without expanding x */
#define __XSTRING(x) __STRING(x) /* expand x, then stringify */
// Fixed-width types built from the compiler's predefined type macros
// (presumably to avoid depending on libc headers in this bare-metal test).
typedef __UINT32_TYPE__ uint32_t;
typedef __UINTPTR_TYPE__ uintptr_t;
// Backing storage for the C stack. The ARM procedure call standard requires
// sp to be 8-byte aligned at public interfaces, so align the array explicitly.
static char stack_memory[4096] __attribute__((aligned(8)));

// Entry point: set up a stack for C code and jump to test_code.
//
// The ARM stack grows downward (full-descending), so sp must start at the END
// of stack_memory. Starting at the base of the array would make the very first
// push write below the array into unrelated memory.
//
// NOTE(review): GCC documents only basic asm as supported inside naked
// functions; the operand form is kept from the original code.
__attribute__((naked, used)) void _start(void) {
    __asm__("mov sp, %0" : : "r"(stack_memory + sizeof(stack_memory)));
    __asm__("b test_code");
}
// Flag bits OR-ed into every page table entry. The bit positions are taken as
// given here; presumably they follow the ARMv7 short-descriptor "section"
// entry format -- confirm against the architecture manual before changing.
#define MMU_TYPE_1MB (0x2 << 0)
#define MMU_RW (0x3 << 10)
#define MMU_NORMAL_CACHEABLE ((0x0 << 12) | (0x3 << 2))
#define MMU_MAPPING_FLAGS (MMU_TYPE_1MB | MMU_RW | MMU_NORMAL_CACHEABLE)
// We need 4096 1MB mappings to cover the full 32-bit address space.
#define MMU_MAPPING_COUNT 4096
// Defined by the file-scope assembly below; enable_mmu() loads its address
// into TTBR.
extern uint32_t identity_page_table[MMU_MAPPING_COUNT];
// Generate the table at assembly time: entry _i is (_i << 20) | flags, i.e.
// each 1MB region maps onto itself. The table is 16KB-aligned, lives in
// .rodata, and the final ".text" switches back to the code section.
// clang-format off
__asm__(".section .rodata\n"
".global identity_page_table\n"
".balign 16384\n"
"identity_page_table:\n"
".set _i, 0\n" //
".rept " __XSTRING(MMU_MAPPING_COUNT) "\n"
" .4byte (_i << 20) |" __XSTRING(MMU_MAPPING_FLAGS) "\n"
" .set _i, _i + 1\n"
".endr\n"
".size identity_page_table, " __XSTRING(MMU_MAPPING_COUNT * 4) "\n"
".text");
// clang-format on
// Bit masks for the value read and written by read_sctlr()/write_sctlr().
// SCTLR_MMU is set by enable_mmu() and cleared again in test_code(); the other
// three are set together by enable_caches().
#define SCTLR_MMU (1 << 0)
#define SCTLR_DATA_L2 (1 << 2)
#define SCTLR_BRANCH_PRED (1 << 11)
#define SCTLR_ICACHE (1 << 12)
// Read the current SCTLR value from CP15 (c1, c0, 0).
//
// The asm must be volatile: an asm with outputs and no volatile qualifier is
// treated as a pure function of its inputs, so the compiler may merge or hoist
// repeated reads and hand back a stale value after an intervening
// write_sctlr().
static uint32_t read_sctlr(void) {
    uint32_t sctlr;
    __asm__ volatile("mrc p15, 0, %0, c1, c0, 0" : "=r"(sctlr));
    return sctlr;
}
// Write `value` to SCTLR via CP15 (c1, c0, 0), then execute an ISB so the new
// setting applies to the instructions that follow.
//
// The template references the operand as %0 rather than a hard-coded r0: the
// compiler is free to place `value` in any register (especially once this
// function is inlined), so naming r0 directly could write an unrelated value.
// The "memory" clobber stops the compiler from moving memory accesses across
// the control register change.
static void write_sctlr(uint32_t value) {
    __asm__ volatile("mcr p15, 0, %0, c1, c0, 0\n"
                     "isb\n"
                     :
                     : "r"(value)
                     : "memory");
}
// Install the identity page table and turn the MMU on.
//
// We have to set up an identity map and enable the MMU for caches.
// Additionally, all page table entries are set to Domain 0, so set up DACR
// so that Domain zero has permission checks enabled rather than "deny all".
static void enable_mmu(void) {
    __asm__ volatile(
        "mov r0, #1\n"
        "mcr p15, 0, r0, c3, c0, 0\n"      // Set DACR Domain 0 permissions checked
        "mcr p15, 0, %[TTBR], c2, c0, 0\n" // Write TTBR
        "mov r0, #0\n"
        // Note: we assume Data+L2 cache has been invalidated by reset.
        "mcr p15, 0, r0, c7, c5, 0\n"      // ICIALLU: invalidate instruction cache
        "mcr p15, 0, r0, c8, c7, 0\n"      // TLBIALL: invalidate TLB
        "mcr p15, 0, r0, c7, c5, 6\n"      // BPIALL: invalidate branch predictor
        // The maintenance operations above are only guaranteed complete after
        // a DSB; the ISB then makes their effect visible to later instructions.
        "dsb\n"
        "isb\n"
        :
        : [TTBR] "r"(identity_page_table)
        // r0 is used as scratch; "memory" keeps the compiler from reordering
        // memory accesses across the translation setup.
        : "r0", "memory");
    write_sctlr(read_sctlr() | SCTLR_MMU);
}
// Set the instruction cache, branch prediction and data/L2 cache enable bits
// in SCTLR, leaving every other bit as it was.
static void enable_caches(void) {
    const uint32_t cache_enable_bits =
        SCTLR_DATA_L2 | SCTLR_BRANCH_PRED | SCTLR_ICACHE;
    const uint32_t current = read_sctlr();
    write_sctlr(current | cache_enable_bits);
}
// Issue a semihosting request.
//
// operation: semihosting operation number, placed in r0.
// arg:       operation-specific argument (value or pointer), placed in r1.
// Returns whatever the host leaves in r0 after the call.
//
// "lr" is listed as clobbered because an SVC taken while already in
// Supervisor mode overwrites that mode's link register; without the clobber
// the compiler may assume lr still holds this function's return address.
uintptr_t call_semihosting(uintptr_t operation, uintptr_t arg) {
    uintptr_t result;
    __asm__ volatile("mov r0, %1\n"
                     "mov r1, %2\n"
                     "svc #0x123456\n"
                     "mov %0, r0\n"
                     : "=r"(result)
                     : "r"(operation), "r"(arg)
                     : "memory", "r0", "r1", "lr");
    return result;
}
// Operation numbers and exit reason passed to call_semihosting(). See:
// https://github.com/ARM-software/abi-aa/blob/main/semihosting/semihosting.rst
#define SEMIHOSTING_WRITE0 0x4 // used here to print a C string
#define SEMIHOSTING_EXIT 0x18 // used here to end the test
#define ADP_Stopped_ApplicationExit 0x20026 // argument given to SEMIHOSTING_EXIT
// 64-byte prefetch target. test_code() prefetches a different array element of
// this type in each configuration; the name suggests 64 bytes is meant to
// cover at least one cache line (assumption -- depends on the simulated cache).
struct AtLeastOneCacheline {
char data[64];
};
// Run one prefetch test case: print "Prefetch with <msg>... ", execute a PLD
// on `addr`, then print "OK". If the prefetch hangs the CPU, "OK" never
// appears. `msg` must be a string literal because it is concatenated at
// compile time.
#define test_prefetch(addr, msg) \
do { \
call_semihosting(SEMIHOSTING_WRITE0, \
(uintptr_t)("Prefetch with " msg "... ")); \
__asm__("pld [%0]\n" ::"r"(addr)); \
call_semihosting(SEMIHOSTING_WRITE0, (uintptr_t)"OK\n"); \
} while (0)
// C entry point, reached from _start. Issues a software prefetch in four
// configurations (MMU off, MMU on, MMU + caches on, MMU off again with the
// cache enable bits still set), reporting progress over semihosting, then
// asks the host to exit. Never returns.
__attribute__((used, noreturn)) void test_code(void) {
    call_semihosting(SEMIHOSTING_WRITE0, (uintptr_t)"Starting test...\n");
    // One distinct target per test case.
    static struct AtLeastOneCacheline prefetch_test[4];
    // The label previously read "Test MMU off, ...", which printed
    // "Prefetch with Test MMU off" and did not match the other cases.
    test_prefetch(&prefetch_test[0], "MMU off, caches off");
    enable_mmu();
    test_prefetch(&prefetch_test[1], "MMU on, caches off");
    enable_caches();
    test_prefetch(&prefetch_test[2], "MMU on, caches on");
    write_sctlr(read_sctlr() & ~SCTLR_MMU);
    // Finally test MMU off, with caches enable bits still on.
    // CPU will treat this as dcache disabled since it requires MMU on.
    test_prefetch(&prefetch_test[3], "MMU off, caches still on");
    call_semihosting(SEMIHOSTING_WRITE0, (uintptr_t)"Test complete!\n");
    call_semihosting(SEMIHOSTING_EXIT, ADP_Stopped_ApplicationExit);
    // Should not be reached: trap if the exit request returns.
    __builtin_trap();
    __builtin_unreachable();
} Can be compiled using I think it might be nice to add this test and the reproducer python config to the overall tests to ensure that software prefetches work for all CPU models. |
Currently, a prefetch instruction will result in an infinite sleep where the CPU never wakes back up as it is expecting a dcache response even though this will not be delivered for prefetches. Avoid this problem by rejecting uncacheable prefetches in the MMU translation logic. This allows my test workload from gem5#1139 to continue running beyond strlen(). Tested using Atomic,Timing,Minor and O3 CPU. See gem5#1139 for the test case that was used. Fixes: gem5#1139 Change-Id: Ic44bdb87f4099b11a7f9c6c99768a12fbef5842e
Describe the bug
I was trying to run a baremetal arm32 binary built using picolibc, which uses prefetch (
pld [srcin, #0]
) inside strlen(). When using TimingSimpleCpu or MinorCpu, the CPU no longer makes any forward progress and just keeps processing DRAM events.
Affects version
develop: 65976e4
gem5 Modifications
If you have modified gem5 in some way please state, to the best of your ability, how it has been modified.
To Reproduce
Steps to reproduce the behavior. Please assume starting from a clean repository:
build/ALL/gem5.debug --verbose --debug-flags=Semihosting,ExecAll,Faults,Decode,Decoder,MemCtrl,Event,CacheAll,MemoryAccess --debug-file=trace.log configs/example/repro.py --kernel $HOME/picolibc-arm32-build/test/posix-io
Terminal Output
I then get expected terminal output until it ends up in an infinite loop (which does not happen with the AtomicCpu) once the prefetch instruction is executed:
Decoded pld instruction: 0x18f890f000
Last few 100 lines of debug output for
--debug-flags=Semihosting,ExecAll,Faults,Decode,Decoder,MemCtrl,Event,CacheAll,MemoryAccess
before the infinite loop.
Expected behavior
Prefetch sends request to DRAM and CPU continues execution. I tried modifying timing.cc to change
if (_status == BaseSimpleCPU::Running)
to
if (_status == BaseSimpleCPU::Running || curStaticInst->isPrefetch()) {
but that didn't fix the problem.
Host Operating System
Debian
Host ISA
ARM
Compiler used
System GCC (12).
The text was updated successfully, but these errors were encountered: