Showing posts with label Linux kernel. Show all posts
Showing posts with label Linux kernel. Show all posts

Monday, September 25, 2017

Linux kernel debugging with GDB: getting a task running on a CPU

The current task is saved in per-cpu space for x86-64 and is accessed through the gs register at the current_task offset as
mov    %gs:0xd440,%rdx
(gdb) p/x &current_task
$63 = 0xd440
(gdb) p/x __per_cpu_offset[0]
$64 = 0xffff88001fc00000
(gdb) x/gx 0xffff88001fc00000+0xd440
0xffff88001fc0d440: 0xffff88001dea6a00
(gdb) p/d ((struct task_struct*)0xffff88001dea6a00)->pid
$67 = 243
(gdb) p/x ((struct task_struct*)0xffff88001dea6a00)->mm
$69 = 0xffff88001d1bc800
(gdb) p/x ((struct task_struct*)0xffff88001dea6a00)->active_mm
$70 = 0xffff88001d1bc800
(gdb) p/x __per_cpu_offset[2]
$73 = 0xffff88001fd00000
(gdb) x/gx 0xffff88001fd00000+0xd440
0xffff88001fd0d440: 0xffff88001f240000
(gdb) p/x ((struct task_struct*)0xffff88001f240000)->pid
$74 = 0x1
(gdb) lx-ps
0xffffffff81e104c0 <init_task> 0 swapper/0
0xffff88001f240000 1 systemd
0xffff88001f240d40 2 kthreadd
0xffff88001f2427c0 4 kworker/0:0H
0xffff88001f244240 6 mm_percpu_wq
0xffff88001f244f80 7 ksoftirqd/0
0xffff88001f245cc0 8 rcu_sched
0xffff88001f246a00 9 rcu_bh
0xffff88001f298000 10 migration/0
0xffff88001f298d40 11 watchdog/0
0xffff88001f29c240 12 cpuhp/0
0xffff88001f29cf80 13 cpuhp/1
0xffff88001f29dcc0 14 watchdog/1
0xffff88001f29ea00 15 migration/1
0xffff88001f2c8000 16 ksoftirqd/1
0xffff88001f2c9a80 18 kworker/1:0H
0xffff88001f2ca7c0 19 cpuhp/2
0xffff88001f2cb500 20 watchdog/2
0xffff88001f2cc240 21 migration/2
0xffff88001f2ccf80 22 ksoftirqd/2
0xffff88001f2cea00 24 kworker/2:0H
0xffff88001f310000 25 cpuhp/3
0xffff88001f310d40 26 watchdog/3
0xffff88001f311a80 27 migration/3
0xffff88001f3127c0 28 ksoftirqd/3
0xffff88001f314240 30 kworker/3:0H
0xffff88001f314f80 31 kdevtmpfs
0xffff88001f315cc0 32 netns
0xffff88001dc28000 34 khungtaskd
0xffff88001dc28d40 35 oom_reaper
0xffff88001dc29a80 36 writeback
0xffff88001dc2a7c0 37 kcompactd0
0xffff88001dc2b500 38 ksmd
0xffff88001dc2c240 39 crypto
0xffff88001dc2cf80 40 kintegrityd
0xffff88001dc2dcc0 41 bioset
0xffff88001dc2ea00 42 kblockd
0xffff88001dcd8000 43 ata_sff
0xffff88001dcd8d40 44 md
0xffff88001dcd9a80 45 edac-poller
0xffff88001dcda7c0 46 devfreq_wq
0xffff88001dcdb500 47 watchdogd
0xffff88001dcdc240 48 kworker/1:1
0xffff88001dcdcf80 49 kworker/2:1
0xffff88001dcddcc0 50 kworker/3:1
0xffff88001ddf8000 52 kauditd
0xffff88001ddf8d40 53 kswapd0
0xffff88001ddf9a80 54 bioset
0xffff88001ddfa7c0 55 ecryptfs-kthrea
0xffff88001dff0d40 72 kthrotld
0xffff88001dff1a80 73 acpi_thermal_pm
0xffff88001dff27c0 74 bioset
0xffff88001dff3500 75 bioset
0xffff88001dff4240 76 bioset
0xffff88001dff4f80 77 bioset
0xffff88001dff5cc0 78 bioset
0xffff88001dff6a00 79 bioset
0xffff88001dff0000 80 bioset
0xffff88001d660000 81 bioset
0xffff88001d660d40 82 scsi_eh_0
0xffff88001d661a80 83 scsi_tmf_0
0xffff88001d6627c0 84 scsi_eh_1
0xffff88001d663500 85 scsi_tmf_1
0xffff88001d718d40 91 ipv6_addrconf
0xffff88001d71dcc0 104 charger_manager
0xffff88001d71a7c0 105 bioset
0xffff88001d71ea00 106 bioset
0xffff88001d71c240 107 bioset
0xffff88001d719a80 110 jbd2/sda-8
0xffff88001d718000 111 ext4-rsv-conver
0xffff88001ddfdcc0 123 kworker/1:1H
0xffff88001ddfcf80 124 kworker/2:1H
0xffff88001ddfc240 127 kworker/0:1H
0xffff88001f350d40 135 kworker/3:2
0xffff88001dea4240 137 kworker/1:2
0xffff88001d0b0d40 140 systemd-journal
0xffff88001dea27c0 142 kworker/2:2
0xffff88001dea0d40 146 kworker/0:3
0xffff88001ded6a00 153 systemd-udevd
0xffff88001dea0000 156 kworker/3:1H
0xffff88001dea5cc0 227 cron
0xffff88001dea1a80 229 rsyslogd
0xffff88001ded0d40 235 in:imuxsock
0xffff88001ded0000 236 in:imklog
0xffff88001ded27c0 237 rs:main Q:Reg
0xffff88001ded1a80 233 agetty
0xffff88001c5d8d40 234 login
0xffff88001dea6a00 243 bash
0xffff88001dea3500 248 kworker/u8:2
0xffff88001c5d9a80 251 kworker/0:1
0xffff88001c5dc240 445 kworker/u8:1
0xffff88001c5ddcc0 452 kworker/u8:0

Sunday, August 13, 2017

Unwinding a kernel mode stack for exception in Linux.

Generally GDB is unable to unwind a kernel call stack with an exception frame on it. The unwinding stops on an exception processing. For example
(gdb) bt
#0  delay_tsc (__loops=5241148) at ../arch/x86/lib/delay.c:78
#1  0xffffffff8134b532 in __delay (loops=<optimised out>) at ../arch/x86/lib/delay.c:160
#2  __const_udelay (xloops=<optimised out>) at ../arch/x86/lib/delay.c:174
#3  0xffffffff81132620 in panic (fmt=<optimised out>) at ../kernel/panic.c:297
#4  0xffffffff8101f080 in oops_end (flags=70, regs=0xffffc90000227c18, signr=9) at ../arch/x86/kernel/dumpstack.c:235
#5  0xffffffff8104d1c7 in no_context (regs=0xffffc90000227c18, error_code=0, address=8, signal=<optimised out>, si_code=<optimised out>) at ../arch/x86/mm/fault.c:867
#6  0xffffffff8104d456 in __bad_area_nosemaphore (regs=0xffffc90000227c18, error_code=0, address=8, vma=<optimised out>, si_code=196609) at ../arch/x86/mm/fault.c:953
#7  0xffffffff8104d59f in bad_area_nosemaphore (regs=<optimised out>, error_code=<optimised out>, address=<optimised out>, vma=<optimised out>) at ../arch/x86/mm/fault.c:960
#8  0xffffffff8104d8e7 in __do_page_fault (regs=0xffffc90000227c18, error_code=0, address=8) at ../arch/x86/mm/fault.c:1387
#9  0xffffffff8104dccc in do_page_fault (regs=<optimised out>, error_code=<optimised out>) at ../arch/x86/mm/fault.c:1508
#10 0xffffffff8193ecd2 in page_fault () at ../arch/x86/entry/entry_64.S:1005
#11 0xffff88001decdb40 in ?? ()
#12 0xffff88001c02ae48 in ?? ()
#13 0xffffc90000227d98 in ?? ()
#14 0xffff88001d733000 in ?? ()
#15 0xffffc90000227cf0 in ?? ()
#16 0xffff88001d733000 in ?? ()
#17 0xffff88001cd02510 in ?? ()
#18 0xffffc90000227e18 in ?? ()
#19 0x0000000000000000 in ?? ()
Linux has a structure struct pt_regs to save thread context state. A pointer to this structure is provided to an exception processing routine and contains the context of a thread when an exception happened. Using register values from this structure a call stack at the moment of exception can be captured with GDB.
(gdb) f 8
#8  0xffffffff8104d8e7 in __do_page_fault (regs=0xffffc90000227c18, error_code=0, address=8) at ../arch/x86/mm/fault.c:1387
1387    bad_area_nosemaphore(regs, error_code, address, NULL);
Having a valid regs pointer set register values.
(gdb) p/x *regs
$2 = {r15 = 0xffff88001decdb40, r14 = 0xffff88001c02ae48, r13 = 0xffffc90000227d98, r12 = 0xffff88001d733000, bp = 0xffffc90000227cf0, bx = 0xffff88001d733000, r11 = 0xffff88001cd02510, 
  r10 = 0xffffc90000227e18, r9 = 0x0, r8 = 0xffff88001fc9d180, ax = 0x0, cx = 0x0, dx = 0x1000, si = 0xffff88001d733000, di = 0xffffc90000227d00, orig_ax = 0xffffffffffffffff, ip = 0xffffffff811aa30c, 
  cs = 0x10, flags = 0x246, sp = 0xffffc90000227cc8, ss = 0x18}
(gdb) set $rsp=0xffffc90000227cc8
(gdb) set $rip=0xffffffff811aa30c
(gdb) set $rbp=0xffffc90000227cf0
(gdb) set $rbx=0xffff88001d733000
(gdb) set $r15=0xffff88001decdb40
(gdb) set $r14=0xffff88001c02ae48
(gdb) set $r13=0xffffc90000227d98
(gdb) set $r12=0xffff88001d733000
(gdb) set $r11=0xffff88001cd02510
(gdb) set $r10=0xffffc90000227e18
(gdb) set $r9=0
(gdb) set $rsi=0xffff88001d733000
(gdb) set $rdi=0xffffc90000227d00
Now a call stack at the moment of exception can be examined.
(gdb) bt
#0  __read_once_size (size=<optimised out>, res=<optimised out>, p=<optimised out>) at ../include/linux/compiler.h:254
#1  __read_seqcount_begin (s=<optimised out>) at ../include/linux/seqlock.h:112
#2  raw_read_seqcount_begin (s=<optimised out>) at ../include/linux/seqlock.h:147
#3  read_seqcount_begin (s=<optimised out>) at ../include/linux/seqlock.h:164
#4  get_fs_root_rcu (root=<optimised out>, fs=<optimised out>) at ../fs/dcache.c:3222
#5  d_path (path=0xffffc90000227d00, buf=0xffff88001d733000 "", buflen=4096) at ../fs/dcache.c:3265
#6  0xffffffffc0000076 in redirfs_get_filename ()
#7  0xffffffffc0014121 in dummyflt_release (context=<optimised out>, args=0xffffc90000227d98) at /work/redirfs/src/dummyflt/dummyflt.c:104
#8  0xffffffffc000892e in rfs_precall_flts ()
#9  0xffffffffc0002a42 in rfs_release ()
#10 0xffffffff81193a7a in __fput (file=0xffff88001cd02500) at ../fs/file_table.c:209
#11 0xffffffff81193bb9 in ____fput (work=<optimised out>) at ../fs/file_table.c:245
#12 0xffffffff810758b9 in task_work_run () at ../kernel/task_work.c:116
#13 0xffffffff8105da35 in exit_task_work (task=<optimised out>) at ../include/linux/task_work.h:21
#14 do_exit (code=<optimised out>) at ../kernel/exit.c:878
#15 0xffffffff8105f14e in do_group_exit (exit_code=0) at ../kernel/exit.c:982
#16 0xffffffff8105f1bf in SYSC_exit_group (error_code=<optimised out>) at ../kernel/exit.c:993
#17 SyS_exit_group (error_code=<optimised out>) at ../kernel/exit.c:991
#18 0xffffffff8193d060 in entry_SYSCALL_64 () at ../arch/x86/entry/entry_64.S:203
#19 0x0000000000000000 in ?? ()

Tuesday, March 7, 2017

RISC-V Linux memory regions on boot

This text is based on memory_areas_on_boot.md from my GitHub repo  riscv-notes

On boot the kernel has the following memory areas required for code execution
  • vmlinux ELF code and data sections mapped by the bootloader
  • the page tables for virtual memory support created by the bootloader
  • initial stack
The pages used by the above regions must be marked as reserved so they are not used for memory allocations.
As shown here https://github.com/slavaim/riscv-notes/blob/master/linux/memory-initialization.md the kernel makes the following calls for memory reservation.
    memblock_reserve(info.base, __pa(_end) - info.base);
    reserve_boot_page_table(pfn_to_virt(csr_read(sptbr)));
The first call to memblock_reserve is to reserve the area from &_start to &_end , this area is defined in the following linker script.
SECTIONS
{
    /* Beginning of code and text segment */
    . = LOAD_OFFSET;
    _start = .;
    __init_begin = .;
    HEAD_TEXT_SECTION
    INIT_TEXT_SECTION(PAGE_SIZE)
    INIT_DATA_SECTION(16)
    /* we have to discard exit text and such at runtime, not link time */
    .exit.text :
    {
        EXIT_TEXT
    }
    .exit.data :
    {
        EXIT_DATA
    }
    PERCPU_SECTION(L1_CACHE_BYTES)
    __init_end = .;

    .text : {
        _text = .;
        _stext = .;
        TEXT_TEXT
        SCHED_TEXT
        LOCK_TEXT
        KPROBES_TEXT
        ENTRY_TEXT
        IRQENTRY_TEXT
        *(.fixup)
        _etext = .;
    }

    /* Start of data section */
    _sdata = .;
    RO_DATA_SECTION(PAGE_SIZE)
    RW_DATA_SECTION(0x40, PAGE_SIZE, THREAD_SIZE)
    .sdata : {
        _gp = . + 0x800;
        *(.sdata*)
    }
    .srodata : {
        *(.srodata*)
    }
    /* End of data section */
    _edata = .;

    BSS_SECTION(0x20, 0, 0x20)

    EXCEPTION_TABLE(0x10)
    NOTES

    .rel.dyn : {
        *(.rel.dyn*)
    }

    _end = .;

    STABS_DEBUG
    DWARF_DEBUG

    DISCARDS
}
As you can see this area encompasses all kernel code and data excluding debug information. This area starts at ffffffff80000000. You can easily find the start and end addresses from the System.map file. These values for my test kernel 
ffffffff80000000 T _start
ffffffff803b10b4 R _end
The second call to reserve_boot_page_table reserves the initial page table pages.
Where is a stack reservation? The stack is reserved by the first call to memblock_reserve as the initial stack is allocated from the kernel data section. The initial stack is statically allocated as init_thread_union.stack . The init_thread_union has the following type definition in linux/linux-4.6.2/include/linux/sched.h
union thread_union {
    struct thread_info thread_info;
    unsigned long stack[THREAD_SIZE/sizeof(long)];
};
For my test kernel the address of the init_thread_union is again extracted from System.map as
ffffffff8035e000 D init_thread_union
As you can see it is in the range of the region [&_start,&_end) and is in the data section.
The stack register is set on boot by the kernel entry routine _start defined in linux/linux-4.6.2/arch/riscv/kernel/head.S
__INIT
ENTRY(_start)
...
    /* Initialize stack pointer */
    la sp, init_thread_union + THREAD_SIZE
    /* Initialize current task_struct pointer */
    la tp, init_task
 ...
 END(_start)

RISC-V Linux kernel memory initialization on boot.

This text is based on memory-initialization.md from my GitHub repo  riscv-notes
The kernel is started with virtual memory initialized by the machine level bootloader BBL. A more detailed description can be found in this document - supervisor_vm_init.md .
The kernel start offset is defined in linux/linux-4.6.2/arch/riscv/include/asm/page.h
/*
 * PAGE_OFFSET -- the first address of the first page of memory.
 * When not using MMU this corresponds to the first free page in
 * physical memory (aligned on a page boundary).
 */
#ifdef CONFIG_64BIT
#define PAGE_OFFSET     _AC(0xffffffff80000000,UL)
#else
#define PAGE_OFFSET     _AC(0xc0000000,UL)
#endif
BBL initializes virtual memory for supervisor mode, maps the Linux kernel at PAGE_OFFSET, sets sptbr register value to a root page table physical address, switches to the supervisor mode with $pc set to the entry point _start. BBL does this in enter_supervisor_mode function defined in riscv-tools/riscv-pk/machine/minit.c
void enter_supervisor_mode(void (*fn)(uintptr_t), uintptr_t stack)
{
  uintptr_t mstatus = read_csr(mstatus);
  mstatus = INSERT_FIELD(mstatus, MSTATUS_MPP, PRV_S);
  mstatus = INSERT_FIELD(mstatus, MSTATUS_MPIE, 0);
  write_csr(mstatus, mstatus);
  write_csr(mscratch, MACHINE_STACK_TOP() - MENTRY_FRAME_SIZE);
  write_csr(mepc, fn);
  write_csr(sptbr, (uintptr_t)root_page_table >> RISCV_PGSHIFT);
  asm volatile ("mv a0, %0; mv sp, %0; mret" : : "r" (stack));
  __builtin_unreachable();
}
The important difference between the RISC-V case and many other CPUs (e.g. x86) is that the Linux kernel's entry point is called with virtual memory initialized by the bootloader executing at a higher privilege mode.
The memory management is initialized inside setup_arch routine defined in linux/linux-4.6.2/arch/riscv/kernel/setup.c, below only memory management relevant part of the function is shown
void __init setup_arch(char **cmdline_p)
{
...
    init_mm.start_code = (unsigned long) _stext;
    init_mm.end_code   = (unsigned long) _etext;
    init_mm.end_data   = (unsigned long) _edata;
    init_mm.brk        = (unsigned long) _end;

    setup_bootmem();
    ....
    paging_init();
    ....
}
The _stext, _etext, _edata, _end global variables are defined in the linker script linux/linux-4.6.2/arch/riscv/kernel/vmlinux.lds.S which defines the kernel memory layout. These variables define the kernel section borders. A thorough description of linker scripts can be found here https://sourceware.org/binutils/docs/ld/Scripts.html .
The first function being called is setup_bootmem
static void __init setup_bootmem(void)
{
    unsigned long ret;
    memory_block_info info;

    ret = sbi_query_memory(0, &info);
    BUG_ON(ret != 0);
    BUG_ON((info.base & ~PMD_MASK) != 0);
    BUG_ON((info.size & ~PMD_MASK) != 0);
    pr_info("Available physical memory: %ldMB\n", info.size >> 20);

    /* The kernel image is mapped at VA=PAGE_OFFSET and PA=info.base */
    va_pa_offset = PAGE_OFFSET - info.base;
    pfn_base = PFN_DOWN(info.base);

    if ((mem_size != 0) && (mem_size < info.size)) {
        memblock_enforce_memory_limit(mem_size);
        info.size = mem_size;
        pr_notice("Physical memory usage limited to %lluMB\n",
            (unsigned long long)(mem_size >> 20));
    }
    set_max_mapnr(PFN_DOWN(info.size));
    max_low_pfn = PFN_DOWN(info.base + info.size);

#ifdef CONFIG_BLK_DEV_INITRD
    setup_initrd();
#endif /* CONFIG_BLK_DEV_INITRD */

    memblock_reserve(info.base, __pa(_end) - info.base);
    reserve_boot_page_table(pfn_to_virt(csr_read(sptbr)));
    memblock_allow_resize();
}
The Linux kernel queries the available memory size in setup_bootmem by invoking the SBI interface's sbi_query_memory which results in a call to the __sbi_query_memory BBL routine executed (surprisingly) in supervisor mode, as SBI has been mapped to the supervisor virtual address space and the ecall instruction is not invoked for sbi_query_memory
uintptr_t __sbi_query_memory(uintptr_t id, memory_block_info *p)
{
  if (id == 0) {
    p->base = first_free_paddr;
    p->size = mem_size + DRAM_BASE - p->base;
    return 0;
  }

  return -1;
}
More about SBI can be found here https://github.com/slavaim/riscv-notes/blob/master/bbl/sbi-to-linux.md .
The kernel reserves the pages occupied by the kernel with a call to memblock_reserve(info.base, __pa(_end) - info.base); . Then a call to reserve_boot_page_table(pfn_to_virt(csr_read(sptbr))); reserves the pages occupied by the page table allocated by the bootloader, i.e. BBL. The Linux kernel retrieves the page table allocated and initialized by BBL by reading a physical address from the sptbr register and converting it to a virtual address. The page table virtual address is also saved at the master kernel Page Tables init_mm.pgd. The snippet is from linux/linux-4.6.2/arch/riscv/mm/init.c
void __init paging_init(void)
{
    init_mm.pgd = (pgd_t *)pfn_to_virt(csr_read(sptbr));
  ....
}

Sunday, March 5, 2017

RISC-V SBI mapping to Linux

This text is based on sbi-to-linux.md from my GitHub repo  riscv-notes
The machine level SBI ( Supervisor Binary Interface ) is exported to the Linux kernel by mapping it at the top of the address space.
The mapping is performed by BBL in supervisor_vm_init defined in riscv-tools/riscv-pk/bbl/bbl.c.
  // map SBI at top of vaddr space
  extern char _sbi_end;
  uintptr_t num_sbi_pages = ((uintptr_t)&_sbi_end - DRAM_BASE - 1) / RISCV_PGSIZE + 1;
  assert(num_sbi_pages <= (1 << RISCV_PGLEVEL_BITS));
  for (uintptr_t i = 0; i < num_sbi_pages; i++) {
    uintptr_t idx = (1 << RISCV_PGLEVEL_BITS) - num_sbi_pages + i;
    sbi_pt[idx] = pte_create((DRAM_BASE / RISCV_PGSIZE) + i, PTE_G | PTE_R | PTE_X);
  }
  pte_t* sbi_pte = middle_pt + ((num_middle_pts << RISCV_PGLEVEL_BITS)-1);
  assert(!*sbi_pte);
  *sbi_pte = ptd_create((uintptr_t)sbi_pt >> RISCV_PGSHIFT);
You can read more on supervisor_vm_init here https://github.com/slavaim/riscv-notes/blob/master/bbl/supervisor_vm_init.md . From the code above you can see that the last page ending at _sbi_end physical address is mapped at the last page of the supervisor virtual address space.
The offsets to SBI entry points are defined in riscv-tools/riscv-pk/machine/sbi.S as
.globl sbi_hart_id; sbi_hart_id = -2048
.globl sbi_num_harts; sbi_num_harts = -2032
.globl sbi_query_memory; sbi_query_memory = -2016
.globl sbi_console_putchar; sbi_console_putchar = -2000
.globl sbi_console_getchar; sbi_console_getchar = -1984
.globl sbi_send_ipi; sbi_send_ipi = -1952
.globl sbi_clear_ipi; sbi_clear_ipi = -1936
.globl sbi_timebase; sbi_timebase = -1920
.globl sbi_shutdown; sbi_shutdown = -1904
.globl sbi_set_timer; sbi_set_timer = -1888
.globl sbi_mask_interrupt; sbi_mask_interrupt = -1872
.globl sbi_unmask_interrupt; sbi_unmask_interrupt = -1856
.globl sbi_remote_sfence_vm; sbi_remote_sfence_vm = -1840
.globl sbi_remote_sfence_vm_range; sbi_remote_sfence_vm_range = -1824
.globl sbi_remote_fence_i; sbi_remote_fence_i = -1808
These definitions are offsets from the top of the address space for the SBI trampoline stubs defined in riscv-tools/riscv-pk/machine/sbi_entry.S
  # hart_id
  .align 4
  li a7, MCALL_HART_ID
  ecall
  ret

  # num_harts
  .align 4
  lw a0, num_harts
  ret

  # query_memory
  .align 4
  tail __sbi_query_memory
The SBI trampoline stubs code start is defined as sbi_base and is aligned to a page boundary by the align RISCV_PGSHIFT directive. The first RISCV_PGSIZE - 2048 bytes are reserved by the .skip RISCV_PGSIZE - 2048 directive so the first instruction starts at a 2048 byte offset from the page top defined as
.align RISCV_PGSHIFT
  .globl sbi_base
sbi_base:

  # TODO: figure out something better to do with this space.  It's not
  # protected from the OS, so beware.
  .skip RISCV_PGSIZE - 2048
The end of the section is also aligned at the page boundary and is defined as
  .align RISCV_PGSHIFT
  .globl _sbi_end
_sbi_end:
The SBI trampoline stubs section .sbi is placed at the end of BBL just before the payload by defining the layout in riscv-tools/riscv-pk/bbl/bbl.lds as
   .sbi :
  {
    *(.sbi)
  }

  .payload :
  {
    *(.payload)
  }

  _end = .;
So the supervisor_vm_init code that maps machine level physical addresses to supervisor virtual addresses maps the BBL .sbi section, which contains the SBI trampoline stubs, at the top of the supervisor virtual address space.
The Linux kernel accesses the SBI trampoline stubs by a call with offsets defined in linux/linux-4.6.2/arch/riscv/kernel/sbi.S which is a carbon copy of riscv-tools/riscv-pk/machine/sbi.S . For example a snippet from the Linux kernel entry point _start defined in linux/linux-4.6.2/arch/riscv/kernel/head.S
    /* See if we're the main hart */
    call sbi_hart_id
    bnez a0, .Lsecondary_start
This code is translated by GCC to
   0xffffffff80000018 <+24>:    jalr    -2048(zero) # 0xfffffffffffff800
   0xffffffff8000001c <+28>:    bnez    a0,0xffffffff80000054 <_start+84>
The address 0xfffffffffffff800 is a 2048 byte offset from the top of the virtual address space's last page. As we saw above this page is backed by a physical page with SBI trampoline stubs code starting at the sbi_base machine level physical address. Disassembling shows the SBI trampoline stubs at offsets
 0xfffffffffffff800 which is sbi_hart_id = -2048
 0xfffffffffffff810 which is sbi_num_harts = -2032
 0xfffffffffffff820 which is sbi_query_memory = -2016
 0xfffffffffffff830 which is sbi_console_putchar = -2000
 etc
(gdb) x/48i 0xfffffffffffff800
   0xfffffffffffff800:  li  a7,0
   0xfffffffffffff804:  ecall
   0xfffffffffffff808:  ret
   0xfffffffffffff80c:  nop
   0xfffffffffffff810:  auipc   a0,0xfffff
   0xfffffffffffff814:  lw  a0,-1888(a0)
   0xfffffffffffff818:  ret
   0xfffffffffffff81c:  nop
   0xfffffffffffff820:  j   0xffffffffffff92c0
   0xfffffffffffff824:  nop
   0xfffffffffffff828:  nop
   0xfffffffffffff82c:  nop
   0xfffffffffffff830:  li  a7,1
   0xfffffffffffff834:  ecall
   0xfffffffffffff838:  ret
   0xfffffffffffff83c:  nop
   0xfffffffffffff840:  li  a7,2
   0xfffffffffffff844:  ecall
   0xfffffffffffff848:  ret
   0xfffffffffffff84c:  nop
   0xfffffffffffff850:  unimp
   0xfffffffffffff854:  nop
   0xfffffffffffff858:  nop
   0xfffffffffffff85c:  nop
   0xfffffffffffff860:  li  a7,4
   0xfffffffffffff864:  ecall
   0xfffffffffffff868:  ret
   0xfffffffffffff86c:  nop
   0xfffffffffffff870:  li  a7,5
   0xfffffffffffff874:  ecall
   0xfffffffffffff878:  ret
   0xfffffffffffff87c:  nop
   0xfffffffffffff880:  lui a0,0x989
   0xfffffffffffff884:  addiw   a0,a0,1664
   0xfffffffffffff888:  ret
   0xfffffffffffff88c:  nop
   0xfffffffffffff890:  li  a7,6
   0xfffffffffffff894:  ecall
   0xfffffffffffff898:  nop
   0xfffffffffffff89c:  nop
   0xfffffffffffff8a0:  li  a7,7
   0xfffffffffffff8a4:  ecall
   0xfffffffffffff8a8:  ret
   0xfffffffffffff8ac:  nop
   0xfffffffffffff8b0:  j   0xffffffffffff92f8
   0xfffffffffffff8b4:  nop
   0xfffffffffffff8b8:  nop
   0xfffffffffffff8bc:  nop
As you can see not all SBI trampoline stubs invoke the ecall instruction to enter a higher privilege level, the machine level in this case. For example query_memory is just an unconditional jump to the SBI code mapped to the Linux kernel space.
 0xfffffffffffff820:    j   0xffffffffffff92c0
In that case the CPU doesn't switch to machine level and continues in supervisor mode with virtual memory enabled. When the CPU switches to machine mode it disables virtual address translation and switches back to physical addresses. Below is a call stack when query_memory is called. As you can see the CPU continues with virtual memory enabled and uses virtual addresses. The debugger was unable to resolve a call to query_memory in BBL as it was not aware of the code being remapped to the Linux system address space.
#0  0xffffffffffff92c8 in ?? ()
#1  0xffffffff80002c38 in setup_bootmem () at arch/riscv/kernel/setup.c:149
#2  setup_arch (cmdline_p=<optimized out>) at arch/riscv/kernel/setup.c:152
#3  0xffffffff80000898 in start_kernel () at init/main.c:500
#4  0xffffffff80000040 in _start () at arch/riscv/kernel/head.S:36
I guess that one of the possible reasons for such a query_memory implementation is to simplify development, as this function returns a structure which would otherwise require either packing it in registers or translating addresses either in the Linux kernel or in BBL SBI.
The call stack for sbi_hart_id looks differently
#0  0x0000000080000c90 in mcall_trap (regs=0x82660ec0, mcause=9, mepc=18446744073709549572) at ../machine/mtrap.c:210
#1  0x00000000800000ec in trap_vector () at ../machine/mentry.S:116
Backtrace stopped: frame did not save the PC
(gdb) p/x mepc
$4 = 0xfffffffffffff804
The virtual address translation is disabled and the CPU works with physical addresses. The debugger was unable to cross the boundary back to the Linux kernel stack as that requires processing the address space translation switch. As you can see the mepc register points to the ecall instruction's virtual address in supervisor mode
   0xfffffffffffff800:  li  a7,0
   0xfffffffffffff804:  ecall
   0xfffffffffffff808:  ret

RISC-V BBL supervisor_vm_init


This text is based on supervisor_vm_init.md from my GitHub repo  riscv-notes
The function builds page table structures to map the RISC-V BBL payload to supervisor mode. The function operates in the machine level physical address space. You should not be fooled by the presence of supervisor virtual addresses as they are adjusted to machine level physical addresses before being accessed.
static void supervisor_vm_init()
{
  uintptr_t highest_va = DRAM_BASE - first_free_paddr;
  mem_size = MIN(mem_size, highest_va - info.first_user_vaddr) & -MEGAPAGE_SIZE;

  pte_t* sbi_pt = (pte_t*)(info.first_vaddr_after_user + info.load_offset);
  memset(sbi_pt, 0, RISCV_PGSIZE);
  pte_t* middle_pt = (void*)sbi_pt + RISCV_PGSIZE;
#if __riscv_xlen == 32
  size_t num_middle_pts = 1;
  pte_t* root_pt = middle_pt;
  memset(root_pt, 0, RISCV_PGSIZE);
#else
  size_t num_middle_pts = (-info.first_user_vaddr - 1) / GIGAPAGE_SIZE + 1;
  pte_t* root_pt = (void*)middle_pt + num_middle_pts * RISCV_PGSIZE;
  memset(middle_pt, 0, (num_middle_pts + 1) * RISCV_PGSIZE);
  for (size_t i = 0; i < num_middle_pts; i++)
    root_pt[(1<<RISCV_PGLEVEL_BITS)-num_middle_pts+i] = ptd_create(((uintptr_t)middle_pt >> RISCV_PGSHIFT) + i);
#endif

  for (uintptr_t vaddr = info.first_user_vaddr, paddr = vaddr + info.load_offset, end = info.first_vaddr_after_user;
       paddr < DRAM_BASE + mem_size; vaddr += MEGAPAGE_SIZE, paddr += MEGAPAGE_SIZE) {
    int l2_shift = RISCV_PGLEVEL_BITS + RISCV_PGSHIFT;
    size_t l2_idx = (info.first_user_vaddr >> l2_shift) & ((1 << RISCV_PGLEVEL_BITS)-1);
    l2_idx += ((vaddr - info.first_user_vaddr) >> l2_shift);
    middle_pt[l2_idx] = pte_create(paddr >> RISCV_PGSHIFT, PTE_G | PTE_R | PTE_W | PTE_X);
  }

  // map SBI at top of vaddr space
  extern char _sbi_end;
  uintptr_t num_sbi_pages = ((uintptr_t)&_sbi_end - DRAM_BASE - 1) / RISCV_PGSIZE + 1;
  assert(num_sbi_pages <= (1 << RISCV_PGLEVEL_BITS));
  for (uintptr_t i = 0; i < num_sbi_pages; i++) {
    uintptr_t idx = (1 << RISCV_PGLEVEL_BITS) - num_sbi_pages + i;
    sbi_pt[idx] = pte_create((DRAM_BASE / RISCV_PGSIZE) + i, PTE_G | PTE_R | PTE_X);
  }
  pte_t* sbi_pte = middle_pt + ((num_middle_pts << RISCV_PGLEVEL_BITS)-1);
  assert(!*sbi_pte);
  *sbi_pte = ptd_create((uintptr_t)sbi_pt >> RISCV_PGSHIFT);

  mb();
  root_page_table = root_pt;
  write_csr(sptbr, (uintptr_t)root_pt >> RISCV_PGSHIFT);
}

Lets look on this function code flow.
uintptr_t highest_va = DRAM_BASE - first_free_paddr;
The above operation calculates the highest supervisor VA (virtual address) highest_va value. DRAM_BASE is less than first_free_paddr which is the address of the first free megapage after BBL+payload was loaded to DRAM starting at the DRAM_BASE machine level address. On my test system DRAM_BASE = 0x80000000 and first_free_paddr = 0x82800000; these are machine level physical addresses as the CPU starts in machine level mode. The difference is a negative number which in two's complement arithmetic gives the valid virtual address at the top of the 64 bit address range, highest_va = 0xfffffffffd800000, for supervisor mode. This leaves intact a top VA range in supervisor mode thus preserving the machine level code which is mapped at this range, see below.
The info structure describes the payload with an ELF header. Typical values on my system for the Linux kernel as a payload as they are shown by GDB print command are
(gdb) p/x info
$6 = {entry = 0xffffffff80000000, first_user_vaddr = 0xffffffff80000000, first_vaddr_after_user = 0xffffffff803b2000, load_offset = 0x102800000}
The memory size available for machine level mode is
(gdb) p/x mem_size
$16 = 0x100000000
It should be adjusted for supervisor. The memory size available for supervisor is calculated as
mem_size = MIN(mem_size, highest_va - info.first_user_vaddr) & -MEGAPAGE_SIZE;
On my system this value is
(gdb) p/x $a5
$11 = 0x7d800000
Then the SBI page table is allocated. This page table is used to map the SBI BBL at the top of the address space.
pte_t* sbi_pt = (pte_t*)(info.first_vaddr_after_user + info.load_offset);
The sbi_pt value on my machine is
(gdb) p/x $s1
$15 = 0x82bb2000
As you can see the CPU works with machine level addresses while info.first_vaddr_after_user is a supervisor virtual address. The info.load_offset value is used to adjust the supervisor virtual address to machine level physical address.
Then the real supervisor page table address is calculated by allocating a middle/directory table just after sbi_pt
pte_t* middle_pt = (void*)sbi_pt + RISCV_PGSIZE;
Then the root page table pointer is initialized.
#if __riscv_xlen == 32
  size_t num_middle_pts = 1;
  pte_t* root_pt = middle_pt;
  memset(root_pt, 0, RISCV_PGSIZE);
#else
  size_t num_middle_pts = (-info.first_user_vaddr - 1) / GIGAPAGE_SIZE + 1;
  pte_t* root_pt = (void*)middle_pt + num_middle_pts * RISCV_PGSIZE;
  memset(middle_pt, 0, (num_middle_pts + 1) * RISCV_PGSIZE);
  for (size_t i = 0; i < num_middle_pts; i++)
    root_pt[(1<<RISCV_PGLEVEL_BITS)-num_middle_pts+i] = ptd_create(((uintptr_t)middle_pt >> RISCV_PGSHIFT) + i);
#endif
The supervisor page table structure is then initialized to map supervisor virtual addresses to machine level physical addresses. Look how info.load_offset is used again to translate supervisor virtual address to machine level physical address.
  for (uintptr_t vaddr = info.first_user_vaddr, paddr = vaddr + info.load_offset, end = info.first_vaddr_after_user;
       paddr < DRAM_BASE + mem_size; vaddr += MEGAPAGE_SIZE, paddr += MEGAPAGE_SIZE) {
    int l2_shift = RISCV_PGLEVEL_BITS + RISCV_PGSHIFT;
    size_t l2_idx = (info.first_user_vaddr >> l2_shift) & ((1 << RISCV_PGLEVEL_BITS)-1);
    l2_idx += ((vaddr - info.first_user_vaddr) >> l2_shift);
    middle_pt[l2_idx] = pte_create(paddr >> RISCV_PGSHIFT, PTE_G | PTE_R | PTE_W | PTE_X);
  }
The machine level SBI BBL code is remapped at the top of the range reserved above highest_va through sbi_pt page table allocated early. The BBL has been loaded at DRAM_BASE machine level physical address. This address range is mapped as a read only range for supervisor mode. The PTE are also marked as global so they are visible in all address spaces.
  // map SBI at top of vaddr space
  extern char _sbi_end;
  uintptr_t num_sbi_pages = ((uintptr_t)&_sbi_end - DRAM_BASE - 1) / RISCV_PGSIZE + 1;
  assert(num_sbi_pages <= (1 << RISCV_PGLEVEL_BITS));
  for (uintptr_t i = 0; i < num_sbi_pages; i++) {
    uintptr_t idx = (1 << RISCV_PGLEVEL_BITS) - num_sbi_pages + i;
    sbi_pt[idx] = pte_create((DRAM_BASE / RISCV_PGSIZE) + i, PTE_G | PTE_R | PTE_X);
  }
After sbi_pt has been filled it is inserted in the supervisor page directory. This establishes the mapping visible from supervisor level.
  pte_t* sbi_pte = middle_pt + ((num_middle_pts << RISCV_PGLEVEL_BITS)-1);
  assert(!*sbi_pte);
  *sbi_pte = ptd_create((uintptr_t)sbi_pt >> RISCV_PGSHIFT);
The last page ending at the _sbi_end physical address is mapped at the last page of the virtual address space. The SBI mapping is described in detail here https://github.com/slavaim/riscv-notes/blob/master/bbl/sbi-to-linux.md
Before returning to a caller the function sets the page table base register for supervisor virtual address translation. The memory barrier guarantees that all memory writes have completed so the page table is in a consistent state.
  mb();
  root_page_table = root_pt;
  write_csr(sptbr, (uintptr_t)root_pt >> RISCV_PGSHIFT);
P.S. BBL sets sptbr to the root_page_table value in enter_supervisor_mode which makes redundant the above call to write_csr(sptbr, (uintptr_t)root_pt >> RISCV_PGSHIFT);.