#define _GNU_SOURCE
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <dirent.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include "linux/magic.h"
#include "vm_util.h"
#include "thp_settings.h"
/* Address (1 GiB) at which every test mapping in this file is requested. */
#define BASE_ADDR ((void *)(1UL << 30))
/* PMD huge page size in bytes; main() fills it from read_pmd_pagesize(). */
static unsigned long hpage_pmd_size;
/* Base page size in bytes; main() fills it from getpagesize(). */
static unsigned long page_size;
/* Number of base pages per PMD huge page (hpage_pmd_size / page_size). */
static int hpage_pmd_nr;
/* mTHP page order selected with the -s option; stays 0 when -s is absent. */
static int anon_order;
/* smaps file scanned by check_swap(). */
#define PID_SMAPS "/proc/self/smaps"
/* Name of the scratch file created inside the user-supplied directory. */
#define TEST_FILE "collapse_test_file"
/* Size of the line and pattern buffers used by check_swap(). */
#define MAX_LINE_LENGTH 500
/*
 * Kind of memory behind a test mapping. get_finfo() stores VMA_SHMEM in
 * finfo.type when the test directory is on tmpfs and VMA_FILE otherwise.
 */
enum vma_type {
VMA_ANON,
VMA_FILE,
VMA_SHMEM,
};
/*
 * Per-memory-type backend used by the tests. One instance exists for each of
 * anon, file and shmem memory (__anon_ops, __file_ops, __shmem_ops).
 */
struct mem_ops {
/* Create a mapping sized for nr_hpages PMD huge pages and return its address. */
void *(*setup_area)(int nr_hpages);
/* Tear down a mapping of size bytes previously returned by setup_area(). */
void (*cleanup_area)(void *p, unsigned long size);
/* Populate the byte range [start, end) of the mapping at p. */
void (*fault)(void *p, unsigned long start, unsigned long end);
/* Result of the matching check_huge_*() helper for nr_hpages huge pages at addr. */
bool (*check_huge)(void *addr, int nr_hpages);
/* Short label printed in the "Run test" banner. */
const char *name;
};
/*
 * Memory types selected on the command line by parse_test_type(). A NULL
 * pointer makes the TEST() macro in main() skip that memory type.
 */
static struct mem_ops *file_ops;
static struct mem_ops *anon_ops;
static struct mem_ops *shmem_ops;
/*
 * A way of requesting a collapse. Two instances exist: one driven by
 * khugepaged and one driven by MADV_COLLAPSE.
 */
struct collapse_context {
/*
 * Attempt to collapse nr_hpages huge pages at p and report success or
 * failure; expect says whether the collapse is supposed to happen.
 */
void (*collapse)(const char *msg, char *p, int nr_hpages,
struct mem_ops *ops, bool expect);
/*
 * Tests consult this flag to decide whether exceeding a max_ptes_* limit
 * should prevent the collapse (true for khugepaged, false for madvise).
 */
bool enforce_pte_scan_limits;
/* Short label printed in the "Run test" banner. */
const char *name;
};
/*
 * Collapse contexts selected on the command line by parse_test_type(). A NULL
 * pointer makes the TEST() macro in main() skip that context.
 */
static struct collapse_context *khugepaged_context;
static struct collapse_context *madvise_context;
/* State describing the file (or memfd) that backs file and shmem test mappings. */
struct file_info {
/* Directory given on the command line. */
const char *dir;
/* "<dir>/" TEST_FILE, built by get_finfo(). */
char path[PATH_MAX];
/* VMA_SHMEM when dir is on tmpfs, VMA_FILE otherwise. */
enum vma_type type;
/* Descriptor currently mapped by file_setup_area() or shmem_setup_area(). */
int fd;
/* sysfs read_ahead_kb path of the block device holding dir (VMA_FILE only). */
char dev_queue_read_ahead_path[PATH_MAX];
};
/* Single shared description of the backing file; see struct file_info. */
static struct file_info finfo;
/* When true, restore_settings_atexit() leaves the THP settings alone. */
static bool skip_settings_restore;
/* Count of failed checks; used as the process exit status. */
static int exit_status;
/* Print @msg in green, preceded by a space and followed by a newline. */
static void success(const char *msg)
{
	printf(" \e[32m%s\e[0m\n",
	       msg);
}
/* Print @msg in red and record one more failed check in exit_status. */
static void fail(const char *msg)
{
	printf(" \e[31m%s\e[0m\n",
	       msg);
	exit_status += 1;
}
/* Print @msg in yellow; a skipped step does not affect exit_status. */
static void skip(const char *msg)
{
	printf(" \e[33m%s\e[0m\n",
	       msg);
}
/*
 * atexit() hook that puts the saved THP/khugepaged settings back.
 * It does nothing when skip_settings_restore is set, and sets that flag
 * itself once the restore has been done so the work happens at most once.
 */
static void restore_settings_atexit(void)
{
	if (!skip_settings_restore) {
		printf("Restore THP and khugepaged settings...");
		thp_restore_settings();
		success("OK");
		skip_settings_restore = true;
	}
}
/*
 * Terminate the process. Installed as a signal handler and also called with
 * sig == 0 at the end of main(). A non-zero @sig exits with EXIT_FAILURE,
 * otherwise the accumulated exit_status is used.
 */
static void restore_settings(int sig)
{
	int status = EXIT_FAILURE;

	if (!sig)
		status = exit_status;

	/* exit() runs restore_settings_atexit(), which does the actual restore. */
	exit(status);
}
/*
 * Snapshot the current THP/khugepaged settings and arrange for them to be
 * restored on normal exit and on SIGTERM, SIGINT, SIGHUP and SIGQUIT.
 * For tests on a regular (non-tmpfs) file, the read-ahead control path is
 * registered first so it becomes part of the saved state.
 */
static void save_settings(void)
{
	static const int handled_signals[] = {
		SIGTERM, SIGINT, SIGHUP, SIGQUIT,
	};
	unsigned int idx;

	printf("Save THP and khugepaged settings...");
	if (file_ops && finfo.type == VMA_FILE)
		thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path);
	thp_save_settings();
	success("OK");

	atexit(restore_settings_atexit);
	for (idx = 0;
	     idx < sizeof(handled_signals) / sizeof(handled_signals[0]);
	     idx++)
		signal(handled_signals[idx], restore_settings);
}
/*
 * Fill in the global finfo for the test directory @dir.
 *
 * Records the directory, builds the scratch file path, classifies the
 * filesystem as tmpfs (VMA_SHMEM) or anything else (VMA_FILE) and, for
 * VMA_FILE, locates the read_ahead_kb control of the block device that holds
 * the directory. Any failure prints a message and exits with EXIT_FAILURE.
 */
static void get_finfo(const char *dir)
{
	struct stat path_stat;
	struct statfs fs;
	char buf[1 << 10];
	char path[PATH_MAX];
	char *str, *end;

	finfo.dir = dir;
	/* Without this check a failed stat() leaves path_stat uninitialized. */
	if (stat(finfo.dir, &path_stat)) {
		perror("stat()");
		exit(EXIT_FAILURE);
	}
	if (!S_ISDIR(path_stat.st_mode)) {
		printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
		exit(EXIT_FAILURE);
	}
	if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
		     finfo.dir) >= sizeof(finfo.path)) {
		printf("%s: Pathname is too long\n", __func__);
		exit(EXIT_FAILURE);
	}
	if (statfs(finfo.dir, &fs)) {
		perror("statfs()");
		exit(EXIT_FAILURE);
	}
	finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
	/* tmpfs has no backing block device, so there is nothing more to find. */
	if (finfo.type == VMA_SHMEM)
		return;

	/* Find owning device's queue/read_ahead_kb control */
	if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
		     major(path_stat.st_dev), minor(path_stat.st_dev))
	    >= sizeof(path)) {
		printf("%s: Pathname is too long\n", __func__);
		exit(EXIT_FAILURE);
	}
	/* NOTE(review): assumes read_file() NUL-terminates buf - confirm in vm_util. */
	if (read_file(path, buf, sizeof(buf)) < 0) {
		perror("read_file(read_num)");
		exit(EXIT_FAILURE);
	}
	if (strstr(buf, "DEVTYPE=disk")) {
		/* Found it */
		if (snprintf(finfo.dev_queue_read_ahead_path,
			     sizeof(finfo.dev_queue_read_ahead_path),
			     "/sys/dev/block/%d:%d/queue/read_ahead_kb",
			     major(path_stat.st_dev), minor(path_stat.st_dev))
		    >= sizeof(finfo.dev_queue_read_ahead_path)) {
			printf("%s: Pathname is too long\n", __func__);
			exit(EXIT_FAILURE);
		}
		return;
	}
	if (!strstr(buf, "DEVTYPE=partition")) {
		printf("%s: Unknown device type: %s\n", __func__, path);
		exit(EXIT_FAILURE);
	}
	/*
	 * Partition of block device - need to find actual device.
	 * Using naming convention that devnameN is partition of
	 * device devname.
	 *
	 * NOTE(review): the name is cut at its first digit, so a device whose
	 * own name contains digits (e.g. "nvme0n1p1") yields a wrong path.
	 */
	str = strstr(buf, "DEVNAME=");
	if (!str) {
		printf("%s: Could not read: %s\n", __func__, path);
		exit(EXIT_FAILURE);
	}
	str += 8;	/* strlen("DEVNAME=") */
	end = str;
	while (*end) {
		/* <ctype.h> functions require a value representable as unsigned char. */
		if (isdigit((unsigned char)*end)) {
			*end = '\0';
			if (snprintf(finfo.dev_queue_read_ahead_path,
				     sizeof(finfo.dev_queue_read_ahead_path),
				     "/sys/block/%s/queue/read_ahead_kb",
				     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
				printf("%s: Pathname is too long\n", __func__);
				exit(EXIT_FAILURE);
			}
			return;
		}
		++end;
	}
	printf("%s: Could not read: %s\n", __func__, path);
	exit(EXIT_FAILURE);
}
/*
 * Check /proc/self/smaps for the mapping that starts at @addr and report
 * whether its "Swap:" line shows exactly @size bytes (printed in kB).
 *
 * Returns true on an exact match; false when the mapping or its Swap: line
 * is not found or the amount differs. Exits if smaps cannot be opened.
 */
static bool check_swap(void *addr, unsigned long size)
{
	bool swap = false;
	int ret;
	FILE *fp;
	char buffer[MAX_LINE_LENGTH];
	char addr_pattern[MAX_LINE_LENGTH];

	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
		       (unsigned long) addr);
	if (ret >= MAX_LINE_LENGTH) {
		printf("%s: Pattern is too long\n", __func__);
		exit(EXIT_FAILURE);
	}

	fp = fopen(PID_SMAPS, "r");
	if (!fp) {
		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
		exit(EXIT_FAILURE);
	}
	/* NOTE(review): check_for_pattern() presumably leaves fp just past the matching line - confirm in vm_util. */
	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
		goto err_out;

	/* size >> 10 is unsigned long, so the conversion must be %lu, not %ld. */
	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19lu kB",
		       size >> 10);
	if (ret >= MAX_LINE_LENGTH) {
		printf("%s: Pattern is too long\n", __func__);
		exit(EXIT_FAILURE);
	}
	/*
	 * Fetch the Swap: line in the same block and check whether it
	 * reports the expected amount of swapped-out memory.
	 */
	if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
		goto err_out;
	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
		goto err_out;
	swap = true;
err_out:
	fclose(fp);
	return swap;
}
/*
 * Map @nr PMD-huge-page-sized units of private anonymous read/write memory
 * at BASE_ADDR. Exits with EXIT_FAILURE unless the kernel places the mapping
 * exactly at BASE_ADDR.
 */
static void *alloc_mapping(int nr)
{
	void *area = mmap(BASE_ADDR, nr * hpage_pmd_size,
			  PROT_READ | PROT_WRITE,
			  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

	if (area == BASE_ADDR)
		return area;

	printf("Failed to allocate VMA at %p\n", BASE_ADDR);
	exit(EXIT_FAILURE);
}
/*
 * Write a per-page marker (page index + 0xdead0000) into the first int of
 * every base page in the byte range [@start, @end) of the area at @p.
 * validate_memory() checks for the same markers.
 */
static void fill_memory(int *p, unsigned long start, unsigned long end)
{
	int page = start / page_size;

	while (page < end / page_size) {
		p[page * page_size / sizeof(*p)] = page + 0xdead0000;
		page++;
	}
}
/*
 * MADV_COLLAPSE is a best-effort request: when an internal resource is
 * temporarily unavailable it fails with errno set to EAGAIN. In that case
 * the request is reissued once, immediately. Returns the result of the last
 * madvise() call.
 */
static int madvise_collapse_retry(void *p, unsigned long size)
{
	int ret = madvise(p, size, MADV_COLLAPSE);

	if (ret && errno == EAGAIN)
		ret = madvise(p, size, MADV_COLLAPSE);

	return ret;
}
/*
 * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
 * validate_memory()'able contents.
 *
 * The area is created and fully populated through @ops, collapsed with
 * MADV_COLLAPSE and only then marked MADV_HUGEPAGE. Any failure is fatal.
 */
static void *alloc_hpage(struct mem_ops *ops)
{
	void *p = ops->setup_area(1);

	ops->fault(p, 0, hpage_pmd_size);
	/*
	 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
	 * The latter is ineligible for collapse by MADV_COLLAPSE
	 * while the former might cause MADV_COLLAPSE to race with
	 * khugepaged on low-load system (like a test machine), which
	 * would cause MADV_COLLAPSE to fail with EAGAIN.
	 */
	printf("Allocate huge page...");
	if (madvise_collapse_retry(p, hpage_pmd_size)) {
		perror("madvise(MADV_COLLAPSE)");
		exit(EXIT_FAILURE);
	}
	if (!ops->check_huge(p, 1)) {
		/*
		 * madvise() succeeded here, so errno carries nothing
		 * useful; report the real problem instead of perror().
		 */
		printf("MADV_COLLAPSE succeeded but no huge page was found\n");
		exit(EXIT_FAILURE);
	}
	if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
		perror("madvise(MADV_HUGEPAGE)");
		exit(EXIT_FAILURE);
	}
	success("OK");
	return p;
}
/*
 * Verify that the first int of every base page in the byte range
 * [@start, @end) of the area at @p still holds the marker written by
 * fill_memory(). The first mismatch is reported and the process exits.
 */
static void validate_memory(int *p, unsigned long start, unsigned long end)
{
	int page = start / page_size;

	while (page < end / page_size) {
		if (p[page * page_size / sizeof(*p)] != page + 0xdead0000) {
			printf("Page %d is corrupted: %#x\n",
			       page, p[page * page_size / sizeof(*p)]);
			exit(EXIT_FAILURE);
		}
		page++;
	}
}
/* mem_ops.setup_area for anonymous memory: a plain mapping at BASE_ADDR. */
static void *anon_setup_area(int nr_hpages)
{
	void *area = alloc_mapping(nr_hpages);

	return area;
}
/* mem_ops.cleanup_area for anonymous memory: unmap @size bytes at @p. */
static void anon_cleanup_area(void *p, unsigned long size)
{
	/* The result of munmap() is deliberately not inspected. */
	munmap(p, size);
}
/*
 * mem_ops.fault for writable memory: touch every page in [@start, @end) by
 * writing the fill_memory() markers into it.
 */
static void anon_fault(void *p, unsigned long start, unsigned long end)
{
	int *area = p;

	fill_memory(area, start, end);
}
/* mem_ops.check_huge for anonymous memory; defers to check_huge_anon(). */
static bool anon_check_huge(void *addr, int nr_hpages)
{
	bool matches = check_huge_anon(addr, nr_hpages, hpage_pmd_size);

	return matches;
}
/*
 * mem_ops.setup_area for file-backed memory.
 *
 * Creates finfo.path, fills it with @nr_hpages huge pages worth of
 * fill_memory() markers, reopens it read-only (kept in finfo.fd), maps it
 * privately with read and exec permission at BASE_ADDR and finally asks the
 * kernel to drop its caches. Returns the mapping; any failure is fatal.
 */
static void *file_setup_area(int nr_hpages)
{
	int fd;
	void *p;
	unsigned long size;
	unsigned long written = 0;

	unlink(finfo.path);  /* Cleanup from previous failed tests */
	printf("Creating %s for collapse%s...", finfo.path,
	       finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
	/* The mode must be octal; a decimal 777 sets unrelated permission bits. */
	fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
		  0777);
	if (fd < 0) {
		perror("open()");
		exit(EXIT_FAILURE);
	}

	size = nr_hpages * hpage_pmd_size;
	p = alloc_mapping(nr_hpages);
	fill_memory(p, 0, size);
	/* write() may be short or fail; the tests need the complete file. */
	while (written < size) {
		ssize_t n = write(fd, (char *)p + written, size - written);

		if (n < 0) {
			if (errno == EINTR)
				continue;
			perror("write()");
			exit(EXIT_FAILURE);
		}
		if (n == 0) {
			printf("write() made no progress on %s\n", finfo.path);
			exit(EXIT_FAILURE);
		}
		written += n;
	}
	close(fd);
	munmap(p, size);
	success("OK");

	printf("Opening %s read only for collapse...", finfo.path);
	/* No mode argument: it is only meaningful together with O_CREAT. */
	finfo.fd = open(finfo.path, O_RDONLY);
	if (finfo.fd < 0) {
		perror("open()");
		exit(EXIT_FAILURE);
	}
	p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
		 MAP_PRIVATE, finfo.fd, 0);
	/* MAP_FAILED never equals BASE_ADDR, so one comparison covers both. */
	if (p != BASE_ADDR) {
		perror("mmap()");
		exit(EXIT_FAILURE);
	}

	/* Drop page cache */
	write_file("/proc/sys/vm/drop_caches", "3", 2);
	success("OK");
	return p;
}
/*
 * mem_ops.cleanup_area for file-backed memory: unmap the area, close the
 * read-only descriptor held in finfo.fd and delete the scratch file.
 */
static void file_cleanup_area(void *p, unsigned long size)
{
	const char *scratch = finfo.path;
	int backing_fd = finfo.fd;

	munmap(p, size);
	close(backing_fd);
	unlink(scratch);
}
/*
 * mem_ops.fault for read-only file mappings: populate the byte range
 * [@start, @end) of the mapping at @p with MADV_POPULATE_READ, since the
 * mapping cannot be written to. Failure is fatal.
 */
static void file_fault(void *p, unsigned long start, unsigned long end)
{
	if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
		/* Label fixed: the closing parenthesis was missing. */
		perror("madvise(MADV_POPULATE_READ)");
		exit(EXIT_FAILURE);
	}
}
/*
 * mem_ops.check_huge for the file backend. The helper used depends on
 * whether the test directory turned out to be a regular filesystem or tmpfs;
 * any other finfo.type terminates the process.
 */
static bool file_check_huge(void *addr, int nr_hpages)
{
	if (finfo.type == VMA_FILE)
		return check_huge_file(addr, nr_hpages, hpage_pmd_size);

	if (finfo.type == VMA_SHMEM)
		return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);

	exit(EXIT_FAILURE);
}
/*
 * mem_ops.setup_area for shmem: create a memfd (kept in finfo.fd), size it
 * for @nr_hpages huge pages and map it shared read/write at BASE_ADDR.
 * Any failure is fatal.
 */
static void *shmem_setup_area(int nr_hpages)
{
	const unsigned long len = nr_hpages * hpage_pmd_size;
	void *area;

	finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
	if (finfo.fd < 0) {
		perror("memfd_create()");
		exit(EXIT_FAILURE);
	}

	if (ftruncate(finfo.fd, len) != 0) {
		perror("ftruncate()");
		exit(EXIT_FAILURE);
	}

	area = mmap(BASE_ADDR, len, PROT_READ | PROT_WRITE, MAP_SHARED,
		    finfo.fd, 0);
	if (area == BASE_ADDR)
		return area;

	perror("mmap()");
	exit(EXIT_FAILURE);
}
/* mem_ops.cleanup_area for shmem: unmap the area and close the memfd. */
static void shmem_cleanup_area(void *p, unsigned long size)
{
	int memfd = finfo.fd;

	munmap(p, size);
	close(memfd);
}
/* mem_ops.check_huge for shmem; defers to check_huge_shmem(). */
static bool shmem_check_huge(void *addr, int nr_hpages)
{
	bool matches = check_huge_shmem(addr, nr_hpages, hpage_pmd_size);

	return matches;
}
/* Backend for private anonymous memory. */
static struct mem_ops __anon_ops = {
	.name		= "anon",
	.setup_area	= anon_setup_area,
	.cleanup_area	= anon_cleanup_area,
	.fault		= anon_fault,
	.check_huge	= anon_check_huge,
};
/* Backend for a read-only private mapping of a file in the test directory. */
static struct mem_ops __file_ops = {
	.name		= "file",
	.setup_area	= file_setup_area,
	.cleanup_area	= file_cleanup_area,
	.fault		= file_fault,
	.check_huge	= file_check_huge,
};
/*
 * Backend for a shared memfd mapping. The mapping is writable, so it reuses
 * anon_fault() to populate pages.
 */
static struct mem_ops __shmem_ops = {
	.name		= "shmem",
	.setup_area	= shmem_setup_area,
	.cleanup_area	= shmem_cleanup_area,
	.fault		= anon_fault,
	.check_huge	= shmem_check_huge,
};
/*
 * Issue MADV_COLLAPSE on @nr_hpages huge pages at @p and report the outcome.
 *
 * @expect says whether the collapse should succeed. A failure is recorded
 * when the madvise() result disagrees with @expect, or when the resulting
 * number of huge pages is wrong (nr_hpages if expected, 0 otherwise).
 * Unlike madvise_collapse(), no check is made that the range starts out
 * without huge pages.
 */
static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
			       struct mem_ops *ops, bool expect)
{
	const unsigned long len = nr_hpages * hpage_pmd_size;
	struct thp_settings settings = *thp_current_settings();
	bool collapse_failed;

	printf("%s...", msg);

	/*
	 * Prevent khugepaged interference and tests that MADV_COLLAPSE
	 * ignores /sys/kernel/mm/transparent_hugepage/enabled
	 */
	settings.thp_enabled = THP_NEVER;
	settings.shmem_enabled = SHMEM_NEVER;
	thp_push_settings(&settings);

	/* Clear VM_NOHUGEPAGE */
	madvise(p, len, MADV_HUGEPAGE);
	collapse_failed = madvise_collapse_retry(p, len) != 0;

	if (collapse_failed == expect) {
		fail("Fail: Bad return value");
	} else if (!ops->check_huge(p, expect ? nr_hpages : 0)) {
		fail("Fail: check_huge()");
	} else {
		success("OK");
	}

	thp_pop_settings();
}
/*
 * collapse_context.collapse for the madvise context: make sure the range
 * holds no huge page yet (fatal otherwise), then run __madvise_collapse().
 */
static void madvise_collapse(const char *msg, char *p, int nr_hpages,
			     struct mem_ops *ops, bool expect)
{
	/* Sanity check */
	bool starts_without_hpage = ops->check_huge(p, 0);

	if (starts_without_hpage) {
		__madvise_collapse(msg, p, nr_hpages, ops, expect);
		return;
	}

	printf("Unexpected huge page\n");
	exit(EXIT_FAILURE);
}
/* Polling interval of wait_for_scan(), in microseconds. */
#define TICK 500000
/*
 * Mark the range MADV_HUGEPAGE and poll until either @nr_hpages huge pages
 * show up at @p or khugepaged's full_scans counter has advanced by two,
 * giving up after six polls of TICK microseconds (3 seconds). The range is
 * marked MADV_NOHUGEPAGE again before returning.
 *
 * Returns true if polling ran out of attempts, false if it stopped early.
 * Exits if the range already contains a huge page on entry.
 */
static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
			  struct mem_ops *ops)
{
	const unsigned long len = nr_hpages * hpage_pmd_size;
	bool timed_out = true;
	int target_scans;
	int attempt;

	/* Sanity check */
	if (!ops->check_huge(p, 0)) {
		printf("Unexpected huge page\n");
		exit(EXIT_FAILURE);
	}

	madvise(p, len, MADV_HUGEPAGE);

	/* Wait until the second full_scan completed */
	target_scans = thp_read_num("khugepaged/full_scans") + 2;

	printf("%s...", msg);
	for (attempt = 0; attempt < 6; attempt++) {	/* 3 seconds */
		if (ops->check_huge(p, nr_hpages) ||
		    thp_read_num("khugepaged/full_scans") >= target_scans) {
			timed_out = false;
			break;
		}
		printf(".");
		usleep(TICK);
	}

	madvise(p, len, MADV_NOHUGEPAGE);
	return timed_out;
}
/*
 * collapse_context.collapse for the khugepaged context: wait for khugepaged
 * to scan the range and compare the result with @expect. A timeout counts
 * as success only when no collapse was expected.
 */
static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
				struct mem_ops *ops, bool expect)
{
	bool timed_out = wait_for_scan(msg, p, nr_hpages, ops);
	int want_hpages = expect ? nr_hpages : 0;

	if (timed_out) {
		if (!expect)
			success("OK");
		else
			fail("Timeout");
		return;
	}

	/*
	 * For file and shmem memory, khugepaged only retracts pte entries after
	 * putting the new hugepage in the page cache. The hugepage must be
	 * subsequently refaulted to install the pmd mapping for the mm.
	 */
	if (ops != &__anon_ops)
		ops->fault(p, 0, nr_hpages * hpage_pmd_size);

	if (!ops->check_huge(p, want_hpages))
		fail("Fail");
	else
		success("OK");
}
/* Collapse performed by waiting for khugepaged; max_ptes_* limits apply. */
static struct collapse_context __khugepaged_context = {
	.name			 = "khugepaged",
	.collapse		 = khugepaged_collapse,
	.enforce_pte_scan_limits = true,
};
/* Collapse requested with MADV_COLLAPSE; tests expect max_ptes_* to be ignored. */
static struct collapse_context __madvise_context = {
	.name			 = "madvise",
	.collapse		 = madvise_collapse,
	.enforce_pte_scan_limits = false,
};
/* True when @ops is the file backend and the test directory is on tmpfs. */
static bool is_tmpfs(struct mem_ops *ops)
{
	if (ops != &__file_ops)
		return false;

	return finfo.type == VMA_SHMEM;
}
/* True when @ops is the anonymous-memory backend. */
static bool is_anon(struct mem_ops *ops)
{
	return &__anon_ops == ops;
}
/*
 * Basic sanity test run before everything else: with THP set to "always",
 * a first write to a fresh anonymous mapping should produce a huge page,
 * and MADV_DONTNEED on its first base page should leave no huge page.
 */
static void alloc_at_fault(void)
{
	struct thp_settings settings = *thp_current_settings();
	char *area;
	bool huge;

	settings.thp_enabled = THP_ALWAYS;
	thp_push_settings(&settings);

	area = alloc_mapping(1);
	*area = 1;
	printf("Allocate huge page on fault...");
	huge = check_huge_anon(area, 1, hpage_pmd_size);
	if (!huge)
		fail("Fail");
	else
		success("OK");

	thp_pop_settings();

	madvise(area, page_size, MADV_DONTNEED);
	printf("Split huge PMD on MADV_DONTNEED...");
	if (!check_huge_anon(area, 0, hpage_pmd_size))
		fail("Fail");
	else
		success("OK");

	munmap(area, hpage_pmd_size);
}
/*
 * Test: four fully populated PMD ranges are expected to collapse, and their
 * contents must survive.
 */
static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
{
	const int nr_hpages = 4;
	const unsigned long len = nr_hpages * hpage_pmd_size;
	void *area = ops->setup_area(nr_hpages);

	ops->fault(area, 0, len);
	c->collapse("Collapse multiple fully populated PTE table", area,
		    nr_hpages, ops, true);
	validate_memory(area, 0, len);

	ops->cleanup_area(area, len);
}
/* Test: a range in which nothing was faulted in must not be collapsed. */
static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
{
	void *area = ops->setup_area(1);

	c->collapse("Do not collapse empty PTE table", area, 1, ops, false);

	ops->cleanup_area(area, hpage_pmd_size);
}
/* Test: a range with only its first base page faulted in is expected to collapse. */
static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
{
	void *area = ops->setup_area(1);

	ops->fault(area, 0, page_size);
	c->collapse("Collapse PTE table with single PTE entry present", area,
		    1, ops, true);

	ops->cleanup_area(area, hpage_pmd_size);
}
/*
 * Test the khugepaged max_ptes_none limit, set here to half a PMD range.
 *
 * First too few pages are populated: the collapse is expected only from a
 * context that does not enforce scan limits. For enforcing contexts the
 * range is then populated up to the limit and must collapse. Skipped for
 * files on tmpfs.
 */
static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
{
	struct thp_settings settings = *thp_current_settings();
	const int max_ptes_none = hpage_pmd_nr / 2;
	const int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1;
	void *area;

	settings.khugepaged.max_ptes_none = max_ptes_none;
	thp_push_settings(&settings);

	area = ops->setup_area(1);

	if (is_tmpfs(ops)) {
		/* shmem pages always in the page cache */
		printf("tmpfs...");
		skip("Skip");
	} else {
		ops->fault(area, 0,
			   (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);
		c->collapse("Maybe collapse with max_ptes_none exceeded", area, 1,
			    ops, !c->enforce_pte_scan_limits);
		validate_memory(area, 0,
				(hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);

		if (c->enforce_pte_scan_limits) {
			ops->fault(area, 0,
				   (hpage_pmd_nr - max_ptes_none) * page_size);
			c->collapse("Collapse with max_ptes_none PTEs empty",
				    area, 1, ops, true);
			validate_memory(area, 0,
					(hpage_pmd_nr - max_ptes_none) * page_size);
		}
	}

	ops->cleanup_area(area, hpage_pmd_size);
	thp_pop_settings();
}
/*
 * Test: page out the first base page of a fully populated range, then
 * expect the collapse to succeed with intact contents. The collapse is not
 * attempted if the page-out cannot be confirmed in smaps.
 */
static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
{
	void *area = ops->setup_area(1);

	ops->fault(area, 0, hpage_pmd_size);

	printf("Swapout one page...");
	if (madvise(area, page_size, MADV_PAGEOUT)) {
		perror("madvise(MADV_PAGEOUT)");
		exit(EXIT_FAILURE);
	}

	if (!check_swap(area, page_size)) {
		fail("Fail");
	} else {
		success("OK");
		c->collapse("Collapse with swapping in single PTE entry", area,
			    1, ops, true);
		validate_memory(area, 0, hpage_pmd_size);
	}

	ops->cleanup_area(area, hpage_pmd_size);
}
/*
 * Test the khugepaged max_ptes_swap limit.
 *
 * With one page more than the limit swapped out, the collapse is expected
 * only from a context that does not enforce scan limits. For enforcing
 * contexts the range is repopulated, exactly max_ptes_swap pages are swapped
 * out and the collapse must succeed. The test stops early if a page-out
 * cannot be confirmed in smaps.
 */
static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
{
	const int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap");
	void *area = ops->setup_area(1);

	ops->fault(area, 0, hpage_pmd_size);

	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
	if (madvise(area, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
		perror("madvise(MADV_PAGEOUT)");
		exit(EXIT_FAILURE);
	}
	if (!check_swap(area, (max_ptes_swap + 1) * page_size)) {
		fail("Fail");
		goto cleanup;
	}
	success("OK");

	c->collapse("Maybe collapse with max_ptes_swap exceeded", area, 1, ops,
		    !c->enforce_pte_scan_limits);
	validate_memory(area, 0, hpage_pmd_size);

	if (!c->enforce_pte_scan_limits)
		goto cleanup;

	ops->fault(area, 0, hpage_pmd_size);
	printf("Swapout %d of %d pages...", max_ptes_swap,
	       hpage_pmd_nr);
	if (madvise(area, max_ptes_swap * page_size, MADV_PAGEOUT)) {
		perror("madvise(MADV_PAGEOUT)");
		exit(EXIT_FAILURE);
	}
	if (!check_swap(area, max_ptes_swap * page_size)) {
		fail("Fail");
		goto cleanup;
	}
	success("OK");

	c->collapse("Collapse with max_ptes_swap pages swapped out", area,
		    1, ops, true);
	validate_memory(area, 0, hpage_pmd_size);

cleanup:
	ops->cleanup_area(area, hpage_pmd_size);
}
/*
 * Test: start from a huge page, discard everything but its first base page
 * with MADV_DONTNEED, and expect the remainder to collapse again.
 * Skipped for files on tmpfs.
 */
static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
{
	void *area = alloc_hpage(ops);

	if (is_tmpfs(ops)) {
		/* MADV_DONTNEED won't evict tmpfs pages */
		printf("tmpfs...");
		skip("Skip");
	} else {
		madvise(area, hpage_pmd_size, MADV_NOHUGEPAGE);
		printf("Split huge page leaving single PTE mapping compound page...");
		madvise(area + page_size, hpage_pmd_size - page_size,
			MADV_DONTNEED);
		if (!ops->check_huge(area, 0))
			fail("Fail");
		else
			success("OK");

		c->collapse("Collapse PTE table with single PTE mapping compound page",
			    area, 1, ops, true);
		validate_memory(area, 0, page_size);
	}

	ops->cleanup_area(area, hpage_pmd_size);
}
/*
 * Test: start from a huge page, apply MADV_NOHUGEPAGE first to one base page
 * and then to the whole range, check that no huge page is reported any more,
 * and expect the range to collapse again with intact contents.
 */
static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
{
	void *area = alloc_hpage(ops);

	printf("Split huge page leaving single PTE page table full of compound pages...");
	madvise(area, page_size, MADV_NOHUGEPAGE);
	madvise(area, hpage_pmd_size, MADV_NOHUGEPAGE);
	if (!ops->check_huge(area, 0))
		fail("Fail");
	else
		success("OK");

	c->collapse("Collapse PTE table full of compound pages", area, 1, ops,
		    true);
	validate_memory(area, 0, hpage_pmd_size);

	ops->cleanup_area(area, hpage_pmd_size);
}
/*
 * Test: build one PMD-sized range in which every base page comes from a
 * different huge page, then expect that range to collapse.
 *
 * Each loop iteration faults a huge page in at BASE_ADDR and uses a pair of
 * mremap() calls to grow, by one base page, a region that ends at BASE_ADDR.
 * The statement order inside the loop matters; do not reorder it.
 */
static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
{
void *p;
int i;
p = ops->setup_area(1);
for (i = 0; i < hpage_pmd_nr; i++) {
printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
i + 1, hpage_pmd_nr);
/* Get a fresh huge page at BASE_ADDR; failing to do so is fatal. */
madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
ops->fault(BASE_ADDR, 0, hpage_pmd_size);
if (!ops->check_huge(BASE_ADDR, 1)) {
printf("Failed to allocate huge page\n");
exit(EXIT_FAILURE);
}
madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
/*
 * Move the i pages collected so far plus the new huge page to a
 * scratch address, shrinking the region to (i + 1) base pages.
 */
p = mremap(BASE_ADDR - i * page_size,
i * page_size + hpage_pmd_size,
(i + 1) * page_size,
MREMAP_MAYMOVE | MREMAP_FIXED,
BASE_ADDR + 2 * hpage_pmd_size);
if (p == MAP_FAILED) {
perror("mremap+unmap");
exit(EXIT_FAILURE);
}
/*
 * Move the region back so it starts (i + 1) base pages below BASE_ADDR,
 * growing it by one huge page size so BASE_ADDR is mapped again.
 */
p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
(i + 1) * page_size,
(i + 1) * page_size + hpage_pmd_size,
MREMAP_MAYMOVE | MREMAP_FIXED,
BASE_ADDR - (i + 1) * page_size);
if (p == MAP_FAILED) {
perror("mremap+alloc");
exit(EXIT_FAILURE);
}
}
/* Release the range at BASE_ADDR; p now points at the assembled range below it. */
ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
ops->fault(p, 0, hpage_pmd_size);
/* Construction succeeded only if the assembled range is NOT a huge page. */
if (!ops->check_huge(p, 1))
success("OK");
else
fail("Fail");
c->collapse("Collapse PTE table full of different compound pages", p, 1,
ops, true);
validate_memory(p, 0, hpage_pmd_size);
ops->cleanup_area(p, hpage_pmd_size);
}
/*
 * Test: share one small page with a forked child, collapse the range in the
 * child, and verify that the parent still sees a small page with intact
 * contents afterwards.
 *
 * The child's failure count is added to exit_status. A fork() or wait()
 * failure is fatal, and a child that does not exit normally (for example
 * killed by a signal) is counted as a failure.
 */
static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
{
	int wstatus;
	pid_t pid;
	void *p;

	p = ops->setup_area(1);

	printf("Allocate small page...");
	ops->fault(p, 0, page_size);
	if (ops->check_huge(p, 0))
		success("OK");
	else
		fail("Fail");

	printf("Share small page over fork()...");
	pid = fork();
	if (pid < 0) {
		perror("fork()");
		exit(EXIT_FAILURE);
	}
	if (!pid) {
		/* Do not touch settings on child exit */
		skip_settings_restore = true;
		exit_status = 0;

		if (ops->check_huge(p, 0))
			success("OK");
		else
			fail("Fail");

		ops->fault(p, page_size, 2 * page_size);
		c->collapse("Collapse PTE table with single page shared with parent process",
			    p, 1, ops, true);

		validate_memory(p, 0, page_size);
		ops->cleanup_area(p, hpage_pmd_size);
		exit(exit_status);
	}

	if (wait(&wstatus) < 0) {
		perror("wait()");
		exit(EXIT_FAILURE);
	}
	/* WEXITSTATUS() is only meaningful for a normal exit. */
	if (WIFEXITED(wstatus))
		exit_status += WEXITSTATUS(wstatus);
	else
		fail("Fail: child terminated abnormally");

	printf("Check if parent still has small page...");
	if (ops->check_huge(p, 0))
		success("OK");
	else
		fail("Fail");
	validate_memory(p, 0, page_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
/*
 * Test: share a huge page with a forked child, split it in the child, touch
 * one page there and collapse again with max_ptes_shared raised to
 * hpage_pmd_nr - 1. The parent must still see its huge page with intact
 * contents afterwards.
 *
 * The child's failure count is added to exit_status. A fork() or wait()
 * failure is fatal, and a child that does not exit normally (for example
 * killed by a signal) is counted as a failure.
 */
static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
{
	int wstatus;
	pid_t pid;
	void *p;

	p = alloc_hpage(ops);
	printf("Share huge page over fork()...");
	pid = fork();
	if (pid < 0) {
		perror("fork()");
		exit(EXIT_FAILURE);
	}
	if (!pid) {
		/* Do not touch settings on child exit */
		skip_settings_restore = true;
		exit_status = 0;

		if (ops->check_huge(p, 1))
			success("OK");
		else
			fail("Fail");

		printf("Split huge page PMD in child process...");
		madvise(p, page_size, MADV_NOHUGEPAGE);
		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
		if (ops->check_huge(p, 0))
			success("OK");
		else
			fail("Fail");
		ops->fault(p, 0, page_size);

		/* Temporarily allow all but one PTE to be shared, then put the value back. */
		thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
		c->collapse("Collapse PTE table full of compound pages in child",
			    p, 1, ops, true);
		thp_write_num("khugepaged/max_ptes_shared",
			      thp_current_settings()->khugepaged.max_ptes_shared);

		validate_memory(p, 0, hpage_pmd_size);
		ops->cleanup_area(p, hpage_pmd_size);
		exit(exit_status);
	}

	if (wait(&wstatus) < 0) {
		perror("wait()");
		exit(EXIT_FAILURE);
	}
	/* WEXITSTATUS() is only meaningful for a normal exit. */
	if (WIFEXITED(wstatus))
		exit_status += WEXITSTATUS(wstatus);
	else
		fail("Fail: child terminated abnormally");

	printf("Check if parent still has huge page...");
	if (ops->check_huge(p, 1))
		success("OK");
	else
		fail("Fail");
	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
/*
 * Test the khugepaged max_ptes_shared limit in a forked child that shares a
 * huge page with its parent.
 *
 * The child first touches one page fewer than needed to get under the
 * limit: the collapse is then expected only from a context that does not
 * enforce scan limits. For enforcing contexts one more page is touched and
 * the collapse must succeed. The parent must still see its huge page with
 * intact contents afterwards.
 *
 * The child's failure count is added to exit_status. A fork() or wait()
 * failure is fatal, and a child that does not exit normally (for example
 * killed by a signal) is counted as a failure.
 */
static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
{
	int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared");
	int wstatus;
	pid_t pid;
	void *p;

	p = alloc_hpage(ops);
	printf("Share huge page over fork()...");
	pid = fork();
	if (pid < 0) {
		perror("fork()");
		exit(EXIT_FAILURE);
	}
	if (!pid) {
		/* Do not touch settings on child exit */
		skip_settings_restore = true;
		exit_status = 0;

		if (ops->check_huge(p, 1))
			success("OK");
		else
			fail("Fail");

		printf("Trigger CoW on page %d of %d...",
		       hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
		if (ops->check_huge(p, 0))
			success("OK");
		else
			fail("Fail");

		c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
			    1, ops, !c->enforce_pte_scan_limits);

		if (c->enforce_pte_scan_limits) {
			printf("Trigger CoW on page %d of %d...",
			       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
			ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
				   page_size);
			if (ops->check_huge(p, 0))
				success("OK");
			else
				fail("Fail");

			c->collapse("Collapse with max_ptes_shared PTEs shared",
				    p, 1, ops, true);
		}

		validate_memory(p, 0, hpage_pmd_size);
		ops->cleanup_area(p, hpage_pmd_size);
		exit(exit_status);
	}

	if (wait(&wstatus) < 0) {
		perror("wait()");
		exit(EXIT_FAILURE);
	}
	/* WEXITSTATUS() is only meaningful for a normal exit. */
	if (WIFEXITED(wstatus))
		exit_status += WEXITSTATUS(wstatus);
	else
		fail("Fail: child terminated abnormally");

	printf("Check if parent still has huge page...");
	if (ops->check_huge(p, 1))
		success("OK");
	else
		fail("Fail");
	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
/*
 * Test: MADV_COLLAPSE on a range that is already a PMD-mapped huge page
 * must succeed and leave the contents intact.
 */
static void madvise_collapse_existing_thps(struct collapse_context *c,
					   struct mem_ops *ops)
{
	void *area = ops->setup_area(1);

	ops->fault(area, 0, hpage_pmd_size);
	c->collapse("Collapse fully populated PTE table...", area, 1, ops,
		    true);
	validate_memory(area, 0, hpage_pmd_size);

	/* c->collapse() will find a hugepage and complain - call directly. */
	__madvise_collapse("Re-collapse PMD-mapped hugepage", area, 1, ops,
			   true);
	validate_memory(area, 0, hpage_pmd_size);

	ops->cleanup_area(area, hpage_pmd_size);
}
/*
 * Test race with khugepaged where page tables have been retracted and
 * pmd cleared.
 *
 * khugepaged is given the chance to collapse the range first; the context's
 * collapse is then expected to install the huge PMD. The area is released on
 * every path, including the timeout, so that the next test can map
 * BASE_ADDR again.
 */
static void madvise_retracted_page_tables(struct collapse_context *c,
					  struct mem_ops *ops)
{
	void *p;
	int nr_hpages = 1;
	unsigned long size = nr_hpages * hpage_pmd_size;

	p = ops->setup_area(nr_hpages);
	ops->fault(p, 0, size);

	/* Let khugepaged collapse and leave pmd cleared */
	if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
			  ops)) {
		fail("Timeout");
		goto out;
	}
	success("OK");
	c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
		    true);
	validate_memory(p, 0, size);
out:
	ops->cleanup_area(p, size);
}
/* Print the command-line help to stderr and exit with status 1. */
static void usage(void)
{
	static const char *const help[] = {
		"\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n",
		"\t<test type>\t: <context>:<mem_type>\n",
		"\t<context>\t: [all|khugepaged|madvise]\n",
		"\t<mem_type>\t: [all|anon|file|shmem]\n",
		"\n\t\"file,all\" mem_type requires [dir] argument\n",
		"\n\t\"file,all\" mem_type requires kernel built with\n",
		"\tCONFIG_READ_ONLY_THP_FOR_FS=y\n",
		"\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n",
		"\tmounted with huge=madvise option for khugepaged tests to work\n",
		"\n\tSupported Options:\n",
		"\t\t-h: This help message.\n",
		"\t\t-s: mTHP size, expressed as page order.\n",
		"\t\t Defaults to 0. Use this size for anon or shmem allocations.\n",
	};
	size_t line;

	/* None of the lines contains a conversion, so fputs() prints them verbatim. */
	for (line = 0; line < sizeof(help) / sizeof(help[0]); line++)
		fputs(help[line], stderr);

	exit(1);
}
/*
 * Parse the command line and select which contexts and memory types to test.
 *
 *   -s <order>            mTHP page order, stored in anon_order
 *   -h                    print usage and exit
 *   <context>:<mem_type>  e.g. "all:anon", "madvise:file"
 *   [dir]                 required when the file backend is selected
 *
 * With no positional argument, both contexts are run on anonymous memory.
 * Any malformed input ends in usage(), which exits.
 */
static void parse_test_type(int argc, char **argv)
{
	int opt;
	char *spec;
	char *mem_type;
	const char *token;

	while ((opt = getopt(argc, argv, "s:h")) != -1) {
		switch (opt) {
		case 's': {
			char *endp;
			long order;

			/*
			 * Reject non-numeric and negative orders rather than
			 * letting atoi() turn them into 0 or a negative array
			 * index. No upper bound is applied here.
			 */
			errno = 0;
			order = strtol(optarg, &endp, 10);
			if (errno || endp == optarg || *endp != '\0' ||
			    order < 0 || order > INT_MAX)
				usage();
			anon_order = order;
			break;
		}
		case 'h':
		default:
			usage();
		}
	}

	argv += optind;
	argc -= optind;

	if (argc == 0) {
		/* Backwards compatibility */
		khugepaged_context = &__khugepaged_context;
		madvise_context = &__madvise_context;
		anon_ops = &__anon_ops;
		return;
	}

	/* strsep() modifies its input, so work on a private copy. */
	spec = strdup(argv[0]);
	if (!spec) {
		perror("strdup()");
		exit(EXIT_FAILURE);
	}
	mem_type = spec;
	token = strsep(&mem_type, ":");

	if (!strcmp(token, "all")) {
		khugepaged_context = &__khugepaged_context;
		madvise_context = &__madvise_context;
	} else if (!strcmp(token, "khugepaged")) {
		khugepaged_context = &__khugepaged_context;
	} else if (!strcmp(token, "madvise")) {
		madvise_context = &__madvise_context;
	} else {
		usage();
	}

	/* strsep() leaves NULL behind when the argument had no ':' in it. */
	if (!mem_type)
		usage();

	if (!strcmp(mem_type, "all")) {
		file_ops = &__file_ops;
		anon_ops = &__anon_ops;
		shmem_ops = &__shmem_ops;
	} else if (!strcmp(mem_type, "anon")) {
		anon_ops = &__anon_ops;
	} else if (!strcmp(mem_type, "file")) {
		file_ops = &__file_ops;
	} else if (!strcmp(mem_type, "shmem")) {
		shmem_ops = &__shmem_ops;
	} else {
		usage();
	}
	free(spec);

	if (!file_ops)
		return;

	if (argc != 2)
		usage();

	get_finfo(argv[1]);
}
/*
 * Entry point: parse the command line, work out page sizes, save the
 * system's THP settings and install the test defaults, then run every test
 * for each selected (context, memory type) pair. The process ends through
 * restore_settings(0), which exits with the number of failed checks.
 */
int main(int argc, char **argv)
{
	int hpage_pmd_order;
	struct thp_settings default_settings = {
		.thp_enabled = THP_MADVISE,
		.thp_defrag = THP_DEFRAG_ALWAYS,
		.shmem_enabled = SHMEM_ADVISE,
		.use_zero_page = 0,
		.khugepaged = {
			.defrag = 1,
			.alloc_sleep_millisecs = 10,
			.scan_sleep_millisecs = 10,
		},
		/*
		 * When testing file-backed memory, the collapse path
		 * looks at how many pages are found in the page cache, not
		 * what pages are mapped. Disable read ahead optimization so
		 * pages don't find their way into the page cache unless
		 * we mem_ops->fault() them in.
		 */
		.read_ahead_kb = 0,
	};

	parse_test_type(argc, argv);

	setbuf(stdout, NULL);

	page_size = getpagesize();
	hpage_pmd_size = read_pmd_pagesize();
	if (!hpage_pmd_size) {
		printf("Reading PMD pagesize failed\n");
		exit(EXIT_FAILURE);
	}
	hpage_pmd_nr = hpage_pmd_size / page_size;
	hpage_pmd_order = __builtin_ctz(hpage_pmd_nr);

	/*
	 * anon_order comes straight from the command line and is used as an
	 * array index below; refuse anything outside 0..PMD order before any
	 * system setting has been touched.
	 */
	if (anon_order < 0 || anon_order > hpage_pmd_order) {
		printf("Invalid mTHP order %d (must be between 0 and %d)\n",
		       anon_order, hpage_pmd_order);
		exit(EXIT_FAILURE);
	}

	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
	/* If anon_order equals the PMD order, the second assignment of each pair wins. */
	default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT;
	default_settings.hugepages[anon_order].enabled = THP_ALWAYS;
	default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT;
	default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS;

	save_settings();
	thp_push_settings(&default_settings);

	alloc_at_fault();

/* Run test t only when both its context c and memory type o were selected. */
#define TEST(t, c, o) do { \
	if (c && o) { \
		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
		t(c, o); \
	} \
	} while (0)

	TEST(collapse_full, khugepaged_context, anon_ops);
	TEST(collapse_full, khugepaged_context, file_ops);
	TEST(collapse_full, khugepaged_context, shmem_ops);
	TEST(collapse_full, madvise_context, anon_ops);
	TEST(collapse_full, madvise_context, file_ops);
	TEST(collapse_full, madvise_context, shmem_ops);

	TEST(collapse_empty, khugepaged_context, anon_ops);
	TEST(collapse_empty, madvise_context, anon_ops);

	TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
	TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
	TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
	TEST(collapse_single_pte_entry, madvise_context, anon_ops);
	TEST(collapse_single_pte_entry, madvise_context, file_ops);
	TEST(collapse_single_pte_entry, madvise_context, shmem_ops);

	TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
	TEST(collapse_max_ptes_none, madvise_context, anon_ops);
	TEST(collapse_max_ptes_none, madvise_context, file_ops);

	TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
	TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
	TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
	TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);

	TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
	TEST(collapse_full_of_compound, khugepaged_context, file_ops);
	TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
	TEST(collapse_full_of_compound, madvise_context, anon_ops);
	TEST(collapse_full_of_compound, madvise_context, file_ops);
	TEST(collapse_full_of_compound, madvise_context, shmem_ops);

	TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
	TEST(collapse_compound_extreme, madvise_context, anon_ops);

	TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
	TEST(collapse_swapin_single_pte, madvise_context, anon_ops);

	TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_swap, madvise_context, anon_ops);

	TEST(collapse_fork, khugepaged_context, anon_ops);
	TEST(collapse_fork, madvise_context, anon_ops);

	TEST(collapse_fork_compound, khugepaged_context, anon_ops);
	TEST(collapse_fork_compound, madvise_context, anon_ops);

	TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_shared, madvise_context, anon_ops);

	TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
	TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
	TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);

	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);

	/* Does not return: exits with exit_status after restoring the settings. */
	restore_settings(0);
}