chromium/chromeos/ash/components/memory/userspace_swap/userspace_swap.cc

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chromeos/ash/components/memory/userspace_swap/userspace_swap.h"

#include <atomic>
#include <cstdint>
#include <functional>
#include <map>
#include <optional>
#include <random>
#include <set>
#include <vector>

#include "base/feature_list.h"
#include "base/files/file_util.h"
#include "base/memory/page_size.h"
#include "base/metrics/field_trial_params.h"
#include "base/process/process_handle.h"
#include "base/rand_util.h"
#include "base/ranges/algorithm.h"
#include "base/time/time.h"
#include "build/build_config.h"
#include "chromeos/ash/components/memory/aligned_memory.h"
#include "chromeos/ash/components/memory/pagemap.h"
#include "chromeos/ash/components/memory/userspace_swap/region.h"
#include "chromeos/ash/components/memory/userspace_swap/swap_storage.h"
#include "chromeos/ash/components/memory/userspace_swap/userfaultfd.h"
#include "chromeos/ash/components/memory/userspace_swap/userspace_swap.mojom-forward.h"
#include "chromeos/ash/components/memory/userspace_swap/userspace_swap.mojom.h"
#include "mojo/public/cpp/bindings/remote.h"
#include "partition_alloc/address_pool_manager.h"
#include "partition_alloc/buildflags.h"
#include "partition_alloc/partition_address_space.h"
#include "partition_alloc/partition_alloc_constants.h"
#include "services/resource_coordinator/public/cpp/memory_instrumentation/os_metrics.h"
#include "third_party/abseil-cpp/absl/utility/utility.h"

namespace ash {
namespace memory {
namespace userspace_swap {

namespace {

using memory_instrumentation::mojom::VmRegion;

// NOTE: Descriptions for these feature params can be found in the userspace
// swap header file for the UserspaceSwapConfig struct.
//
// Every FeatureParam below is attached to |kUserspaceSwap|. The string literal
// is the parameter's runtime name and the trailing initializer is its default
// value. UserspaceSwapConfig::Get() reads each of them when it builds the
// config object.
BASE_FEATURE(kUserspaceSwap,
             "UserspaceSwapEnabled",
             base::FEATURE_DISABLED_BY_DEFAULT);
const base::FeatureParam<int> kUserspaceSwapPagesPerRegion = {
    &kUserspaceSwap, "UserspaceSwapPagesPerRegion", 16};
const base::FeatureParam<bool> kUserspaceSwapCompressedSwapFile = {
    &kUserspaceSwap, "UserspaceSwapCompressedSwapFile", true};
// The three *MB parameters are expressed in megabytes;
// UserspaceSwapConfig::Get() converts them to bytes.
const base::FeatureParam<int> kUserspaceSwapMinSwapDeviceSpaceAvailMB = {
    &kUserspaceSwap, "UserspaceSwapMinSwapDeviceSpaceAvailMB", 128};
// NOTE(review): the parameter name strings of the next two params say
// "...MaximumSwapSpaceMB" while the identifiers say "...MaximumSwapDiskSpaceMB".
// The strings are the runtime lookup keys, so renaming them would change which
// configured value is read -- confirm before touching.
const base::FeatureParam<int> kUserspaceSwapMaximumSwapDiskSpaceMB = {
    &kUserspaceSwap, "UserspaceSwapMaximumSwapSpaceMB", 1024};
const base::FeatureParam<int> kUserspaceSwapRendererMaximumSwapDiskSpaceMB = {
    &kUserspaceSwap, "UserspaceSwapRendererMaximumSwapSpaceMB", 128};
const base::FeatureParam<int> kUserspaceSwapRendererRegionLimitPerSwap = {
    &kUserspaceSwap, "UserspaceSwapRendererRegionLimitPerSwap", 100};
// The four *Sec parameters are expressed in seconds;
// UserspaceSwapConfig::Get() wraps them with base::Seconds().
const base::FeatureParam<int> kUserspaceSwapBlockedRefaultTimeSec = {
    &kUserspaceSwap, "UserspaceSwapBlockedRefaultTimeSec", 45};
const base::FeatureParam<int>
    kUserspaceSwapModeratePressureGraphWalkFrequencySec = {
        &kUserspaceSwap, "UserspaceSwapModeratePressureGraphWalkFrequencySec",
        60};
const base::FeatureParam<int> kUserspaceSwapProcessSwapFrequencySec = {
    &kUserspaceSwap, "UserspaceSwapProcessSwapFrequencySec", 120};
const base::FeatureParam<int> kUserspaceSwapInvisibleTimeBeforeSwapSec = {
    &kUserspaceSwap, "UserspaceSwapInvisibleTimeBeforeSwapSec", 60};
const base::FeatureParam<bool> kUserspaceDoSwapModeratePressure = {
    &kUserspaceSwap, "UserspaceSwapDoSwapOnModeratePressure", true};
const base::FeatureParam<bool> kUserspaceDoSwapOnFreeze = {
    &kUserspaceSwap, "UserspaceSwapDoSwapOnFreeze", true};
// NOTE(review): "Suffle" in the parameter name looks like a misspelling of
// "Shuffle". It is the runtime lookup key, so correcting it is a behavior
// change for any existing configuration that uses the current spelling.
const base::FeatureParam<bool> kUserspaceSwapShuffleMapsOrder = {
    &kUserspaceSwap, "UserspaceSwapSuffleMapsOrder", true};

// g_global_disk_usage_bytes is the running total of the |swap_size| deltas
// that every renderer applies through AccountSwapSpace/UnaccountSwapSpace, i.e.
// the sum of the per-renderer |on_disk_bytes_| counters. We keep track of this
// number because we need to enforce the global total swap limit. This value is
// safe to be fetched from any sequence.
std::atomic<uint64_t> g_global_disk_usage_bytes{0};

// This is the sum of all |reclaimed_bytes_| values from each renderer. This
// value is safe to be fetched from any sequence.
std::atomic<uint64_t> g_global_reclaimed_bytes{0};

// This is our global swap kill switch. DisableSwapGlobally() clears it and
// IsSwapAllowedGlobally() reads it; nothing visible in this file sets it back
// to true.
std::atomic<bool> g_global_swap_allowed{true};

class RendererSwapDataImpl : public RendererSwapData {
 public:
  RendererSwapDataImpl(
      int render_process_host_id,
      base::ProcessId pid,
      std::unique_ptr<UserfaultFD> uffd,
      std::unique_ptr<SwapFile> swap_file,
      const Region& swap_remap_area,
      mojo::PendingRemote<::userspace_swap::mojom::UserspaceSwap>
          pending_remote)
      : render_process_host_id_(render_process_host_id),
        pid_(pid),
        uffd_(std::move(uffd)),
        swap_file_(std::move(swap_file)),
        remote_(std::move(pending_remote)) {
    InitializeSwapRemapAreas(swap_remap_area);
    swap_allowed_ = true;
    VLOG(1) << "Created RendererSwapDataImpl for rphid: "
            << render_process_host_id
            << " Swap Remap Area: " << swap_remap_area;
  }

  ~RendererSwapDataImpl() override;

  // RendererSwapData impl:
  int render_process_host_id() const override;
  bool SwapAllowed() const override;
  void DisallowSwap() override;
  uint64_t SwapDiskspaceWrittenBytes() const override;
  uint64_t SwapDiskspaceUsedBytes() const override;
  uint64_t ReclaimedBytes() const override;

  // Account/UnaccountSwapSpace update all counters both for this renderer and
  // globally to reflect the swapped space.
  void AccountSwapSpace(int64_t reclaimed, int64_t swap_size);
  void UnaccountSwapSpace(int64_t reclaimed, int64_t swap_size);

  // Break up the large region which a renderer mmap'ed as PROT_NONE into
  // discrete chunks which can be used for the destinations of an
  // MREMAP_DONTUNMAP.
  void InitializeSwapRemapAreas(const Region& swap_remap_area);

  // AllocFromSwapRegion will find a region which can be used as a destination
  // for a call to MovePTEs. This makes swapping easier, because now we just
  // wait to observe the remap event as our indicator that we can read the
  // memory from the process.
  std::optional<Region> AllocFromSwapRegion();
  void DeallocFromSwapRegion(const Region& region);

  // Swap at most |size_limit| bytes worth of memory on this renderer.
  bool Swap(size_t size_limit);

 private:
  void OnReceivedPASuperPages(
      size_t size_limit_bytes,
      std::vector<::userspace_swap::mojom::MemoryRegionPtr> regions);

  // Convert a mojo MemoryRegionPtr into a vector of userspace swap
  // |resident_regions| (which are of a configurable size) and which are fully
  // resident.
  void PASuperPagesToResidentRegions(
      const Pagemap& pagemap,
      const std::vector<::userspace_swap::mojom::MemoryRegionPtr>& regions,
      std::vector<Region>& resident_regions);

  const int render_process_host_id_;
  const base::ProcessId pid_;

  bool swap_allowed_ = false;

  uint64_t on_disk_bytes_ = 0;
  uint64_t reclaimed_bytes_ = 0;

  // Areas which can be used for moving PTEs.
  std::stack<Region> free_swap_dest_areas_;

  std::unique_ptr<UserfaultFD> uffd_;
  std::unique_ptr<SwapFile> swap_file_;

  // The remote is our link to the renderer to perform the operations it needs
  // to allow for swapping a region.
  mojo::Remote<::userspace_swap::mojom::UserspaceSwap> remote_;

  base::WeakPtrFactory<RendererSwapDataImpl> weak_factory_{this};
};

// Defaulted out of line; no explicit teardown is performed here.
RendererSwapDataImpl::~RendererSwapDataImpl() = default;

// Returns the render process host id supplied at construction.
int RendererSwapDataImpl::render_process_host_id() const {
  return render_process_host_id_;
}

// Turns swapping off for this renderer only. Nothing in this class turns it
// back on after construction.
void RendererSwapDataImpl::DisallowSwap() {
  swap_allowed_ = false;
}

// Swapping is permitted only while this renderer's own flag is still set and
// the global kill switch has not been thrown.
bool RendererSwapDataImpl::SwapAllowed() const {
  if (!swap_allowed_) {
    return false;
  }
  return IsSwapAllowedGlobally();
}

// Returns this renderer's |on_disk_bytes_| counter, i.e. the net |swap_size|
// accounted through AccountSwapSpace/UnaccountSwapSpace.
uint64_t RendererSwapDataImpl::SwapDiskspaceWrittenBytes() const {
  return on_disk_bytes_;
}

void RendererSwapDataImpl::InitializeSwapRemapAreas(
    const Region& swap_remap_area) {
  // Break the remap area up into region sized chunks which will be used as
  // the destination of MREMAP_DONTUNMAPs.
  const uint64_t kPagesPerRegion =
      UserspaceSwapConfig::Get().number_of_pages_per_region;
  const size_t kRegionSizeBytes = base::GetPageSize() * kPagesPerRegion;

  for (uint64_t base_addr = swap_remap_area.address;
       (base_addr + kRegionSizeBytes) <=
       (swap_remap_area.address + swap_remap_area.length);
       base_addr += kRegionSizeBytes) {
    free_swap_dest_areas_.push(Region(base_addr, kRegionSizeBytes));
  }
}

// Reports the disk space attributed to this renderer's swap file as the larger
// of two figures: what the swap file itself reports and what we have accounted
// as written. The two can differ because punching a hole may not free a block
// when a region compressed down to a partial block; the block is only released
// once all of it has been punched.
uint64_t RendererSwapDataImpl::SwapDiskspaceUsedBytes() const {
  // The swap file reports its usage in KB; shift to get bytes.
  const uint64_t reported_bytes = swap_file_->GetUsageKB() << 10;

  if (reported_bytes < on_disk_bytes_) {
    return on_disk_bytes_;
  }
  return reported_bytes;
}

// Returns this renderer's |reclaimed_bytes_| counter, i.e. the net |reclaimed|
// accounted through AccountSwapSpace/UnaccountSwapSpace.
uint64_t RendererSwapDataImpl::ReclaimedBytes() const {
  return reclaimed_bytes_;
}

// Applies the signed deltas |reclaimed| and |swap_size| to this renderer's
// counters and to the matching process-wide totals. The counters are unsigned,
// so a negative delta is applied through modular (wrap-around) arithmetic;
// there is no clamping at zero.
void RendererSwapDataImpl::AccountSwapSpace(int64_t reclaimed,
                                            int64_t swap_size) {
  const uint64_t disk_delta = static_cast<uint64_t>(swap_size);
  const uint64_t reclaimed_delta = static_cast<uint64_t>(reclaimed);

  on_disk_bytes_ += disk_delta;
  g_global_disk_usage_bytes.fetch_add(disk_delta);

  reclaimed_bytes_ += reclaimed_delta;
  g_global_reclaimed_bytes.fetch_add(reclaimed_delta);
}

// Inverse of AccountSwapSpace: forwards the negated deltas so that both the
// per-renderer and the global counters are reduced by |reclaimed| and
// |swap_size|.
void RendererSwapDataImpl::UnaccountSwapSpace(int64_t reclaimed,
                                              int64_t swap_size) {
  AccountSwapSpace(-reclaimed, -swap_size);
}

std::optional<Region> RendererSwapDataImpl::AllocFromSwapRegion() {
  if (free_swap_dest_areas_.empty()) {
    return std::nullopt;
  }

  Region r = free_swap_dest_areas_.top();
  free_swap_dest_areas_.pop();
  return r;
}

// Returns |region| to the free pool so a later AllocFromSwapRegion() can hand
// it out again. No check is made that |region| originally came from the pool.
void RendererSwapDataImpl::DeallocFromSwapRegion(const Region& region) {
  free_swap_dest_areas_.push(region);
}

void RendererSwapDataImpl::OnReceivedPASuperPages(
    size_t size_limit_bytes,
    std::vector<::userspace_swap::mojom::MemoryRegionPtr> regions) {
  if (regions.empty())
    return;

  // Now that a list of used superpages is available, break it up into region
  // sized chunks which will be checked using the kernel pagemap.
  // Use this processes pagemap to identify regions which are in core.
  Pagemap pagemap(pid_);
  if (!pagemap.IsValid()) {
    // Dont log an error if the process is dead.
    PLOG_IF(ERROR, errno != ENOENT) << "unable to open pagemap";

    // Further swapping is not permitted.
    DisallowSwap();
    return;
  }

  std::vector<Region> resident_regions;

  PASuperPagesToResidentRegions(pagemap, regions, resident_regions);
  if (UserspaceSwapConfig::Get().shuffle_maps_on_swap) {
    // The regions can be shuffled to avoid always swapping the same regions.
    base::ranges::shuffle(resident_regions, std::default_random_engine());
  }

  if (VLOG_IS_ON(1)) {
    uint64_t total_size = 0;
    for (const auto& r : regions) {
      total_size += r.get()->length;
    }

    VLOG(1) << "Pid: " << pid_ << " got " << regions.size()
            << " memory areas totaling " << (total_size >> 20)
            << " MB the round swappable size limit is: "
            << (size_limit_bytes >> 20) << " MB which contained "
            << resident_regions.size() << " resident regions";
  }

  // TODO: Actually do the swap on the regions that have been determined to be
  // resident in the renderer.
}

// Walks every area in |regions| in steps of one swap region and appends to
// |resident_regions| each step that |pagemap| reports as fully present.
// |resident_regions| is only appended to, never cleared.
void RendererSwapDataImpl::PASuperPagesToResidentRegions(
    const Pagemap& pagemap,
    const std::vector<::userspace_swap::mojom::MemoryRegionPtr>& regions,
    std::vector<Region>& resident_regions) {
  // The region size is the size that is considered for swapping; this is
  // independent of PA SuperPage size and is configurable as a multiple of the
  // system page size.
  const size_t region_size =
      UserspaceSwapConfig::Get().number_of_pages_per_region *
      base::GetPageSize();

  // With a region size of zero the walk below would never advance and could
  // spin forever, so there is nothing meaningful to report.
  if (region_size == 0) {
    return;
  }

  for (const auto& area : regions) {
    // NOTE(review): the loop bound relies on Region's operator<, which is
    // declared in region.h and not visible here -- presumably it means "lies
    // entirely before", which would skip a trailing partial region; confirm.
    Region area_end(area->address + area->length);
    for (Region r(area->address, region_size); r < area_end;
         r.address += region_size) {
      if (pagemap.IsFullyPresent(r.address, r.length)) {
        resident_regions.push_back(r);
      }
    }
  }
}

// Kicks off one swap round for this renderer. Returns false without doing
// anything when swapping is not allowed; otherwise sends the super page query
// to the renderer and returns true. The rest of the round continues in
// OnReceivedPASuperPages once the reply arrives, and is skipped if this object
// has been destroyed by then (the callback is bound to a weak pointer).
bool RendererSwapDataImpl::Swap(size_t size_limit_bytes) {
  if (!SwapAllowed()) {
    return false;
  }

  // The first step is always to ask PA what it is using, after that we will
  // check with the kernel to see which of those areas are actually resident.
  auto on_super_pages =
      base::BindOnce(&RendererSwapDataImpl::OnReceivedPASuperPages,
                     weak_factory_.GetWeakPtr(), size_limit_bytes);
  remote_->GetPartitionAllocSuperPagesUsed(/* max superpages=*/-1,
                                           std::move(on_super_pages));
  return true;
}

}  // namespace

// Default and copy constructors are defaulted out of line.
UserspaceSwapConfig::UserspaceSwapConfig() = default;
UserspaceSwapConfig::UserspaceSwapConfig(const UserspaceSwapConfig& other) =
    default;

// Static
//
// Returns the process-wide configuration. The object is built from the
// feature and its parameters on the first call and the same instance is
// returned on every later call, so later parameter changes are not picked up.
COMPONENT_EXPORT(USERSPACE_SWAP)
const UserspaceSwapConfig& UserspaceSwapConfig::Get() {
  static UserspaceSwapConfig config = []() -> UserspaceSwapConfig {
    // Converts a size given in MB to bytes. The value is widened to 64 bits
    // *before* shifting: shifting the raw int overflows for any configured
    // value of 2048 MB or more.
    const auto mb_to_bytes = [](int megabytes) -> uint64_t {
      return static_cast<uint64_t>(megabytes) << 20;
    };

    UserspaceSwapConfig parsed = {};

    // Populate the config object.
    parsed.enabled = base::FeatureList::IsEnabled(kUserspaceSwap);
    parsed.number_of_pages_per_region = kUserspaceSwapPagesPerRegion.Get();
    parsed.use_compressed_swap_file = kUserspaceSwapCompressedSwapFile.Get();
    parsed.minimum_swap_disk_space_available =
        mb_to_bytes(kUserspaceSwapMinSwapDeviceSpaceAvailMB.Get());
    parsed.maximum_swap_disk_space_bytes =
        mb_to_bytes(kUserspaceSwapMaximumSwapDiskSpaceMB.Get());
    parsed.renderer_maximum_disk_swap_file_size_bytes =
        mb_to_bytes(kUserspaceSwapRendererMaximumSwapDiskSpaceMB.Get());
    parsed.renderer_region_limit_per_swap =
        kUserspaceSwapRendererRegionLimitPerSwap.Get();

    parsed.blocked_refault_time =
        base::Seconds(kUserspaceSwapBlockedRefaultTimeSec.Get());
    parsed.graph_walk_frequency = base::Seconds(
        kUserspaceSwapModeratePressureGraphWalkFrequencySec.Get());
    parsed.process_swap_frequency =
        base::Seconds(kUserspaceSwapProcessSwapFrequencySec.Get());
    parsed.invisible_time_before_swap =
        base::Seconds(kUserspaceSwapInvisibleTimeBeforeSwapSec.Get());

    parsed.swap_on_moderate_pressure = kUserspaceDoSwapModeratePressure.Get();
    parsed.swap_on_freeze = kUserspaceDoSwapOnFreeze.Get();
    parsed.shuffle_maps_on_swap = kUserspaceSwapShuffleMapsOrder.Get();

    return parsed;
  }();

  return config;
}

// Stream inserter that prints a UserspaceSwapConfig, one "name: value" pair per
// line. When the feature is disabled only the |enabled| line is written.
std::ostream& operator<<(std::ostream& out, const UserspaceSwapConfig& c) {
  out << "UserspaceSwapConfig enabled: " << c.enabled << "\n";
  if (!c.enabled) {
    return out;
  }

  out << "number_of_pages_per_region: " << c.number_of_pages_per_region << "\n"
      << "use_compressed_swap: " << c.use_compressed_swap_file << "\n"
      << "minimum_swap_disk_space_available: "
      << c.minimum_swap_disk_space_available << "\n"
      << "maximum_swap_disk_space_bytes: " << c.maximum_swap_disk_space_bytes
      << "\n"
      << "renderer_maximum_disk_swap_file_size_bytes: "
      << c.renderer_maximum_disk_swap_file_size_bytes << "\n"
      << "renderer_region_limit_per_swap: " << c.renderer_region_limit_per_swap
      << "\n"
      << "blocked_refault_time: " << c.blocked_refault_time << "\n"
      << "graph_walk_frequency: " << c.graph_walk_frequency << "\n"
      << "process_swap_frequency: " << c.process_swap_frequency << "\n"
      << "invisible_time_before_swap: " << c.invisible_time_before_swap << "\n"
      << "swap_on_freeze: " << c.swap_on_freeze << "\n"
      << "swap_on_moderate_pressure: " << c.swap_on_moderate_pressure << "\n"
      << "shuffle_maps_on_swap: " << c.shuffle_maps_on_swap << "\n";
  return out;
}

// KernelSupportsUserspaceSwap reports whether everything userspace swap needs
// is available: a build with 64-bit PartitionAlloc-as-malloc, userfaultfd, and
// mremap with MREMAP_DONTUNMAP. Both kernel probes run once, on the first
// call, and their results are cached.
COMPONENT_EXPORT(USERSPACE_SWAP) bool KernelSupportsUserspaceSwap() {
#if !PA_BUILDFLAG(USE_PARTITION_ALLOC_AS_MALLOC) || \
    !PA_BUILDFLAG(HAS_64_BIT_POINTERS)
  // Only 64bit PartitionAlloc is supported at the moment.
  return false;
#else
  static bool has_userfaultfd = UserfaultFD::KernelSupportsUserfaultFD();

  // Probe for MREMAP_DONTUNMAP by remapping a throwaway one-page private
  // anonymous mapping.
  static bool has_mremap_dontunmap = []() -> bool {
    const size_t probe_size = base::GetPageSize();
    void* const probe_src = mmap(nullptr, probe_size, PROT_NONE,
                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (probe_src == MAP_FAILED) {
      return false;
    }

    // A remap this simple should only fail if MREMAP_DONTUNMAP isn't
    // supported.
    void* const probe_dst = mremap(probe_src, probe_size, probe_size,
                                   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0);
    const bool remapped = probe_dst != MAP_FAILED;
    if (remapped) {
      munmap(probe_dst, probe_size);
    }
    munmap(probe_src, probe_size);
    return remapped;
  }();

  return has_userfaultfd && has_mremap_dontunmap;
#endif  // !PA_BUILDFLAG(USE_PARTITION_ALLOC_AS_MALLOC) ||
        // !PA_BUILDFLAG(HAS_64_BIT_POINTERS)
}

// The interface's constructor and destructor are defaulted out of line.
RendererSwapData::RendererSwapData() = default;
RendererSwapData::~RendererSwapData() = default;

// static
//
// Factory for the file-local RendererSwapDataImpl. Ownership of |uffd|,
// |swap_file| and |remote| passes to the returned object; every argument is
// forwarded to the implementation's constructor unchanged.
COMPONENT_EXPORT(USERSPACE_SWAP)
std::unique_ptr<RendererSwapData> RendererSwapData::Create(
    int render_process_host_id,
    base::ProcessId pid,
    std::unique_ptr<UserfaultFD> uffd,
    std::unique_ptr<SwapFile> swap_file,
    const Region& swap_remap_area,
    mojo::PendingRemote<::userspace_swap::mojom::UserspaceSwap> remote) {
  std::unique_ptr<RendererSwapData> created =
      std::make_unique<RendererSwapDataImpl>(
          render_process_host_id, pid, std::move(uffd), std::move(swap_file),
          swap_remap_area, std::move(remote));
  return created;
}

// True when the feature is enabled in the config and the kernel supports it.
// Both answers are computed on the first call (the kernel probe runs even when
// the feature is disabled) and cached for later calls.
COMPONENT_EXPORT(USERSPACE_SWAP) bool UserspaceSwapSupportedAndEnabled() {
  static bool feature_enabled = UserspaceSwapConfig::Get().enabled;
  static bool kernel_supported = KernelSupportsUserspaceSwap();
  if (!kernel_supported) {
    return false;
  }
  return feature_enabled;
}

// Starts a swap round of at most |size_limit_bytes| on the renderer described
// by |data| and returns what RendererSwapDataImpl::Swap() returns.
//
// |data| must be non-null and must be a RendererSwapDataImpl;
// RendererSwapData::Create() in this file produces exactly that type.
COMPONENT_EXPORT(USERSPACE_SWAP)
bool SwapRenderer(RendererSwapData* data, size_t size_limit_bytes) {
  // A base-to-derived conversion is a static_cast: it is checked against the
  // class hierarchy at compile time and applies any pointer adjustment the
  // layout needs, neither of which reinterpret_cast does.
  auto* impl = static_cast<RendererSwapDataImpl*>(data);
  VLOG(1) << "SwapRenderer for rphid " << impl->render_process_host_id();
  return impl->Swap(size_limit_bytes);
}

// Fills |regions| with the address ranges of the PartitionAlloc super pages
// currently marked as used in the regular and BRP pools, merging consecutive
// used super pages of a pool into a single range. At most |max_superpages|
// super pages are reported; a negative value means no limit. |regions| is
// always cleared first. Returns false (leaving |regions| empty) on builds
// without 64-bit PartitionAlloc-as-malloc, true otherwise.
COMPONENT_EXPORT(USERSPACE_SWAP)
bool GetPartitionAllocSuperPagesInUse(
    int32_t max_superpages,
    std::vector<::userspace_swap::mojom::MemoryRegionPtr>& regions) {
  regions.clear();
#if !PA_BUILDFLAG(USE_PARTITION_ALLOC_AS_MALLOC) || \
    !PA_BUILDFLAG(HAS_64_BIT_POINTERS)
  return false;
#else

  // Number of super pages that may still be reported.
  uint32_t budget = max_superpages >= 0 ? max_superpages : UINT32_MAX;

  auto& pool_manager =
      partition_alloc::internal::AddressPoolManager::GetInstance();

  for (partition_alloc::internal::pool_handle ph :
       {partition_alloc::internal::kRegularPoolHandle,
        partition_alloc::internal::kBRPPoolHandle}) {
    uintptr_t pool_base = pool_manager.GetPoolBaseAddress(ph);
    DCHECK(pool_base);

    // The run of consecutive used super pages being accumulated. A start of 0
    // means "no run open".
    uintptr_t run_start = 0;
    uint64_t run_length = 0;

    // Emits the open run (if any) as a region and resets the accumulator.
    auto flush_run = [&]() {
      if (!run_start) {
        return;
      }
      regions.emplace_back(std::in_place, run_start, run_length);
      run_start = 0;
      run_length = 0;
    };

    std::bitset<partition_alloc::kMaxSuperPagesInPool> alloc_bitset;
    pool_manager.GetPoolUsedSuperPages(ph, alloc_bitset);

    for (size_t i = 0; i < alloc_bitset.size() && budget; ++i) {
      if (!alloc_bitset.test(i)) {
        // An unused super page ends whatever run was open.
        flush_run();
        continue;
      }

      --budget;
      if (!run_start) {
        run_start = pool_base + (i * partition_alloc::kSuperPageSize);
      }
      run_length += partition_alloc::kSuperPageSize;
    }

    // The scan may stop (end of pool or budget exhausted) with a run open.
    flush_run();

    if (!budget) {
      break;
    }
  }

  return true;
#endif  // !PA_BUILDFLAG(USE_PARTITION_ALLOC_AS_MALLOC) ||
        // !PA_BUILDFLAG(HAS_64_BIT_POINTERS)
}

// Returns the current process-wide total of reclaimed bytes.
COMPONENT_EXPORT(USERSPACE_SWAP) uint64_t GetGlobalMemoryReclaimed() {
  const uint64_t reclaimed_total = g_global_reclaimed_bytes;
  return reclaimed_total;
}

// Returns the current process-wide total of accounted swap disk usage, in
// bytes.
COMPONENT_EXPORT(USERSPACE_SWAP) uint64_t GetGlobalSwapDiskspaceUsed() {
  const uint64_t disk_usage_total = g_global_disk_usage_bytes;
  return disk_usage_total;
}

// Throws the global kill switch: afterwards IsSwapAllowedGlobally() returns
// false, and with it every renderer's SwapAllowed().
COMPONENT_EXPORT(USERSPACE_SWAP) void DisableSwapGlobally() {
  g_global_swap_allowed.store(false);
}

// Reports whether the global kill switch is still in its "allowed" state.
COMPONENT_EXPORT(USERSPACE_SWAP) bool IsSwapAllowedGlobally() {
  return g_global_swap_allowed.load();
}

}  // namespace userspace_swap
}  // namespace memory
}  // namespace ash