chromium/chromeos/ash/components/memory/userspace_swap/userfaultfd.h

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROMEOS_ASH_COMPONENTS_MEMORY_USERSPACE_SWAP_USERFAULTFD_H_
#define CHROMEOS_ASH_COMPONENTS_MEMORY_USERSPACE_SWAP_USERFAULTFD_H_

#include <list>
#include <memory>

#include "base/component_export.h"
#include "base/files/file_descriptor_watcher_posix.h"
#include "base/files/scoped_file.h"
#include "base/synchronization/lock.h"
#include "base/threading/platform_thread.h"

// Forward declaration of the userfaultfd message structure (the kernel UAPI
// type from <linux/userfaultfd.h>). This header only names the type in
// declarations, so the full definition is only needed by code that actually
// reads or inspects messages.
struct uffd_msg;

namespace ash {
namespace memory {
namespace userspace_swap {

// UserfaultFDHandler is an interface that a class must implement to subscribe
// to UserfaultFD events. You may not receive all events; the events you will
// receive depend on the features used when the userfaultfd is created. It's
// always safe to use the UserfaultFD class associated with this handler during
// a callback, you're guaranteed that a UserfaultFD will always outlive its
// associated handler.
//
// For the purpose of a fault handler we will refer to the events received as
// two different types: pagefault events and non-pagefault events. You're
// guaranteed that pagefault events and non-pagefault events will all be
// delivered in order with respect to their group. Page fault events that
// couldn't be handled will be redelivered later until they are able to be
// handled. This isn't a problem, because when responding to a pagefault event
// any attempt to resolve it using CopyToRange or ZeroRange would fail if PTEs
// already exist or the mapping is gone. For those reasons it's important that
// you always read and handle those non-pagefault events before pagefaults. A
// more concrete example of this would be: suppose you had a MADV_DONTNEED
// racing with a pagefault to be handled. If you were to process the pagefault
// first before observing the Remove event you could potentially restore stale
// memory when the correct action after the MADV_DONTNEED would be to zero the
// range.
class COMPONENT_EXPORT(USERSPACE_SWAP) UserfaultFDHandler {
 public:
  // PagefaultFlags describe the access that triggered a fault and are passed
  // to Pagefault(). Note that kReadFault has the value 0, i.e. it is the
  // absence of the kWriteFault bit, so it cannot be detected with a bitwise
  // AND; check for kWriteFault instead.
  enum PagefaultFlags {
    kReadFault = 0,
    kWriteFault = 1 << 0,
  };

  // A Pagefault callback is delivered on a pagefault in a registered region.
  // |fault_address| is the address that caused the fault, and |fault_flags|
  // specifies any flags related to the fault, such as whether it was a read
  // or a write fault. Finally, if kFeatureThreadID was set when the
  // UserfaultFD was created, |tid| will be set to the id of the thread that
  // caused the fault; otherwise it will be zero.
  //
  // The implementation is responsible for returning true or false: when true
  // is returned the fault was handled; when false is returned the fault will
  // be retried later. The reason for this is that you cannot resolve a fault
  // while mappings are changing.
  virtual bool Pagefault(uintptr_t fault_address,
                         PagefaultFlags fault_flags,
                         base::PlatformThreadId tid) = 0;

  // An Unmapped callback will be delivered when a region or subregion which was
  // registered with the UserfaultFD has been unmapped, either explicitly by an
  // munmap(2) or implicitly by a mremap(2). The range that was unmapped is
  // specified by |range_start| to |range_end|. Unmapped callbacks will only be
  // received if the UserfaultFD was created using the kFeatureUnmap flag.
  //
  // NOTE(review): whether |range_end| is inclusive or exclusive is not stated
  // here; presumably it mirrors the kernel message (exclusive end) -- confirm
  // against the implementation.
  virtual void Unmapped(uintptr_t range_start, uintptr_t range_end) = 0;

  // A Removed callback will be delivered when a region has page table entries
  // removed; this can happen from an madvise(2) with MADV_DONTNEED or
  // MADV_FREE. The range that was removed is specified by |range_start| to
  // |range_end| (same bounds convention as Unmapped). Removed callbacks will
  // only be received if the UserfaultFD was created using the kFeatureRemove
  // flag.
  virtual void Removed(uintptr_t range_start, uintptr_t range_end) = 0;

  // A Remapped callback will be delivered when a region or subregion which was
  // registered with the UserfaultFD has been remapped by a call to mremap(2).
  // The region that was remapped is described by |old_address| and
  // |original_length|; the address the mapping was moved to is given in
  // |new_address|. Remapped callbacks will only be received if the UserfaultFD
  // was created using the kFeatureRemap flag.
  virtual void Remapped(uintptr_t old_address,
                        uintptr_t new_address,
                        uint64_t original_length) = 0;

  // Closed will be invoked when the UserfaultFD receives an EOF (closed) or an
  // error condition. |err| will be set to 0 on EOF or to an errno value if the
  // read failed for an unexpected reason. Closed will always be the final
  // callback a UserfaultFDHandler receives.
  virtual void Closed(int err) = 0;

  // Virtual so that implementations are destroyed correctly when deleted
  // through a pointer to this interface (handlers are held via
  // std::unique_ptr<UserfaultFDHandler>).
  virtual ~UserfaultFDHandler() = default;
};

// UserfaultFD provides an implementation for the userfaultfd(2) system call.
//
// NOTE: All operations on a UserfaultFD expect page-aligned addresses and
// lengths that are a multiple of the page size.
class COMPONENT_EXPORT(USERSPACE_SWAP) UserfaultFD {
 public:
  // Features select which optional events are delivered to the
  // UserfaultFDHandler. Each value occupies a distinct bit.
  // NOTE(review): presumably several features may be OR'd together when
  // calling Create() -- confirm against the implementation.
  enum Features {
    // kFeatureRemap will subscribe to Remapped callbacks.
    kFeatureRemap = 1 << 0,
    // kFeatureUnmap will subscribe to Unmapped callbacks.
    kFeatureUnmap = 1 << 1,
    // kFeatureRemove will subscribe to Removed callbacks (PTEs removed).
    kFeatureRemove = 1 << 2,
    // kFeatureThreadID will cause Pagefault callbacks to include the faulting
    // thread id.
    kFeatureThreadID = 1 << 3,
  };

  // Note: Although it's documented, UFFDIO_REGISTER_MODE_WP is not actually
  // implemented as of the 5.5 kernel, see:
  // https://elixir.bootlin.com/linux/v5.5-rc3/source/fs/userfaultfd.c#L1331
  // We use an enum so this can be added later; it's on track to land in the
  // 5.7 kernel.
  enum RegisterMode {
    // Deferred allows you to register a range but not start receiving fault
    // events on it until you've registered with kRegisterMissing.
    kRegisterDeferred = 0,
    // kRegisterMissing will register a range to receive missing page events
    // (page faults).
    kRegisterMissing = 1 << 0,
  };

  // NOTE(review): the bool-returning operations below presumably return true
  // on success and false on failure (CopyToRange and ZeroRange are documented
  // that way) -- confirm against the implementation.

  // RegisterRange will register the address range starting at |range_start|
  // and spanning |len| bytes with the userfaultfd, using |mode|.
  bool RegisterRange(RegisterMode mode, uintptr_t range_start, uint64_t len);

  // UnregisterRange will unregister the address range starting at
  // |range_start| and spanning |len| bytes from the userfaultfd.
  bool UnregisterRange(uintptr_t range_start, uint64_t len);

  // CopyToRange will resolve a fault by using the UFFDIO_COPY ioctl, copying
  // |len| bytes from |src_range_start| to |dest_range_start|. This uses the
  // default behavior of waking the blocked task after the fault has been
  // resolved. |copied| will contain the number of bytes copied. It's
  // important to check |copied| when CopyToRange returns false, as it may
  // have copied the pages but still failed to wake the range, resulting in
  // an EAGAIN.
  bool CopyToRange(uintptr_t dest_range_start,
                   uint64_t len,
                   uintptr_t src_range_start,
                   int64_t* copied);

  // ZeroRange will zero fill a range to resolve a fault using the
  // UFFDIO_ZEROPAGE ioctl. Similarly to CopyToRange, the blocked task will be
  // woken after the fault is resolved. |zeroed| will return the number of
  // bytes zeroed; it's important to check |zeroed| even when ZeroRange
  // returns false, as it may have only failed to wake the range and would
  // return EAGAIN in that situation.
  bool ZeroRange(uintptr_t range_start, uint64_t len, int64_t* zeroed);

  // Wake any blocked tasks on the range starting at |range_start| and
  // spanning |len| bytes.
  bool WakeRange(uintptr_t range_start, uint64_t len);

  // StartWaitingForEvents will create a blocking task which will monitor the
  // userfaultfd for events. The ownership of |handler| is transferred to the
  // userfaultfd class, and |handler| will be destroyed when the userfaultfd
  // is closed or if there is an error. UserfaultFDHandler::Closed() will
  // always be called before |handler| is destroyed.
  bool StartWaitingForEvents(std::unique_ptr<UserfaultFDHandler> handler);

  // CloseAndStopWaitingForEvents will trigger userfaultfd to close.
  void CloseAndStopWaitingForEvents();

  // Will return true if the userfaultfd syscall is supported.
  static bool KernelSupportsUserfaultFD();

  // Create will create a new UserfaultFD with the requested |features|.
  // NOTE(review): presumably returns nullptr on failure -- confirm.
  static std::unique_ptr<UserfaultFD> Create(Features features);

  // WrapFD is used to take a donated FD and assume ownership of it.
  static std::unique_ptr<UserfaultFD> WrapFD(base::ScopedFD fd);

  // Not copyable or assignable.
  UserfaultFD(const UserfaultFD&) = delete;
  UserfaultFD& operator=(const UserfaultFD&) = delete;

  ~UserfaultFD();

  // Returns the file descriptor as a base::ScopedFD.
  // NOTE(review): judging by the name this gives up ownership of |fd_|,
  // leaving this object without a usable userfaultfd -- confirm.
  base::ScopedFD ReleaseFD();

 private:
  friend class UserfaultFDTest;

  // Instances are obtained through Create() or WrapFD(); the constructor
  // takes a base::ScopedFD by value.
  explicit UserfaultFD(base::ScopedFD fd);

  // NOTE(review): presumably invoked via |watcher_controller_| when the
  // userfaultfd becomes readable -- confirm.
  void UserfaultFDReadable();

  // NOTE(review): presumably routes a single message to the matching
  // |handler_| callback; the meaning of the return value is not visible here
  // -- confirm.
  bool DispatchMessage(const uffd_msg& msg);

  // DrainPendingFaults will attempt to deliver any pending fault messages.
  bool DrainPendingFaults();

  // Because userfaultfd will return -EAGAIN when the memory maps are changing
  // until the remap, unmap, or remove message has been read off the userfaultfd
  // we provide a mechanism for users to re-enqueue the fault to be delivered
  // again.
  std::list<uffd_msg> pending_faults_;

  // We need to make sure messages are read and posted in order so we prevent
  // two different threads from simultaneously reading and posting.
  base::Lock read_lock_;

  // The owned userfaultfd file descriptor.
  base::ScopedFD fd_;

  // The handler passed to StartWaitingForEvents(); owned by this object.
  std::unique_ptr<UserfaultFDHandler> handler_;
  // NOTE(review): presumably keeps the readability watch on |fd_| alive while
  // waiting for events -- confirm.
  std::unique_ptr<base::FileDescriptorWatcher::Controller> watcher_controller_;
};

}  // namespace userspace_swap
}  // namespace memory
}  // namespace ash

#endif  // CHROMEOS_ASH_COMPONENTS_MEMORY_USERSPACE_SWAP_USERFAULTFD_H_