//===- PluginInterface.h - Target independent plugin device interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Common parts of the target-independent plugin device interface.
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
#include <algorithm>
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <ctime>
#include <deque>
#include <list>
#include <map>
#include <mutex>
#include <set>
#include <shared_mutex>
#include <string>
#include <vector>
#include "ExclusiveAccess.h"
#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
#include "Shared/EnvironmentVar.h"
#include "Shared/Requirements.h"
#include "Shared/Utils.h"
#include "GlobalHandler.h"
#include "JIT.h"
#include "MemoryManager.h"
#include "RPC.h"
#include "omptarget.h"
#ifdef OMPT_SUPPORT
#include "omp-tools.h"
#endif
#include "llvm/ADT/SmallVector.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBufferRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetParser/Triple.h"
namespace llvm {
namespace omp {
namespace target {
namespace plugin {
struct GenericPluginTy;
struct GenericKernelTy;
struct GenericDeviceTy;
struct RecordReplayTy;
/// Class that wraps the __tgt_async_info to simplify its usage. In case the
/// object is constructed without a valid __tgt_async_info, the object will use
/// an internal one and will synchronize the current thread with the pending
/// operations when calling AsyncInfoWrapperTy::finalize(). This latter function
/// must be called before destroying the wrapper object.
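///
/// A typical plugin entry point would use it as follows (a usage sketch;
/// `submitImpl` is a hypothetical plugin-specific helper, not part of this
/// interface):
///
/// \code
///   Error submitData(GenericDeviceTy &Device, void *TgtPtr, void *HstPtr,
///                    int64_t Size, __tgt_async_info *AsyncInfo) {
///     AsyncInfoWrapperTy AsyncInfoWrapper(Device, AsyncInfo);
///     Error Err = submitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
///     // finalize() must be called exactly once before the wrapper is
///     // destroyed; it synchronizes if the wrapper created an internal
///     // async info and updates Err with the synchronization result.
///     AsyncInfoWrapper.finalize(Err);
///     return Err;
///   }
/// \endcode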
struct AsyncInfoWrapperTy {
AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr);
~AsyncInfoWrapperTy() {
assert(!AsyncInfoPtr && "AsyncInfoWrapperTy not finalized");
}
/// Get the raw __tgt_async_info pointer.
operator __tgt_async_info *() const { return AsyncInfoPtr; }
  /// Indicate whether there is a queue.
bool hasQueue() const { return (AsyncInfoPtr->Queue != nullptr); }
/// Get the queue.
template <typename Ty> Ty getQueueAs() {
static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
"Queue is not of the same size as target type");
return static_cast<Ty>(AsyncInfoPtr->Queue);
}
/// Set the queue.
template <typename Ty> void setQueueAs(Ty Queue) {
static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
"Queue is not of the same size as target type");
assert(!AsyncInfoPtr->Queue && "Overwriting queue");
AsyncInfoPtr->Queue = Queue;
}
/// Synchronize with the __tgt_async_info's pending operations if it's the
  /// internal async info. The error associated with the asynchronous operations
/// issued in this queue must be provided in \p Err. This function will update
/// the error parameter with the result of the synchronization if it was
/// actually executed. This function must be called before destroying the
/// object and only once.
void finalize(Error &Err);
  /// Register \p Ptr as an associated allocation that is freed after
/// finalization.
void freeAllocationAfterSynchronization(void *Ptr) {
AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
}
private:
GenericDeviceTy &Device;
__tgt_async_info LocalAsyncInfo;
__tgt_async_info *AsyncInfoPtr;
};
/// The information level represents the level of a key-value property in the
/// info tree print (i.e. indentation). The first level should be the default.
enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 };
/// Class for storing device information to be printed later. An object of this
/// type acts as a queue of key-value properties. Each property has a key, a
/// value, and an optional unit for the value. For printing purposes, the
/// information can be classified into several levels. These levels are useful
/// for defining sections and subsections. Thus, each key-value property also
/// has an additional field indicating the level to which it belongs. Notice that
/// we use the level to determine the indentation of the key-value property at
/// printing time. See the enum InfoLevelKind for the list of accepted levels.
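///
/// For instance (a usage sketch):
///
/// \code
///   InfoQueueTy Info;
///   Info.add("Device Number", 0);
///   Info.add<InfoLevel2>("Global Memory", 16, "GB");  // Nested key-value.
///   Info.add<InfoLevel2>("Unified Memory", true);     // Printed as "Yes".
///   Info.print();
/// \endcode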
class InfoQueueTy {
struct InfoQueueEntryTy {
std::string Key;
std::string Value;
std::string Units;
uint64_t Level;
};
std::deque<InfoQueueEntryTy> Queue;
public:
/// Add a new info entry to the queue. The entry requires at least a key
/// string in \p Key. The value in \p Value is optional and can be any type
/// that is representable as a string. The units in \p Units is optional and
/// must be a string. The info level is a template parameter that defaults to
/// the first level (top level).
template <InfoLevelKind L = InfoLevel1, typename T = std::string>
void add(const std::string &Key, T Value = T(),
const std::string &Units = std::string()) {
assert(!Key.empty() && "Invalid info key");
// Convert the value to a string depending on its type.
if constexpr (std::is_same_v<T, bool>)
Queue.push_back({Key, Value ? "Yes" : "No", Units, L});
else if constexpr (std::is_arithmetic_v<T>)
Queue.push_back({Key, std::to_string(Value), Units, L});
else
Queue.push_back({Key, Value, Units, L});
}
/// Print all info entries added to the queue.
void print() const {
    // We print four spaces for each level.
constexpr uint64_t IndentSize = 4;
// Find the maximum key length (level + key) to compute the individual
// indentation of each entry.
uint64_t MaxKeySize = 0;
for (const auto &Entry : Queue) {
uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize;
if (KeySize > MaxKeySize)
MaxKeySize = KeySize;
}
// Print all info entries.
for (const auto &Entry : Queue) {
// Compute the indentations for the current entry.
uint64_t KeyIndentSize = Entry.Level * IndentSize;
uint64_t ValIndentSize =
MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize;
llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key
<< std::string(ValIndentSize, ' ') << Entry.Value
<< (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n";
}
}
};
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
class DeviceImageTy {
  /// Image identifier within the corresponding device. Notice that this id is
  /// not unique between different devices; they may overlap.
int32_t ImageId;
/// The pointer to the raw __tgt_device_image.
const __tgt_device_image *TgtImage;
const __tgt_device_image *TgtImageBitcode;
/// Reference to the device this image is loaded on.
GenericDeviceTy &Device;
  /// If this image has any global destructors that must be called.
/// FIXME: This is only required because we currently have no invariants
/// towards the lifetime of the underlying image. We should either copy
/// the image into memory locally or erase the pointers after init.
bool PendingGlobalDtors;
public:
DeviceImageTy(int32_t Id, GenericDeviceTy &Device,
const __tgt_device_image *Image)
: ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device),
PendingGlobalDtors(false) {
assert(TgtImage && "Invalid target image");
}
/// Get the image identifier within the device.
int32_t getId() const { return ImageId; }
/// Get the device that this image is loaded onto.
GenericDeviceTy &getDevice() const { return Device; }
/// Get the pointer to the raw __tgt_device_image.
const __tgt_device_image *getTgtImage() const { return TgtImage; }
void setTgtImageBitcode(const __tgt_device_image *TgtImageBitcode) {
this->TgtImageBitcode = TgtImageBitcode;
}
const __tgt_device_image *getTgtImageBitcode() const {
return TgtImageBitcode;
}
/// Get the image starting address.
void *getStart() const { return TgtImage->ImageStart; }
/// Get the image size.
size_t getSize() const {
return utils::getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
}
/// Get a memory buffer reference to the whole image.
MemoryBufferRef getMemoryBuffer() const {
return MemoryBufferRef(StringRef((const char *)getStart(), getSize()),
"Image");
}
  /// Accessors for the pending global destructors flag.
bool setPendingGlobalDtors() { return PendingGlobalDtors = true; }
bool hasPendingGlobalDtors() const { return PendingGlobalDtors; }
};
/// Class implementing common functionalities of offload kernels. Each plugin
/// should define the specific kernel class, derive from this generic one, and
/// implement the necessary virtual function members.
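///
/// A plugin would typically specialize it as in the following sketch (the
/// MyKernelTy name and the bodies are illustrative assumptions):
///
/// \code
///   struct MyKernelTy : public GenericKernelTy {
///     MyKernelTy(const char *Name) : GenericKernelTy(Name) {}
///     Error initImpl(GenericDeviceTy &GenericDevice,
///                    DeviceImageTy &Image) override {
///       // Resolve the kernel symbol in the loaded image here.
///       return Plugin::success();
///     }
///     Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
///                      uint64_t NumBlocks, KernelArgsTy &KernelArgs,
///                      KernelLaunchParamsTy LaunchParams,
///                      AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
///       // Enqueue the kernel on the device queue here.
///       return Plugin::success();
///     }
///   };
/// \endcode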
struct GenericKernelTy {
  /// Construct a kernel with a name and an execution mode.
GenericKernelTy(const char *Name)
: Name(Name), PreferredNumThreads(0), MaxNumThreads(0) {}
virtual ~GenericKernelTy() {}
/// Initialize the kernel object from a specific device.
Error init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image);
virtual Error initImpl(GenericDeviceTy &GenericDevice,
DeviceImageTy &Image) = 0;
/// Launch the kernel on the specific device. The device must be the same
/// one used to initialize the kernel.
Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
/// Get the kernel name.
const char *getName() const { return Name; }
/// Get the kernel image.
DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!");
return *ImagePtr;
}
  /// Return the kernel environment object for this kernel.
const KernelEnvironmentTy &getKernelEnvironmentForKernel() {
return KernelEnvironment;
}
/// Return a device pointer to a new kernel launch environment.
Expected<KernelLaunchEnvironmentTy *>
getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
AsyncInfoWrapperTy &AsyncInfo) const;
/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
switch (ExecutionMode) {
case OMP_TGT_EXEC_MODE_SPMD:
case OMP_TGT_EXEC_MODE_GENERIC:
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
return true;
}
return false;
}
protected:
/// Get the execution mode name of the kernel.
const char *getExecutionModeName() const {
switch (KernelEnvironment.Configuration.ExecMode) {
case OMP_TGT_EXEC_MODE_SPMD:
return "SPMD";
case OMP_TGT_EXEC_MODE_GENERIC:
return "Generic";
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
return "Generic-SPMD";
}
llvm_unreachable("Unknown execution mode!");
}
/// Prints generic kernel launch information.
Error printLaunchInfo(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs, uint32_t NumThreads,
uint64_t NumBlocks) const;
  /// Prints plugin-specific kernel launch information after the generic kernel
  /// launch information.
virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
uint32_t NumThreads,
uint64_t NumBlocks) const;
private:
/// Prepare the arguments before launching the kernel.
KernelLaunchParamsTy
prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
llvm::SmallVectorImpl<void *> &Args,
llvm::SmallVectorImpl<void *> &Ptrs,
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
/// Get the number of threads and blocks for the kernel based on the
/// user-defined threads and block clauses.
uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
uint32_t ThreadLimitClause[3]) const;
  /// The number of threads \p NumThreads can be adjusted by this method.
  /// \p IsNumThreadsFromUser is true if \p NumThreads was defined by the user
  /// via the thread_limit clause.
uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
/// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
bool isGenericSPMDMode() const {
return KernelEnvironment.Configuration.ExecMode ==
OMP_TGT_EXEC_MODE_GENERIC_SPMD;
}
bool isGenericMode() const {
return KernelEnvironment.Configuration.ExecMode ==
OMP_TGT_EXEC_MODE_GENERIC;
}
bool isSPMDMode() const {
return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_SPMD;
}
/// The kernel name.
const char *Name;
/// The image that contains this kernel.
DeviceImageTy *ImagePtr = nullptr;
protected:
/// The preferred number of threads to run the kernel.
uint32_t PreferredNumThreads;
/// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads;
/// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment;
/// The prototype kernel launch environment.
KernelLaunchEnvironmentTy KernelLaunchEnvironment;
/// If the kernel is a bare kernel.
bool IsBareKernel = false;
};
/// Information about an allocation, when it has been allocated, and when/if it
/// has been deallocated, for error reporting purposes.
struct AllocationTraceInfoTy {
/// The stack trace of the allocation itself.
std::string AllocationTrace;
/// The stack trace of the deallocation, or empty.
std::string DeallocationTrace;
/// The allocated device pointer.
void *DevicePtr = nullptr;
/// The corresponding host pointer (can be null).
void *HostPtr = nullptr;
/// The size of the allocation.
uint64_t Size = 0;
/// The kind of the allocation.
TargetAllocTy Kind = TargetAllocTy::TARGET_ALLOC_DEFAULT;
/// Information about the last allocation at this address, if any.
AllocationTraceInfoTy *LastAllocationInfo = nullptr;
/// Lock to keep accesses race free.
std::mutex Lock;
};
/// Information about a launched kernel, for error reporting purposes.
struct KernelTraceInfoTy {
/// The launched kernel.
GenericKernelTy *Kernel;
/// The stack trace of the launch itself.
std::string LaunchTrace;
/// The async info the kernel was launched in.
__tgt_async_info *AsyncInfo;
};
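/// Ring buffer of the most recent kernel launch traces, for error reporting
/// purposes. For example (a usage sketch):
///
/// \code
///   KernelTraceInfoRecordTy Record;
///   Record.emplace(Kernel, std::move(StackTrace), AsyncInfo);
///   // The most recent launch is at index 0; older launches follow.
///   KernelTraceInfoTy Last = Record.getKernelTraceInfo(/*I=*/0);
/// \endcode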
struct KernelTraceInfoRecordTy {
KernelTraceInfoRecordTy() { KTIs.fill({}); }
/// Return the (maximal) record size.
auto size() const { return KTIs.size(); }
/// Create a new kernel trace info and add it into the record.
void emplace(GenericKernelTy *Kernel, const std::string &&StackTrace,
__tgt_async_info *AsyncInfo) {
KTIs[Idx] = {Kernel, std::move(StackTrace), AsyncInfo};
Idx = (Idx + 1) % size();
}
/// Return the \p I'th last kernel trace info.
auto getKernelTraceInfo(int32_t I) const {
// Note that kernel trace infos "grow forward", so lookup is backwards.
return KTIs[(Idx - I - 1 + size()) % size()];
}
private:
std::array<KernelTraceInfoTy, 8> KTIs;
unsigned Idx = 0;
};
/// Class representing a map of host pinned allocations. We track these pinned
/// allocations, so memory transfers involving these buffers can be optimized.
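///
/// A device would use it roughly as follows (a usage sketch; error handling
/// elided):
///
/// \code
///   PinnedAllocationMapTy PinnedAllocs(Device);
///   // Pin a host buffer and obtain its device accessible pointer.
///   Expected<void *> DevPtrOrErr = PinnedAllocs.lockHostBuffer(HstPtr, Size);
///   // ... perform optimized transfers using *DevPtrOrErr ...
///   Error Err = PinnedAllocs.unlockHostBuffer(HstPtr);
/// \endcode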
class PinnedAllocationMapTy {
/// Struct representing a map entry.
struct EntryTy {
/// The host pointer of the pinned allocation.
void *HstPtr;
    /// The pointer that the device's driver should use to transfer data
    /// from/to the pinned allocation. In most plugins, this pointer will be
    /// the same as the host pointer above.
void *DevAccessiblePtr;
/// The size of the pinned allocation.
size_t Size;
/// Indicate whether the allocation was locked from outside the plugin, for
/// instance, from the application. The externally locked allocations are
/// not unlocked by the plugin when unregistering the last user.
bool ExternallyLocked;
/// The number of references to the pinned allocation. The allocation should
/// remain pinned and registered to the map until the number of references
/// becomes zero.
mutable size_t References;
    /// Create an entry with the host and device accessible pointers, the buffer
/// size, and a boolean indicating whether the buffer was locked externally.
EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size,
bool ExternallyLocked)
: HstPtr(HstPtr), DevAccessiblePtr(DevAccessiblePtr), Size(Size),
ExternallyLocked(ExternallyLocked), References(1) {}
/// Utility constructor used for std::set searches.
EntryTy(void *HstPtr)
: HstPtr(HstPtr), DevAccessiblePtr(nullptr), Size(0),
ExternallyLocked(false), References(0) {}
};
  /// Comparator of map entries. Use the host pointer to enforce an order
/// between entries.
struct EntryCmpTy {
bool operator()(const EntryTy &Left, const EntryTy &Right) const {
return Left.HstPtr < Right.HstPtr;
}
};
typedef std::set<EntryTy, EntryCmpTy> PinnedAllocSetTy;
/// The map of host pinned allocations.
PinnedAllocSetTy Allocs;
/// The mutex to protect accesses to the map.
mutable std::shared_mutex Mutex;
/// Reference to the corresponding device.
GenericDeviceTy &Device;
/// Indicate whether mapped host buffers should be locked automatically.
bool LockMappedBuffers;
  /// Indicate whether failures when locking mapped buffers should be ignored.
bool IgnoreLockMappedFailures;
  /// Find an allocation that intersects with the \p HstPtr pointer. Assume the
/// map's mutex is acquired.
const EntryTy *findIntersecting(const void *HstPtr) const {
if (Allocs.empty())
return nullptr;
// Search the first allocation with starting address that is not less than
// the buffer address.
auto It = Allocs.lower_bound({const_cast<void *>(HstPtr)});
// Direct match of starting addresses.
if (It != Allocs.end() && It->HstPtr == HstPtr)
return &(*It);
    // Not a direct match, but the buffer may be contained in a previous pinned
    // allocation in the map. Return null if there is no such previous
    // allocation.
if (It == Allocs.begin())
return nullptr;
// Move to the previous pinned allocation.
--It;
    // The buffer is contained in the previous pinned allocation.
if (utils::advancePtr(It->HstPtr, It->Size) > HstPtr)
return &(*It);
// None found.
return nullptr;
}
/// Insert an entry to the map representing a locked buffer. The number of
/// references is set to one.
Error insertEntry(void *HstPtr, void *DevAccessiblePtr, size_t Size,
bool ExternallyLocked = false);
/// Erase an existing entry from the map.
Error eraseEntry(const EntryTy &Entry);
  /// Register a new user into an entry that represents a locked buffer. Check
  /// also that the registered buffer with \p HstPtr address and \p Size is
  /// actually contained in the entry.
Error registerEntryUse(const EntryTy &Entry, void *HstPtr, size_t Size);
  /// Unregister a user from the entry and return whether it is the last user.
  /// If it is the last user, the entry will have to be removed from the map
  /// and the entry's host buffer unlocked (if necessary).
Expected<bool> unregisterEntryUse(const EntryTy &Entry);
/// Indicate whether the first range A fully contains the second range B.
static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
void *EndA = utils::advancePtr(PtrA, SizeA);
void *EndB = utils::advancePtr(PtrB, SizeB);
return (PtrB >= PtrA && EndB <= EndA);
}
/// Indicate whether the first range A intersects with the second range B.
static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
void *EndA = utils::advancePtr(PtrA, SizeA);
void *EndB = utils::advancePtr(PtrB, SizeB);
return (PtrA < EndB && PtrB < EndA);
}
public:
/// Create the map of pinned allocations corresponding to a specific device.
PinnedAllocationMapTy(GenericDeviceTy &Device) : Device(Device) {
    // Envar that indicates whether mapped host buffers should be locked
    // automatically. The possible values are boolean (on/off) and a special
    // "mandatory" value:
    //   off:       Mapped host buffers are not locked.
    //   on:        Mapped host buffers are locked in a best-effort approach.
    //              Failures to lock the buffers are silent.
    //   mandatory: Mapped host buffers are always locked and failures to lock
    //              a buffer result in a fatal error.
StringEnvar OMPX_LockMappedBuffers("LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS",
"off");
bool Enabled;
if (StringParser::parse(OMPX_LockMappedBuffers.get().data(), Enabled)) {
// Parsed as a boolean value. Enable the feature if necessary.
LockMappedBuffers = Enabled;
IgnoreLockMappedFailures = true;
} else if (OMPX_LockMappedBuffers.get() == "mandatory") {
// Enable the feature and failures are fatal.
LockMappedBuffers = true;
IgnoreLockMappedFailures = false;
} else {
// Disable by default.
DP("Invalid value LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS=%s\n",
OMPX_LockMappedBuffers.get().data());
LockMappedBuffers = false;
}
}
/// Register a buffer that was recently allocated as a locked host buffer.
/// None of the already registered pinned allocations should intersect with
/// this new one. The registration requires the host pointer in \p HstPtr,
/// the device accessible pointer in \p DevAccessiblePtr, and the size of the
/// allocation in \p Size. The allocation must be unregistered using the
/// unregisterHostBuffer function.
Error registerHostBuffer(void *HstPtr, void *DevAccessiblePtr, size_t Size);
/// Unregister a host pinned allocation passing the host pointer which was
/// previously registered using the registerHostBuffer function. When calling
/// this function, the pinned allocation cannot have any other user and will
/// not be unlocked by this function.
Error unregisterHostBuffer(void *HstPtr);
  /// Lock the host buffer at \p HstPtr or register a new user if it intersects
  /// with an already existing one. Partially overlapping an existing pinned
  /// allocation while extending beyond it is not allowed. The function returns
  /// the device accessible pointer of the pinned buffer. The buffer must be
  /// unlocked using the unlockHostBuffer function.
Expected<void *> lockHostBuffer(void *HstPtr, size_t Size);
/// Unlock the host buffer at \p HstPtr or unregister a user if other users
/// are still using the pinned allocation. If this was the last user, the
/// pinned allocation is removed from the map and the memory is unlocked.
Error unlockHostBuffer(void *HstPtr);
/// Lock or register a host buffer that was recently mapped by libomptarget.
/// This behavior is applied if LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS is
/// enabled. Even if not enabled, externally locked buffers are registered
/// in order to optimize their transfers.
Error lockMappedHostBuffer(void *HstPtr, size_t Size);
/// Unlock or unregister a host buffer that was unmapped by libomptarget.
Error unlockUnmappedHostBuffer(void *HstPtr);
  /// Return the device accessible pointer associated with the host pinned
  /// allocation to which \p HstPtr belongs, if any. Return null in case
  /// \p HstPtr does not belong to any host pinned allocation. The device
/// accessible pointer is the one that devices should use for data transfers
/// that involve a host pinned buffer.
void *getDeviceAccessiblePtrFromPinnedBuffer(const void *HstPtr) const {
std::shared_lock<std::shared_mutex> Lock(Mutex);
// Find the intersecting allocation if any.
const EntryTy *Entry = findIntersecting(HstPtr);
if (!Entry)
return nullptr;
return utils::advancePtr(Entry->DevAccessiblePtr,
utils::getPtrDiff(HstPtr, Entry->HstPtr));
}
/// Check whether a buffer belongs to a registered host pinned allocation.
bool isHostPinnedBuffer(const void *HstPtr) const {
std::shared_lock<std::shared_mutex> Lock(Mutex);
// Return whether there is an intersecting allocation.
return (findIntersecting(const_cast<void *>(HstPtr)) != nullptr);
}
};
/// Class implementing common functionalities of offload devices. Each plugin
/// should define the specific device class, derive from this generic one, and
/// implement the necessary virtual function members.
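///
/// A minimal derived device might look like the following sketch (names with
/// the "My" prefix are illustrative assumptions; the remaining pure virtual
/// member functions would be declared similarly and defined elsewhere):
///
/// \code
///   struct MyDeviceTy : public GenericDeviceTy {
///     MyDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices)
///         : GenericDeviceTy(Plugin, DeviceId, NumDevices, MyGridValues) {}
///     Error setContext() override { return Plugin::success(); }
///     Error initImpl(GenericPluginTy &Plugin) override;
///     Error deinitImpl() override;
///     // ... remaining pure virtual member functions.
///   };
/// \endcode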
struct GenericDeviceTy : public DeviceAllocatorTy {
/// Construct a device with its device id within the plugin, the number of
/// devices in the plugin and the grid values for that kind of device.
GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
const llvm::omp::GV &GridValues);
/// Get the device identifier within the corresponding plugin. Notice that
/// this id is not unique between different plugins; they may overlap.
int32_t getDeviceId() const { return DeviceId; }
/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
virtual Error setContext() = 0;
/// Initialize the device. After this call, the device should be already
/// working and ready to accept queries or modifications.
Error init(GenericPluginTy &Plugin);
virtual Error initImpl(GenericPluginTy &Plugin) = 0;
/// Deinitialize the device and free all its resources. After this call, the
/// device is no longer considered ready, so no queries or modifications are
/// allowed.
Error deinit(GenericPluginTy &Plugin);
virtual Error deinitImpl() = 0;
  /// Load the binary image into the device and return the loaded image.
Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin,
const __tgt_device_image *TgtImage);
virtual Expected<DeviceImageTy *>
loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
/// Setup the device environment if needed. Notice this setup may not be run
/// on some plugins. By default, it will be executed, but plugins can change
/// this behavior by overriding the shouldSetupDeviceEnvironment function.
Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
/// Setup the global device memory pool, if the plugin requires one.
Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
uint64_t PoolSize);
  /// Setup the RPC server for this device if needed. This may not run on some
  /// plugins like the CPU targets. By default, it will not be executed so it is
  /// up to the target to override this using the shouldSetupRPCServer function.
Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
/// Synchronize the current thread with the pending operations on the
/// __tgt_async_info structure.
Error synchronize(__tgt_async_info *AsyncInfo);
virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
  /// Invokes any global constructors on the device if present and required
  /// by the target.
virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
DeviceImageTy &Image) {
return Error::success();
}
  /// Invokes any global destructors on the device if present and required
  /// by the target.
virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
DeviceImageTy &Image) {
return Error::success();
}
/// Query for the completion of the pending operations on the __tgt_async_info
/// structure in a non-blocking manner.
Error queryAsync(__tgt_async_info *AsyncInfo);
virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0;
/// Check whether the architecture supports VA management
virtual bool supportVAManagement() const { return false; }
/// Get the total device memory size
virtual Error getDeviceMemorySize(uint64_t &DSize);
  /// Allocates \p RSize bytes (rounded up to page size) and hints the driver
  /// to map it to \p VAddr. The obtained address is stored in \p Addr. On
  /// return, \p RSize contains the actual size, which can be equal to or
  /// larger than the requested size.
virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize);
/// De-allocates device memory and unmaps the virtual address \p VAddr
virtual Error memoryVAUnMap(void *VAddr, size_t Size);
/// Allocate data on the device or involving the device.
Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);
/// Deallocate data from the device or involving the device.
Error dataDelete(void *TgtPtr, TargetAllocTy Kind);
/// Pin host memory to optimize transfers and return the device accessible
/// pointer that devices should use for memory transfers involving the host
/// pinned allocation.
Expected<void *> dataLock(void *HstPtr, int64_t Size) {
return PinnedAllocs.lockHostBuffer(HstPtr, Size);
}
/// Unpin a host memory buffer that was previously pinned.
Error dataUnlock(void *HstPtr) {
return PinnedAllocs.unlockHostBuffer(HstPtr);
}
/// Lock the host buffer \p HstPtr with \p Size bytes with the vendor-specific
/// API and return the device accessible pointer.
virtual Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) = 0;
/// Unlock a previously locked host buffer starting at \p HstPtr.
virtual Error dataUnlockImpl(void *HstPtr) = 0;
  /// Mark the host buffer with address \p HstPtr and \p Size bytes as a mapped
  /// buffer. This means that libomptarget created a new mapping of that host
  /// buffer (e.g., because of a user OpenMP target map) and the buffer may be
  /// used as source/destination of memory transfers. We can use this
  /// information to lock the host buffer and optimize its memory transfers.
Error notifyDataMapped(void *HstPtr, int64_t Size) {
return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size);
}
/// Mark the host buffer with address \p HstPtr as unmapped. This means that
/// libomptarget removed an existing mapping. If the plugin locked the buffer
/// in notifyDataMapped, this function should unlock it.
Error notifyDataUnmapped(void *HstPtr) {
return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr);
}
/// Check whether the host buffer with address \p HstPtr is pinned by the
/// underlying vendor-specific runtime (if any). Retrieve the host pointer,
/// the device accessible pointer and the size of the original pinned buffer.
virtual Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
void *&BaseDevAccessiblePtr,
size_t &BaseSize) const = 0;
/// Submit data to the device (host to device transfer).
Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size,
__tgt_async_info *AsyncInfo);
virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
/// Retrieve data from the device (device to host transfer).
Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
__tgt_async_info *AsyncInfo);
virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
/// Exchange data between devices (device to device transfer). Calling this
/// function is only valid if GenericPlugin::isDataExchangable() passing the
/// two devices returns true.
Error dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr,
int64_t Size, __tgt_async_info *AsyncInfo);
virtual Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
/// Run the kernel associated with \p EntryPtr
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
/// Initialize a __tgt_async_info structure. Related to interop features.
Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr);
virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
/// Initialize a __tgt_device_info structure. Related to interop features.
Error initDeviceInfo(__tgt_device_info *DeviceInfo);
virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0;
/// Create an event.
Error createEvent(void **EventPtrStorage);
virtual Error createEventImpl(void **EventPtrStorage) = 0;
/// Destroy an event.
Error destroyEvent(void *Event);
virtual Error destroyEventImpl(void *EventPtr) = 0;
/// Start the recording of the event.
Error recordEvent(void *Event, __tgt_async_info *AsyncInfo);
virtual Error recordEventImpl(void *EventPtr,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
/// Wait for an event to finish. Notice this wait is asynchronous if the
/// __tgt_async_info is not nullptr.
Error waitEvent(void *Event, __tgt_async_info *AsyncInfo);
virtual Error waitEventImpl(void *EventPtr,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
/// Synchronize the current thread with the event.
Error syncEvent(void *EventPtr);
virtual Error syncEventImpl(void *EventPtr) = 0;
/// Print information about the device.
Error printInfo();
virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0;
/// Getters of the grid values.
uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; }
uint32_t getBlockLimit() const { return GridValues.GV_Max_Teams; }
uint32_t getDefaultNumThreads() const {
return GridValues.GV_Default_WG_Size;
}
uint32_t getDefaultNumBlocks() const {
return GridValues.GV_Default_Num_Teams;
}
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
/// Get target compute unit kind (e.g., sm_80, or gfx908).
virtual std::string getComputeUnitKind() const { return "unknown"; }
  /// Post-processing after the JIT backend. The ownership of \p MB is taken.
virtual Expected<std::unique_ptr<MemoryBuffer>>
doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const {
return std::move(MB);
}
/// The minimum number of threads we use for a low-trip count combined loop.
/// Instead of using more threads we increase the outer (block/team)
/// parallelism.
/// @see OMPX_MinThreadsForLowTripCount
virtual uint32_t getMinThreadsForLowTripCountLoop() {
return OMPX_MinThreadsForLowTripCount;
}
/// Whether or not to reuse blocks for high trip count loops.
/// @see OMPX_ReuseBlocksForHighTripCount
bool getReuseBlocksForHighTripCount() {
return OMPX_ReuseBlocksForHighTripCount;
}
/// Get the total amount of hardware parallelism supported by the target
/// device. This is the total amount of warps or wavefronts that can be
/// resident on the device simultaneously.
virtual uint64_t getHardwareParallelism() const { return 0; }
/// Get the RPC server running on this device.
RPCServerTy *getRPCServer() const { return RPCServer; }
/// The number of parallel RPC ports to use on the device. In general, this
/// should be roughly equivalent to the amount of hardware parallelism the
/// device can support. This is because GPUs in general do not have forward
/// progress guarantees, so we minimize thread level dependencies by
/// allocating enough space such that each device thread can have a port. This
/// is likely overly pessimistic in the average case, but guarantees no
  /// deadlocks at the cost of memory. This must be overridden by targets
/// expecting to use the RPC server.
virtual uint64_t requestedRPCPortCount() const {
assert(!shouldSetupRPCServer() && "Default implementation cannot be used");
return 0;
}
virtual Error getDeviceStackSize(uint64_t &V) = 0;
  /// Returns true if the current plugin architecture is an APU
/// and unified_shared_memory was not requested by the program.
bool useAutoZeroCopy();
virtual bool useAutoZeroCopyImpl() { return false; }
/// Allocate and construct a kernel object.
virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
/// Reference to the underlying plugin that created this device.
GenericPluginTy &Plugin;
/// Map to record when allocations have been performed, and when they have
/// been deallocated, both for error reporting purposes.
ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;
  /// Return the allocation trace info for a device pointer, that is, the
  /// allocation into which this device pointer points (or pointed).
AllocationTraceInfoTy *getAllocationTraceInfoForAddr(void *DevicePtr) {
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
for (auto &It : *AllocationTraceMap) {
if (It.first <= DevicePtr &&
utils::advancePtr(It.first, It.second->Size) > DevicePtr)
return It.second;
}
return nullptr;
}
  /// Return the allocation trace info for the allocation closest to the device
  /// pointer, and set \p Distance to the distance between them.
AllocationTraceInfoTy *
getClosestAllocationTraceInfoForAddr(void *DevicePtr, uintptr_t &Distance) {
Distance = 0;
if (auto *ATI = getAllocationTraceInfoForAddr(DevicePtr)) {
return ATI;
}
AllocationTraceInfoTy *ATI = nullptr;
uintptr_t DevicePtrI = uintptr_t(DevicePtr);
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
for (auto &It : *AllocationTraceMap) {
uintptr_t Begin = uintptr_t(It.second->DevicePtr);
uintptr_t End = Begin + It.second->Size - 1;
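      // Note: the subtractions are unsigned and wrap around, so the distance
      // computed on the "wrong" side of the allocation becomes huge and
      // std::min keeps the distance to the nearer end of the allocation.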
uintptr_t ItDistance = std::min(Begin - DevicePtrI, DevicePtrI - End);
if (ATI && ItDistance > Distance)
continue;
ATI = It.second;
Distance = ItDistance;
}
return ATI;
}
  /// Map to record which kernels have been launched, for error reporting
  /// purposes.
ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;
  /// Environment variable to determine the number of kernel launches for which
  /// stack traces are tracked.
UInt32Envar OMPX_TrackNumKernelLaunches =
UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
/// Environment variable to determine if stack traces for allocations and
/// deallocations are tracked.
BoolEnvar OMPX_TrackAllocationTraces =
BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);
private:
  /// Get and set the stack size and heap size for the device. If not used, the
  /// plugin can implement the setters as no-ops and have the getters set the
  /// output value to zero.
virtual Error setDeviceStackSize(uint64_t V) = 0;
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
virtual Error setDeviceHeapSize(uint64_t V) = 0;
/// Indicate whether the device should setup the device environment. Notice
/// that returning false in this function will change the behavior of the
/// setupDeviceEnvironment() function.
virtual bool shouldSetupDeviceEnvironment() const { return true; }
  /// Indicate whether the device should setup the global device memory pool.
  /// If false is returned, the value on the device will be uninitialized.
virtual bool shouldSetupDeviceMemoryPool() const { return true; }
/// Indicate whether or not the device should setup the RPC server. This is
/// only necessary for unhosted targets like the GPU.
virtual bool shouldSetupRPCServer() const { return false; }
/// Pointer to the memory manager or nullptr if not available.
MemoryManagerTy *MemoryManager;
/// Environment variables defined by the OpenMP standard.
Int32Envar OMP_TeamLimit;
Int32Envar OMP_NumTeams;
Int32Envar OMP_TeamsThreadLimit;
/// Environment variables defined by the LLVM OpenMP implementation.
Int32Envar OMPX_DebugKind;
UInt32Envar OMPX_SharedMemorySize;
UInt64Envar OMPX_TargetStackSize;
UInt64Envar OMPX_TargetHeapSize;
/// Environment flag to set the minimum number of threads we use for a
/// low-trip count combined loop. Instead of using more threads we increase
/// the outer (block/team) parallelism.
UInt32Envar OMPX_MinThreadsForLowTripCount =
UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
BoolEnvar OMPX_ReuseBlocksForHighTripCount =
BoolEnvar("LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT", true);
protected:
/// Environment variables defined by the LLVM OpenMP implementation
/// regarding the initial number of streams and events.
UInt32Envar OMPX_InitialNumStreams;
UInt32Envar OMPX_InitialNumEvents;
/// Array of images loaded into the device. Images are automatically
/// deallocated by the allocator.
llvm::SmallVector<DeviceImageTy *> LoadedImages;
/// The identifier of the device within the plugin. Notice this is not a
/// global device id and is not the device id visible to the OpenMP user.
const int32_t DeviceId;
/// The default grid values used for this device.
llvm::omp::GV GridValues;
  /// Enumeration used for representing the current state of the peer access
  /// between two devices (both under the same plugin). The states can be a)
  /// PENDING when the state has not been queried and needs to be queried, b)
  /// AVAILABLE when the peer access is available to be used, and c)
  /// UNAVAILABLE if the system does not allow it.
enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };
  /// Array of peer access states with the rest of the devices. If device I has
  /// PeerAccesses[J] == AVAILABLE, device I can access device J's memory
  /// directly. However, notice this does not mean that device J can access
  /// device I's memory directly.
llvm::SmallVector<PeerAccessState> PeerAccesses;
std::mutex PeerAccessesLock;
  /// Map of host pinned allocations used to optimize device transfers.
PinnedAllocationMapTy PinnedAllocs;
/// A pointer to an RPC server instance attached to this device if present.
/// This is used to run the RPC server during task synchronization.
RPCServerTy *RPCServer;
#ifdef OMPT_SUPPORT
/// OMPT callback functions
#define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr;
FOREACH_OMPT_DEVICE_EVENT(defineOmptCallback)
#undef defineOmptCallback
/// Internal representation for OMPT device (initialize & finalize)
std::atomic<bool> OmptInitialized;
#endif
private:
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
};
/// Class implementing common functionalities of offload plugins. Each plugin
/// should define the specific plugin class, derive from this generic one, and
/// implement the necessary virtual function members.
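///
/// A plugin would typically specialize it as in this sketch (the "My" names
/// are illustrative assumptions):
///
/// \code
///   struct MyPluginTy : public GenericPluginTy {
///     MyPluginTy() : GenericPluginTy(Triple::x86_64) {}
///     Expected<int32_t> initImpl() override;
///     Error deinitImpl() override;
///     GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceID,
///                                   int32_t NumDevices) override;
///     GenericGlobalHandlerTy *createGlobalHandler() override;
///     uint16_t getMagicElfBits() const override;
///     Triple::ArchType getTripleArch() const override;
///     const char *getName() const override { return "MyPlugin"; }
///     Expected<bool> isELFCompatible(uint32_t DeviceID,
///                                    StringRef Image) const override;
///   };
/// \endcode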
struct GenericPluginTy {
/// Construct a plugin instance.
GenericPluginTy(Triple::ArchType TA)
: GlobalHandler(nullptr), JIT(TA), RPCServer(nullptr),
RecordReplay(nullptr) {}
virtual ~GenericPluginTy() {}
/// Initialize the plugin.
Error init();
/// Initialize the plugin and return the number of available devices.
virtual Expected<int32_t> initImpl() = 0;
/// Deinitialize the plugin and release the resources.
Error deinit();
virtual Error deinitImpl() = 0;
/// Create a new device for the underlying plugin.
virtual GenericDeviceTy *createDevice(GenericPluginTy &Plugin,
int32_t DeviceID,
int32_t NumDevices) = 0;
/// Create a new global handler for the underlying plugin.
virtual GenericGlobalHandlerTy *createGlobalHandler() = 0;
/// Get the reference to the device with a certain device id.
GenericDeviceTy &getDevice(int32_t DeviceId) {
assert(isValidDeviceId(DeviceId) && "Invalid device id");
    assert(Devices[DeviceId] && "Device is uninitialized");
return *Devices[DeviceId];
}
/// Get the number of active devices.
int32_t getNumDevices() const { return NumDevices; }
  /// Get the user-visible device identifier for the plugin device \p DeviceId.
int32_t getUserId(int32_t DeviceId) const {
assert(UserDeviceIds.contains(DeviceId) && "No user-id registered");
return UserDeviceIds.at(DeviceId);
}
/// Get the ELF code to recognize the binary image of this plugin.
virtual uint16_t getMagicElfBits() const = 0;
/// Get the target triple of this plugin.
virtual Triple::ArchType getTripleArch() const = 0;
/// Get the constant name identifier for this plugin.
virtual const char *getName() const = 0;
/// Allocate a structure using the internal allocator.
template <typename Ty> Ty *allocate() {
return reinterpret_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty)));
}
/// Get the reference to the global handler of this plugin.
GenericGlobalHandlerTy &getGlobalHandler() {
assert(GlobalHandler && "Global handler not initialized");
return *GlobalHandler;
}
/// Get the reference to the JIT used for all devices connected to this
/// plugin.
JITEngine &getJIT() { return JIT; }
/// Get a reference to the RPC server used to provide host services.
RPCServerTy &getRPCServer() {
assert(RPCServer && "RPC server not initialized");
return *RPCServer;
}
/// Get a reference to the record and replay interface for the plugin.
RecordReplayTy &getRecordReplay() {
assert(RecordReplay && "RR interface not initialized");
return *RecordReplay;
}
/// Initialize a device within the plugin.
Error initDevice(int32_t DeviceId);
/// Deinitialize a device within the plugin and release its resources.
Error deinitDevice(int32_t DeviceId);
/// Indicate whether data can be exchanged directly between two devices under
/// this same plugin. If this function returns true, it's safe to call the
/// GenericDeviceTy::exchangeData() function on the source device.
virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) {
return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId);
}
/// Top level interface to verify if a given ELF image can be executed on a
/// given target. Returns true if the \p Image is compatible with the plugin.
Expected<bool> checkELFImage(StringRef Image) const;
/// Return true if the \p Image can be compiled to run on the platform's
/// target architecture.
Expected<bool> checkBitcodeImage(StringRef Image) const;
  /// Indicate if an image is compatible with the plugin devices. Notice that
  /// this function may be called before actually initializing the devices, so
  /// we cannot move this function into GenericDeviceTy.
virtual Expected<bool> isELFCompatible(uint32_t DeviceID,
StringRef Image) const = 0;
protected:
/// Indicate whether a device id is valid.
bool isValidDeviceId(int32_t DeviceId) const {
return (DeviceId >= 0 && DeviceId < getNumDevices());
}
public:
// TODO: This plugin interface needs to be cleaned up.
/// Returns non-zero if the plugin runtime has been initialized.
int32_t is_initialized() const;
/// Returns non-zero if the \p Image is compatible with the plugin. This
/// function does not require the plugin to be initialized before use.
int32_t is_plugin_compatible(__tgt_device_image *Image);
/// Returns non-zero if the \p Image is compatible with the device.
int32_t is_device_compatible(int32_t DeviceId, __tgt_device_image *Image);
/// Returns non-zero if the plugin device has been initialized.
int32_t is_device_initialized(int32_t DeviceId) const;
/// Initialize the device inside of the plugin.
int32_t init_device(int32_t DeviceId);
/// Return the number of devices this plugin can support.
int32_t number_of_devices();
/// Returns non-zero if the data can be exchanged between the two devices.
int32_t is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId);
/// Initializes the record and replay mechanism inside the plugin.
int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
void *VAddr, bool isRecord, bool SaveOutput,
uint64_t &ReqPtrArgOffset);
/// Loads the associated binary into the plugin and returns a handle to it.
int32_t load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
__tgt_device_binary *Binary);
  /// Allocates memory that is accessible to the given device.
void *data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, int32_t Kind);
/// Deallocates memory on the given device.
int32_t data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind);
/// Locks / pins host memory using the plugin runtime.
int32_t data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
void **LockedPtr);
/// Unlocks / unpins host memory using the plugin runtime.
int32_t data_unlock(int32_t DeviceId, void *Ptr);
/// Notify the runtime about a new mapping that has been created outside.
int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size);
  /// Notify the runtime about a mapping that has been deleted.
int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr);
/// Copy data to the given device.
int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
int64_t Size);
/// Copy data to the given device asynchronously.
int32_t data_submit_async(int32_t DeviceId, void *TgtPtr, void *HstPtr,
int64_t Size, __tgt_async_info *AsyncInfoPtr);
/// Copy data from the given device.
int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
int64_t Size);
  /// Copy data from the given device asynchronously.
int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr, void *TgtPtr,
int64_t Size, __tgt_async_info *AsyncInfoPtr);
/// Exchange memory addresses between two devices.
int32_t data_exchange(int32_t SrcDeviceId, void *SrcPtr, int32_t DstDeviceId,
void *DstPtr, int64_t Size);
/// Exchange memory addresses between two devices asynchronously.
int32_t data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
int DstDeviceId, void *DstPtr, int64_t Size,
__tgt_async_info *AsyncInfo);
/// Begin executing a kernel on the given device.
int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs,
__tgt_async_info *AsyncInfoPtr);
  /// Synchronize an asynchronous queue with the plugin runtime.
int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
/// Query the current state of an asynchronous queue.
int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
  /// Prints information about the given device supported by the plugin.
void print_device_info(int32_t DeviceId);
/// Creates an event in the given plugin if supported.
int32_t create_event(int32_t DeviceId, void **EventPtr);
/// Records an event that has occurred.
int32_t record_event(int32_t DeviceId, void *EventPtr,
__tgt_async_info *AsyncInfoPtr);
/// Wait until an event has occurred.
int32_t wait_event(int32_t DeviceId, void *EventPtr,
__tgt_async_info *AsyncInfoPtr);
  /// Synchronize execution until an event is done.
int32_t sync_event(int32_t DeviceId, void *EventPtr);
/// Remove the event from the plugin.
int32_t destroy_event(int32_t DeviceId, void *EventPtr);
  /// Set the information level for the plugin's output.
void set_info_flag(uint32_t NewInfoLevel);
/// Creates an asynchronous queue for the given plugin.
int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr);
/// Creates device information to be used for diagnostics.
int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo,
const char **ErrStr);
/// Sets the offset into the devices for use by OMPT.
int32_t set_device_identifier(int32_t UserId, int32_t DeviceId);
  /// Returns non-zero if the plugin can support automatic zero copy.
int32_t use_auto_zero_copy(int32_t DeviceId);
/// Look up a global symbol in the given binary.
int32_t get_global(__tgt_device_binary Binary, uint64_t Size,
const char *Name, void **DevicePtr);
/// Look up a kernel function in the given binary.
int32_t get_function(__tgt_device_binary Binary, const char *Name,
void **KernelPtr);
private:
  /// Indicates if the plugin runtime has been fully initialized.
bool Initialized = false;
/// Number of devices available for the plugin.
int32_t NumDevices = 0;
/// Map of plugin device identifiers to the user device identifier.
llvm::DenseMap<int32_t, int32_t> UserDeviceIds;
/// Array of pointers to the devices. Initially, they are all set to nullptr.
/// Once a device is initialized, the pointer is stored in the position given
/// by its device id. A position with nullptr means that the corresponding
/// device was not initialized yet.
llvm::SmallVector<GenericDeviceTy *> Devices;
/// Pointer to the global handler for this plugin.
GenericGlobalHandlerTy *GlobalHandler;
/// Internal allocator for different structures.
BumpPtrAllocator Allocator;
/// The JIT engine shared by all devices connected to this plugin.
JITEngine JIT;
/// The interface between the plugin and the GPU for host services.
RPCServerTy *RPCServer;
  /// The interface for record-and-replay support in the plugin.
RecordReplayTy *RecordReplay;
};
namespace Plugin {
/// Create a success error. This is the same as calling Error::success(), but
/// it is recommended to use this one for consistency with Plugin::error() and
/// Plugin::check().
static inline Error success() { return Error::success(); }
/// Create a string error.
template <typename... ArgsTy>
static Error error(const char *ErrFmt, ArgsTy... Args) {
return createStringError(inconvertibleErrorCode(), ErrFmt, Args...);
}
/// Check the plugin-specific error code and return an error or success
/// accordingly. In case of an error, create a string error with the error
/// description. The ErrFmt should follow the format:
/// "Error in <function name>[<optional info>]: %s"
/// The last format specifier "%s" is mandatory and will be used to place the
/// error code's description. Notice this function should be only called from
/// the plugin-specific code.
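/// For example, a CUDA-like plugin might write (an illustrative sketch; the
/// cuCtxSetCurrent call is CUDA driver API):
///
/// \code
///   if (auto Err = Plugin::check(cuCtxSetCurrent(Context),
///                                "Error in cuCtxSetCurrent: %s"))
///     return Err;
/// \endcode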
/// TODO: Refactor this, must be defined individually by each plugin.
template <typename... ArgsTy>
static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args);
} // namespace Plugin
/// Auxiliary interface class for GenericDeviceResourceManagerTy. This class
/// acts as a reference to a device resource, such as a stream, and requires
/// some basic functions to be implemented. The derived class should define an
/// empty constructor that creates an empty and invalid resource reference. Do
/// not create a new resource in the ctor, but in the create() function
/// instead.
///
/// The derived class should also define the type HandleTy as the underlying
/// resource handle type. For instance, in a CUDA stream it would be:
/// using HandleTy = CUstream;
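///
/// A stream reference for a CUDA-like plugin could look as follows (an
/// illustrative sketch; the cuStream* functions are CUDA driver API):
///
/// \code
///   struct MyStreamRef : public GenericDeviceResourceRef {
///     using HandleTy = CUstream;
///     // Create an empty and invalid reference; the resource is created in
///     // create(), not here.
///     MyStreamRef() : Stream(nullptr) {}
///     MyStreamRef(HandleTy Stream) : Stream(Stream) {}
///     Error create(GenericDeviceTy &Device) override {
///       CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
///       return Plugin::check(Res, "Error in cuStreamCreate: %s");
///     }
///     Error destroy(GenericDeviceTy &Device) override {
///       CUresult Res = cuStreamDestroy(Stream);
///       return Plugin::check(Res, "Error in cuStreamDestroy: %s");
///     }
///     operator HandleTy() const { return Stream; }
///   private:
///     HandleTy Stream;
///   };
/// \endcode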
struct GenericDeviceResourceRef {
  /// Create a new resource and store a reference to it.
virtual Error create(GenericDeviceTy &Device) = 0;
  /// Destroy and release the resources pointed to by the reference.
virtual Error destroy(GenericDeviceTy &Device) = 0;
protected:
~GenericDeviceResourceRef() = default;
};
/// Class that implements a resource pool belonging to a device. This class
/// operates with references to the actual resources. These references must
/// derive from the GenericDeviceResourceRef class and implement the create
/// and destroy virtual functions.
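///
/// For instance, a plugin could manage streams with it (a usage sketch based
/// on the hypothetical MyStreamRef above):
///
/// \code
///   GenericDeviceResourceManagerTy<MyStreamRef> StreamManager(Device);
///   if (auto Err = StreamManager.init(/*InitialSize=*/32))
///     return Err;
///   CUstream Stream;
///   if (auto Err = StreamManager.getResource(Stream))
///     return Err;
///   // ... use the stream ...
///   if (auto Err = StreamManager.returnResource(Stream))
///     return Err;
///   // deinit() must be called before the manager is destroyed.
/// \endcode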
template <typename ResourceRef> class GenericDeviceResourceManagerTy {
using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
using ResourceHandleTy = typename ResourceRef::HandleTy;
public:
/// Create an empty resource pool for a specific device.
GenericDeviceResourceManagerTy(GenericDeviceTy &Device)
: Device(Device), NextAvailable(0) {}
/// Destroy the resource pool. At this point, the deinit() function should
/// already have been executed so the resource pool should be empty.
virtual ~GenericDeviceResourceManagerTy() {
assert(ResourcePool.empty() && "Resource pool not empty");
}
/// Initialize the resource pool.
Error init(uint32_t InitialSize) {
assert(ResourcePool.empty() && "Resource pool already initialized");
return ResourcePoolTy::resizeResourcePool(InitialSize);
}
/// Deinitialize the resource pool and delete all resources. This function
/// must be called before the destructor.
virtual Error deinit() {
if (NextAvailable)
DP("Missing %d resources to be returned\n", NextAvailable);
    // TODO: This works around a bug in libomptarget that would otherwise make
    // the plugins fail. Some resources may not have been returned; do not
    // destroy those.
if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable))
return Err;
ResourcePool.clear();
return Plugin::success();
}
/// Get a resource from the pool or create new ones. If the function
/// succeeds, the handle to the resource is saved in \p Handle.
virtual Error getResource(ResourceHandleTy &Handle) {
// Get a resource with an empty resource processor.
return getResourcesImpl(1, &Handle,
[](ResourceHandleTy) { return Plugin::success(); });
}
/// Get multiple resources from the pool or create new ones. If the function
/// succeeds, the handles to the resources are saved in \p Handles.
virtual Error getResources(uint32_t Num, ResourceHandleTy *Handles) {
// Get resources with an empty resource processor.
return getResourcesImpl(Num, Handles,
[](ResourceHandleTy) { return Plugin::success(); });
}
  /// Return a resource to the pool.
virtual Error returnResource(ResourceHandleTy Handle) {
// Return a resource with an empty resource processor.
return returnResourceImpl(
Handle, [](ResourceHandleTy) { return Plugin::success(); });
}
protected:
/// Get multiple resources from the pool or create new ones. If the function
/// succeeds, the handles to the resources are saved in \p Handles. Also
/// process each of the obtained resources with \p Processor.
template <typename FuncTy>
Error getResourcesImpl(uint32_t Num, ResourceHandleTy *Handles,
FuncTy Processor) {
const std::lock_guard<std::mutex> Lock(Mutex);
assert(NextAvailable <= ResourcePool.size() &&
"Resource pool is corrupted");
if (NextAvailable + Num > ResourcePool.size())
// Double the resource pool or resize it to provide the requested ones.
if (auto Err = ResourcePoolTy::resizeResourcePool(
std::max(NextAvailable * 2, NextAvailable + Num)))
return Err;
// Save the handles in the output array parameter.
for (uint32_t r = 0; r < Num; ++r)
Handles[r] = ResourcePool[NextAvailable + r];
// Process all obtained resources.
for (uint32_t r = 0; r < Num; ++r)
if (auto Err = Processor(Handles[r]))
return Err;
NextAvailable += Num;
return Plugin::success();
}
  /// Return a resource to the pool and process it with \p Processor.
template <typename FuncTy>
Error returnResourceImpl(ResourceHandleTy Handle, FuncTy Processor) {
const std::lock_guard<std::mutex> Lock(Mutex);
// Process the returned resource.
if (auto Err = Processor(Handle))
return Err;
assert(NextAvailable > 0 && "Resource pool is corrupted");
ResourcePool[--NextAvailable] = Handle;
return Plugin::success();
}
protected:
/// The resources between \p OldSize and \p NewSize need to be created or
/// destroyed. The mutex is locked when this function is called.
Error resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) {
assert(OldSize != NewSize && "Resizing to the same size");
if (auto Err = Device.setContext())
return Err;
if (OldSize < NewSize) {
// Create new resources.
for (uint32_t I = OldSize; I < NewSize; ++I) {
if (auto Err = ResourcePool[I].create(Device))
return Err;
}
} else {
// Destroy the obsolete resources.
for (uint32_t I = NewSize; I < OldSize; ++I) {
if (auto Err = ResourcePool[I].destroy(Device))
return Err;
}
}
return Plugin::success();
}
/// Increase or decrease the number of resources. This function should
/// be called with the mutex acquired.
Error resizeResourcePool(uint32_t NewSize) {
uint32_t OldSize = ResourcePool.size();
// Nothing to do.
if (OldSize == NewSize)
return Plugin::success();
if (OldSize < NewSize) {
// Increase the number of resources.
ResourcePool.resize(NewSize);
return ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
}
// Decrease the number of resources otherwise.
auto Err = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
ResourcePool.resize(NewSize);
return Err;
}
  /// The device to which the resources belong.
GenericDeviceTy &Device;
/// Mutex for the resource pool.
std::mutex Mutex;
/// The next available resource in the pool.
uint32_t NextAvailable;
/// The actual resource pool.
std::deque<ResourceRef> ResourcePool;
};
/// A static check on whether or not we support RPC in libomptarget.
bool libomptargetSupportsRPC();
} // namespace plugin
} // namespace target
} // namespace omp
} // namespace llvm
#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H