chromium/device/vr/openxr/android/openxr_depth_sensor_android.cc

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/351564777): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

#include "device/vr/openxr/android/openxr_depth_sensor_android.h"

#include <array>
#include <cmath>
#include <memory>
#include <set>
#include <string_view>
#include <vector>

#include "base/check_op.h"
#include "base/containers/contains.h"
#include "base/containers/fixed_flat_map.h"
#include "base/containers/flat_set.h"
#include "base/containers/span.h"
#include "base/logging.h"
#include "base/no_destructor.h"
#include "base/numerics/checked_math.h"
#include "base/numerics/safe_conversions.h"
#include "base/ranges/algorithm.h"
#include "base/time/time.h"
#include "device/vr/openxr/openxr_extension_helper.h"
#include "device/vr/openxr/openxr_util.h"
#include "device/vr/openxr/openxr_view_configuration.h"
#include "device/vr/public/mojom/vr_service.mojom.h"
#include "device/vr/public/mojom/xr_session.mojom.h"
#include "mojo/public/cpp/base/big_buffer.h"
#include "third_party/openxr/dev/xr_android.h"
#include "third_party/openxr/src/include/openxr/openxr.h"
#include "ui/gfx/geometry/point3_f.h"
#include "ui/gfx/geometry/point_f.h"
#include "ui/gfx/geometry/size.h"
#include "ui/gfx/geometry/transform.h"

namespace device {

namespace {
// The spec essentially requires that the depth views line up with the
// XR_VIEW_CONFIGURATION_TYPE_PRIMARY_STEREO type, off of which we define these
// constants. These asserts provide an extra layer of safety to ensure that
// this class is kept up-to-date in case we migrate types or anything else
// changes.
static_assert(kNumPrimaryViews == 2);
static_assert(kLeftView == 0);
static_assert(kRightView == 1);

// Returns the index in the |XrDepthAcquireResultANDROID| |views| member for the
// requested eye per the specification.
size_t GetDepthViewIndex(const mojom::XREye& eye) {
  CHECK(eye == mojom::XREye::kLeft || eye == mojom::XREye::kRight);
  if (eye == mojom::XREye::kLeft) {
    return kLeftView;
  }

  return kRightView;
}

// The depth image from OpenXR (|rawDepthImage|) consists of two sets of
// pixels, one after the other. The first num_pixels floats in the image are
// the left eye, and the pixels after that are the right eye.
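// For example, at 80x80 resolution (num_pixels = 6400 per eye), floats
// [0, 6400) are the left eye and floats [6400, 12800) are the right eye.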
size_t GetDepthImageOffset(const mojom::XREye& eye, size_t pixels_per_image) {
  CHECK(eye == mojom::XREye::kLeft || eye == mojom::XREye::kRight);
  if (eye == mojom::XREye::kRight) {
    return pixels_per_image;
  }

  return 0;
}

// A map of the resolutions that we support to a gfx::Size, since we ultimately
// need to send that across mojom. This also allows us to properly handle
// whichever size we get back from the API.
constexpr auto kResolutionSizeMap =
    base::MakeFixedFlatMap<XrDepthCameraResolutionANDROID, gfx::Size>(
        {{XR_DEPTH_CAMERA_RESOLUTION_80x80_ANDROID, {80, 80}},
         {XR_DEPTH_CAMERA_RESOLUTION_160x160_ANDROID, {160, 160}},
         {XR_DEPTH_CAMERA_RESOLUTION_320x320_ANDROID, {320, 320}}});

constexpr std::array<XrDepthCameraResolutionANDROID, 3> kResolutionPreferences{
    XR_DEPTH_CAMERA_RESOLUTION_320x320_ANDROID,
    XR_DEPTH_CAMERA_RESOLUTION_160x160_ANDROID,
    XR_DEPTH_CAMERA_RESOLUTION_80x80_ANDROID};
static_assert(kResolutionSizeMap.size() == kResolutionPreferences.size(),
              "Need to have a corresponding resolution size for every "
              "preferred resolution that we can support");

constexpr size_t GetByteSize(const mojom::XRDepthDataFormat& format) {
  switch (format) {
    case mojom::XRDepthDataFormat::kLuminanceAlpha:
    case mojom::XRDepthDataFormat::kUnsignedShort:
      return sizeof(uint16_t);
    case mojom::XRDepthDataFormat::kFloat32:
      return sizeof(float);
  }
}
static_assert(sizeof(uint16_t) ==
              GetByteSize(mojom::XRDepthDataFormat::kLuminanceAlpha));
static_assert(sizeof(uint16_t) ==
              GetByteSize(mojom::XRDepthDataFormat::kUnsignedShort));

// Essentially this returns the projection matrix for a given camera. Screen
// coordinates appear to need to be in clip space, i.e. [-1, 1]. "Camera space"
// conforms to the expectations of the other transforms used throughout the
// runtime and refers to a space with the camera location as the origin.
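// For reference, in conventional row-major notation the matrix built below is:
//   | x_scale     0      x_offset                 0          |
//   |    0     y_scale   y_offset                 0          |
//   |    0        0    (near+far)*inv_nf  2*far*near*inv_nf  |
//   |    0        0        -1                     0          |
// where x_offset = -((left_tan - right_tan) * x_scale * 0.5) and
// y_offset = ((up_tan - down_tan) * y_scale * 0.5).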
gfx::Transform GetScreenFromCamera(const mojom::VRFieldOfViewPtr& fov) {
  constexpr float near_depth = 0.0001f;
  constexpr float far_depth = 10000.0f;
  constexpr double kDegToRad = M_PI / 180.0;

  float up_rad = fov->up_degrees * kDegToRad;
  float down_rad = fov->down_degrees * kDegToRad;
  float left_rad = fov->left_degrees * kDegToRad;
  float right_rad = fov->right_degrees * kDegToRad;

  float up_tan = tanf(up_rad);
  float down_tan = tanf(down_rad);
  float left_tan = tanf(left_rad);
  float right_tan = tanf(right_rad);
  float x_scale = 2.0f / (left_tan + right_tan);
  float y_scale = 2.0f / (up_tan + down_tan);
  float inv_nf = 1.0f / (near_depth - far_depth);

  return gfx::Transform::ColMajor(
      x_scale, 0.0f, 0.0f, 0.0f, 0.0f, y_scale, 0.0f, 0.0f,
      -((left_tan - right_tan) * x_scale * 0.5),
      ((up_tan - down_tan) * y_scale * 0.5), (near_depth + far_depth) * inv_nf,
      -1.0f, 0.0f, 0.0f, (2.0f * far_depth * near_depth) * inv_nf, 0.0f);
}

// Converts an array coordinate value [0,size) to a texture coordinate [0, 1].
inline float ToTexCoord(float val, float size) {
  return (val + 0.5f) / size;
}

// Converts a texture coordinate [0,1] to "clip space" [-1, 1]. This is a
// necessary conversion when transforming a point through a projection matrix
// (screen_from_foo or foo_from_screen) in our normal terminology.
inline float ToClipSpace(float val) {
  return 2.0f * val - 1.0f;
}

// Converts from "clip space" [-1, 1] to texture coordinate space [0,1]. This is
// a necessary conversion to map a point transformed through a projection matrix
// back to something that can be used to sample a texture.
inline float FromClipSpace(float val) {
  return (val + 1.0f) / 2.0f;
}
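
// Worked example: for an 80-pixel-wide image, column 10 samples at texture
// coordinate (10 + 0.5) / 80 = 0.13125; ToClipSpace maps that to
// 2 * 0.13125 - 1 = -0.7375, and FromClipSpace maps it back to 0.13125.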

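// Returns the index of (col, row) within a row-major buffer whose rows are
// |row_size| entries wide.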
inline size_t BufferLocation(size_t col, size_t row, size_t row_size) {
  return row * row_size + col;
}

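// Copies the raw bytes of |val| into the first sizeof(T) bytes of |output|.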
template <typename T>
inline void WriteToSpanStart(base::span<uint8_t> output, T val) {
  output.first<sizeof(T)>().copy_from(base::byte_span_from_ref(val));
}

// Helper function to copy depth data on the CPU. This expects to receive the
// raw array of data received from the OpenXr API and will convert it to an
// array of the same size. This function is responsible for mapping a point from
// the "pixel" it would occupy in the output buffer to sample the corresponding
// point in the depth buffer by applying all required transforms. After the
// float value is sampled, it will apply |conversion_fn| to map from float to
// |T| to assign it to the output array.
template <typename T, typename FunctionType>
void CopyDepthData(base::span<const float> input,
                   base::span<uint8_t> output,
                   gfx::Size image_size,
                   XrDepthViewANDROID depth_view,
                   const mojom::XRViewPtr& view,
                   FunctionType&& conversion_fn) {
  // We should've handled an invalid image_size before getting to this point.
  size_t num_pixels;
  CHECK(image_size.GetCheckedArea().AssignIfValid(&num_pixels));
  CHECK_EQ(input.size(), num_pixels);
  CHECK_EQ(output.size_bytes(), num_pixels * sizeof(T));

  // Extract width/height for readability (and to use size_t).
  const size_t width = image_size.width();
  const size_t height = image_size.height();
  const gfx::Transform view_from_eye_screen =
      GetScreenFromCamera(view->field_of_view).GetCheckedInverse();
  const gfx::Transform depth_screen_from_depth =
      GetScreenFromCamera(XrFovToMojomFov(depth_view.fov));

  // Depth pose is initially local_from_depth (based on passing local space
  // into the object upon creation).
  // TODO(crbug.com/40684534): Create local_from_mojom transformations.
  const gfx::Transform local_from_mojom;
  const auto depth_from_mojom =
      XrPoseToGfxTransform(depth_view.pose).GetCheckedInverse() *
      local_from_mojom;
  const auto& mojom_from_view = view->mojo_from_view;
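  // Read right-to-left: a point in the eye's clip space is unprojected back
  // into camera ("view") space, carried through mojom space into the depth
  // camera's space, and finally projected into the depth camera's clip space.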
  const gfx::Transform depth_screen_from_eye_screen =
      depth_screen_from_depth * depth_from_mojom * mojom_from_view *
      view_from_eye_screen;
  for (size_t y = 0; y < height; y++) {
    for (size_t x = 0; x < width; x++) {
      // Assign a z value of 1 to convert the 2D Cartesian (screen) coordinates
      // to homogeneous coordinates.
      const gfx::Point3F eye_screen_clip_coord{
          ToClipSpace(ToTexCoord(x, width)), ToClipSpace(ToTexCoord(y, height)),
          1};
      const gfx::Point3F depth_screen_clip_coord =
          depth_screen_from_eye_screen.MapPoint(eye_screen_clip_coord);

      const gfx::PointF depth_screen_texture_coord(
          FromClipSpace(depth_screen_clip_coord.x()),
          FromClipSpace(depth_screen_clip_coord.y()));

      // If x or y is less than 0 it's out of bounds and we should ignore it.
      // We'll convert back to whole buffer coordinates before checking the
      // width and height.
      if (depth_screen_texture_coord.x() < 0 ||
          depth_screen_texture_coord.y() < 0) {
        // We need to ensure that the whole span gets initialized.
        WriteToSpanStart(output, T());
        // Advance the span so that the start is the next uninitialized spot.
        output = output.subspan(sizeof(T));
        continue;
      }

      const gfx::PointF depth_screen_buffer_coord =
          gfx::ScalePoint(depth_screen_texture_coord, width, height);

      // We've already verified that these values can't be negative, so we can
      // safely convert to size_t now.
      // Anything from N.0 to N.999... should be treated as belonging to the
      // pixel originating at N. The earlier addition of 0.5 (in ToTexCoord)
      // ensured accuracy by sampling at the center of the pixel, so it would
      // be inappropriate to subtract the 0.5 again; doing so might make us
      // sample a different pixel than the one containing our centerpoint. The
      // static_cast from float to size_t is therefore equivalent to
      // truncation, leaving us with N.
      const size_t depth_y = static_cast<size_t>(depth_screen_buffer_coord.y());
      const size_t depth_x = static_cast<size_t>(depth_screen_buffer_coord.x());

      // If the new point is out of bounds, ignore it.
      // Note that we do this part of the bounds check after the conversion from
      // float to size_t to ensure accuracy of the conversion.
      if (depth_x >= width || depth_y >= height) {
        // We need to ensure that the whole span gets initialized.
        WriteToSpanStart(output, T());
        // Advance the span so that the start is the next uninitialized spot.
        output = output.subspan(sizeof(T));
        continue;
      }

      float depth_value = input[BufferLocation(depth_x, depth_y, width)];

      // The successive `subspan` calls keep advancing `output` through the
      // span's underlying data so that the first sizeof(T) bytes are always
      // the next unwritten bytes, corresponding to our current x/y "spot".
      WriteToSpanStart(output, conversion_fn(depth_value));

      // Advance the span so that the start is the next uninitialized spot.
      output = output.subspan(sizeof(T));
    }
  }

  // Since we've been advancing the span the whole time and already verified
  // that the originally passed in output span is the same size as the input, we
  // should now be at the end of the span we received, which means that output
  // should be empty.
  CHECK(output.empty());
}
}  // namespace

OpenXrDepthSensorAndroid::OpenXrDepthSensorAndroid(
    const OpenXrExtensionHelper& extension_helper,
    XrSession session,
    XrSpace mojo_space,
    const mojom::XRDepthOptions& depth_options)
    : extension_helper_(extension_helper),
      session_(session),
      mojo_space_(mojo_space) {
  DVLOG(1) << __func__;
  // We only support CPU-optimized depth, so we can support depth only if no
  // preferences were specified or if cpu-optimized was among them.
  const auto& usage_preferences = depth_options.usage_preferences;
  const bool can_support_depth =
      usage_preferences.empty() ||
      base::Contains(usage_preferences, mojom::XRDepthUsage::kCPUOptimized);

  if (can_support_depth) {
    depth_config_ = mojom::XRDepthConfig::New();
    depth_config_->depth_usage = mojom::XRDepthUsage::kCPUOptimized;

    // We can support all of the current data formats, so just grab the first if
    // they were specified, and if none were, use float32 (our native type).
    static_assert(static_cast<int>(mojom::XRDepthDataFormat::kMaxValue) == 3);
    if (!depth_options.data_format_preferences.empty()) {
      depth_config_->depth_data_format =
          depth_options.data_format_preferences[0];
    } else {
      depth_config_->depth_data_format = mojom::XRDepthDataFormat::kFloat32;
    }
  } else {
    DVLOG(1) << __func__ << " Cannot support depth";
  }
}

OpenXrDepthSensorAndroid::~OpenXrDepthSensorAndroid() {
  DVLOG(1) << __func__;
  if (swapchain_ != XR_NULL_HANDLE) {
    // In the (likely) event that the session has been destroyed before us,
    // this will fail, so just ignore the returned result.
    extension_helper_->ExtensionMethods().xrDestroyDepthSwapchainANDROID(
        swapchain_);

    swapchain_ = XR_NULL_HANDLE;
  }

  depth_images_.clear();
}

XrResult OpenXrDepthSensorAndroid::Initialize() {
  DVLOG(1) << __func__;
  if (initialized_) {
    return XR_SUCCESS;
  }

  if (!depth_config_) {
    return XR_ERROR_FEATURE_UNSUPPORTED;
  }

  uint32_t supported_resolutions_count = 0;
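  // Standard OpenXR two-call idiom: the first call (with capacity 0) queries
  // the required count, and the second call fills the allocated buffer.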
  RETURN_IF_XR_FAILED(
      extension_helper_->ExtensionMethods().xrEnumerateDepthResolutionsANDROID(
          session_, 0, &supported_resolutions_count, nullptr));

  std::vector<XrDepthCameraResolutionANDROID> supported_resolutions(
      supported_resolutions_count, XR_DEPTH_CAMERA_RESOLUTION_MAX_ENUM_ANDROID);
  RETURN_IF_XR_FAILED(
      extension_helper_->ExtensionMethods().xrEnumerateDepthResolutionsANDROID(
          session_, supported_resolutions_count, &supported_resolutions_count,
          supported_resolutions.data()));

  // Realistically this should never happen, but since it theoretically can,
  // it shouldn't be a CHECK.
  if (supported_resolutions_count != supported_resolutions.size()) {
    LOG(ERROR) << __func__
               << " Supported resolution size changed during creation";
    return XR_ERROR_INITIALIZATION_FAILED;
  }

  auto it = base::ranges::find_if(
      kResolutionPreferences,
      [&supported_resolutions](XrDepthCameraResolutionANDROID resolution) {
        return base::Contains(supported_resolutions, resolution);
      });

  if (it == kResolutionPreferences.end()) {
    DLOG(ERROR) << __func__ << " No Supported Depth Resolution";
    return XR_ERROR_INITIALIZATION_FAILED;
  }

  depth_camera_resolution_ = *it;

  XrDepthSwapchainCreateInfoANDROID swapchain_create_info{
      XR_TYPE_DEPTH_SWAPCHAIN_CREATE_INFO_ANDROID};
  swapchain_create_info.resolution = depth_camera_resolution_;
  swapchain_create_info.createFlags =
      XR_DEPTH_SWAPCHAIN_CREATE_RAW_DEPTH_IMAGE_BIT_ANDROID;
  RETURN_IF_XR_FAILED(
      extension_helper_->ExtensionMethods().xrCreateDepthSwapchainANDROID(
          session_, &swapchain_create_info, &swapchain_));

  uint32_t image_count_output = 0;
  RETURN_IF_XR_FAILED(extension_helper_->ExtensionMethods()
                          .xrEnumerateDepthSwapchainImagesANDROID(
                              swapchain_, 0, &image_count_output, nullptr));

  depth_images_.resize(image_count_output);
  for (auto& image : depth_images_) {
    image.type = XR_TYPE_DEPTH_SWAPCHAIN_IMAGE_ANDROID;
  }

  RETURN_IF_XR_FAILED(extension_helper_->ExtensionMethods()
                          .xrEnumerateDepthSwapchainImagesANDROID(
                              swapchain_, depth_images_.size(),
                              &image_count_output, depth_images_.data()));

  // Realistically this should never happen, but since it theoretically can,
  // it shouldn't be a CHECK.
  if (image_count_output != depth_images_.size()) {
    LOG(ERROR) << __func__ << " Swapchain size changed during creation";
    return XR_ERROR_INITIALIZATION_FAILED;
  }

  initialized_ = true;
  return XR_SUCCESS;
}

mojom::XRDepthConfigPtr OpenXrDepthSensorAndroid::GetDepthConfig() {
  return depth_config_ ? depth_config_.Clone() : nullptr;
}

void OpenXrDepthSensorAndroid::PopulateDepthData(
    XrTime frame_time,
    const std::vector<mojom::XRViewPtr>& views) {
  DVLOG(3) << __func__;
  // We could fail to be initialized if depth isn't actually supported.
  if (!initialized_) {
    DVLOG(3) << __func__ << " Not initialized";
    return;
  }

  if (views.size() < kNumPrimaryViews ||
      views[kLeftView]->eye != mojom::XREye::kLeft ||
      views[kRightView]->eye != mojom::XREye::kRight) {
    DLOG(ERROR) << __func__ << " Incorrect eye configuration";
    return;
  }

  XrDepthAcquireInfoANDROID acquire_info = {XR_TYPE_DEPTH_ACQUIRE_INFO_ANDROID};
  acquire_info.space = mojo_space_;
  acquire_info.displayTime = frame_time;

  XrDepthAcquireResultANDROID acquire_result = {
      XR_TYPE_DEPTH_ACQUIRE_RESULT_ANDROID};
  XrResult result = extension_helper_->ExtensionMethods()
                        .xrAcquireDepthSwapchainImagesANDROID(
                            swapchain_, &acquire_info, &acquire_result);
  if (XR_FAILED(result)) {
    DLOG(ERROR) << __func__
                << " Failed to acquire depth swapchain images: " << result;
    return;
  }

  if (acquire_result.acquiredIndex >= depth_images_.size()) {
    DLOG(ERROR) << __func__ << " Acquired Index was out of bounds: "
                << acquire_result.acquiredIndex << " vs "
                << depth_images_.size();
    return;
  }

  for (size_t i = 0; i < kNumPrimaryViews; i++) {
    views[i]->depth_data = GetDepthDataForEye(acquire_result, views[i]);
  }
}

mojom::XRDepthDataPtr OpenXrDepthSensorAndroid::GetDepthDataForEye(
    const XrDepthAcquireResultANDROID& acquire_result,
    const mojom::XRViewPtr& view) {
  const auto& eye = view->eye;
  DVLOG(3) << __func__ << " eye: " << eye;
  CHECK(eye == mojom::XREye::kLeft || eye == mojom::XREye::kRight);
  auto& depth_image = depth_images_[acquire_result.acquiredIndex];

  const auto& image_size = kResolutionSizeMap.at(depth_camera_resolution_);
  size_t num_pixels;
  if (!image_size.GetCheckedArea().AssignIfValid(&num_pixels)) {
    DLOG(ERROR) << __func__ << " Image size overflowed";
    return nullptr;
  }

  const auto& data_format = depth_config_->depth_data_format;
  size_t buffer_size;
  if (!base::CheckMul<size_t>(GetByteSize(data_format), num_pixels)
           .AssignIfValid(&buffer_size)) {
    DLOG(ERROR) << __func__ << " Buffer size overflowed";
    return nullptr;
  }

  XrDepthViewANDROID depth_view = acquire_result.views[GetDepthViewIndex(eye)];
  size_t pixel_offset = GetDepthImageOffset(eye, num_pixels);
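  // The pointer arithmetic over |rawDepthImage| below is the unsafe-buffers
  // usage permitted by the pragma at the top of this file; the span is bounded
  // to num_pixels, which was validated against the image size above.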
  base::span<const float> raw_depth_image =
      base::span(depth_image.rawDepthImage + pixel_offset, num_pixels);

  mojom::XRDepthDataUpdatedPtr result = mojom::XRDepthDataUpdated::New();
  mojo_base::BigBuffer pixels(buffer_size);
  switch (data_format) {
    case mojom::XRDepthDataFormat::kFloat32:
      // Results are already in meters.
      CHECK(GetByteSize(data_format) == sizeof(float));
      CopyDepthData<float>(raw_depth_image, pixels, image_size, depth_view,
                           view, [](float val) { return val; });
      break;
    // Luminance-alpha and unsigned-short values both need to be converted.
    case mojom::XRDepthDataFormat::kLuminanceAlpha:
    case mojom::XRDepthDataFormat::kUnsignedShort:
      // We'll be converting to millimeters.
      result->raw_value_to_meters = 1 / 1000.0f;

      CHECK(GetByteSize(data_format) == sizeof(uint16_t));
      CopyDepthData<uint16_t>(
          raw_depth_image, pixels, image_size, depth_view, view, [](float val) {
            // val is in meters, so convert to mm to avoid losing precision.
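            // saturated_cast clamps out-of-range results: depths beyond
            // ~65.5m become 65535 and negative values become 0.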
            return base::saturated_cast<uint16_t>(std::nearbyint(val * 1000));
          });
      break;
  }

  result->pixel_data = std::move(pixels);
  result->size = image_size;
  return mojom::XRDepthData::NewUpdatedDepthData(std::move(result));
}

OpenXrDepthSensorAndroidFactory::OpenXrDepthSensorAndroidFactory() = default;
OpenXrDepthSensorAndroidFactory::~OpenXrDepthSensorAndroidFactory() = default;

const base::flat_set<std::string_view>&
OpenXrDepthSensorAndroidFactory::GetRequestedExtensions() const {
  static base::NoDestructor<base::flat_set<std::string_view>> kExtensions(
      {XR_ANDROID_DEPTH_TEXTURE_EXTENSION_NAME});
  return *kExtensions;
}

std::set<device::mojom::XRSessionFeature>
OpenXrDepthSensorAndroidFactory::GetSupportedFeatures(
    const OpenXrExtensionEnumeration* extension_enum) const {
  if (!IsEnabled(extension_enum)) {
    return {};
  }

  return {device::mojom::XRSessionFeature::DEPTH};
}

void OpenXrDepthSensorAndroidFactory::ProcessSystemProperties(
    const OpenXrExtensionEnumeration* extension_enum,
    XrInstance instance,
    XrSystemId system) {
  XrSystemDepthTrackingPropertiesANDROID depth_properties{
      XR_TYPE_SYSTEM_DEPTH_TRACKING_PROPERTIES_ANDROID};

  XrSystemProperties system_properties{XR_TYPE_SYSTEM_PROPERTIES};
  system_properties.next = &depth_properties;
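  // xrGetSystemProperties walks the |next| chain, so the runtime fills in
  // |depth_properties| alongside the base system properties.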

  bool depth_supported = false;
  XrResult result = xrGetSystemProperties(instance, system, &system_properties);
  if (XR_SUCCEEDED(result)) {
    depth_supported = depth_properties.supportsDepthTracking;
  }

  SetSystemPropertiesSupport(depth_supported);
}

std::unique_ptr<OpenXrDepthSensor>
OpenXrDepthSensorAndroidFactory::CreateDepthSensor(
    const OpenXrExtensionHelper& extension_helper,
    XrSession session,
    XrSpace mojo_space,
    const mojom::XRDepthOptions& depth_options) const {
  bool is_supported = IsEnabled(extension_helper.ExtensionEnumeration());
  DVLOG(2) << __func__ << " is_supported=" << is_supported;
  if (is_supported) {
    return std::make_unique<OpenXrDepthSensorAndroid>(
        extension_helper, session, mojo_space, depth_options);
  }

  return nullptr;
}

}  // namespace device