chromium/native_client_sdk/src/examples/demo/life_simd/life.cc

// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

#include <ppapi/c/ppb_input_event.h>
#include <ppapi/cpp/fullscreen.h>
#include <ppapi/cpp/input_event.h>
#include <ppapi/cpp/instance_handle.h>
#include <ppapi/cpp/var.h>
#include <ppapi/cpp/var_array.h>
#include <ppapi/cpp/var_array_buffer.h>
#include <ppapi/cpp/var_dictionary.h>

#include "ppapi_simple/ps.h"
#include "ppapi_simple/ps_context_2d.h"
#include "ppapi_simple/ps_event.h"
#include "ppapi_simple/ps_instance.h"
#include "ppapi_simple/ps_interface.h"
#include "sdk_util/macros.h"
#include "sdk_util/thread_pool.h"

using namespace sdk_util;  // For sdk_util::ThreadPool

namespace {

#define INLINE inline __attribute__((always_inline))

// BGRA helper macro, for constructing a pixel for a BGRA buffer.
#define MakeBGRA(b, g, r, a)  \
  (((a) << 24) | ((r) << 16) | ((g) << 8) | (b))

const int kFramesToBenchmark = 100;
const int kCellAlignment = 0x10;

// 128 bit vector types
typedef uint8_t u8x16_t __attribute__ ((vector_size (16)));

// Helper function to broadcast x across 16 element vector.
INLINE u8x16_t broadcast(uint8_t x) {
  u8x16_t r = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
  return r;
}

// Convert a count value into a live (green) or dead color value.
const uint32_t kNeighborColors[] = {
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0xFF, 0x00, 0xFF),
    MakeBGRA(0x00, 0xFF, 0x00, 0xFF),
    MakeBGRA(0x00, 0xFF, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
    MakeBGRA(0x00, 0x00, 0x00, 0xFF),
};

// These represent the new health value of a cell based on its neighboring
// values.  The health is binary: either alive or dead.
const uint8_t kIsAlive[] = {
      0, 0, 0, 0, 0, 1, 1, 1, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0
};

// Timer helper for benchmarking.  Returns seconds elapsed since program start,
// as a double.
timeval start_tv;
int start_tv_retv = gettimeofday(&start_tv, NULL);

inline double getseconds() {
  const double usec_to_sec = 0.000001;
  timeval tv;
  if ((0 == start_tv_retv) && (0 == gettimeofday(&tv, NULL)))
    return (tv.tv_sec - start_tv.tv_sec) + tv.tv_usec * usec_to_sec;
  return 0.0;
}
} // namespace


class Life {
 public:
  Life();
  virtual ~Life();
  // Runs a tick of the simulations, update 2D output.
  void Update();
  // Handle event from user, or message from JS.
  void HandleEvent(PSEvent* ps_event);
 private:
  void UpdateContext();
  void DrawCell(int32_t x, int32_t y);
  void ProcessTouchEvent(const pp::TouchInputEvent& touches);
  void PostUpdateMessage(const char* message, double value);
  void StartBenchmark();
  void EndBenchmark();
  void Stir();
  void wSimulate(int y);
  static void wSimulateEntry(int y, void* data);
  void Simulate();

  bool simd_;
  bool multithread_;
  bool benchmarking_;
  int benchmark_frame_counter_;
  double bench_start_time_;
  double bench_end_time_;
  uint8_t* cell_in_;
  uint8_t* cell_out_;
  int32_t cell_stride_;
  int32_t width_;
  int32_t height_;
  PSContext2D_t* ps_context_;
  ThreadPool* workers_;
};

Life::Life() :
    simd_(true),
    multithread_(true),
    benchmarking_(false),
    benchmark_frame_counter_(0),
    bench_start_time_(0.0),
    bench_end_time_(0.0),
    cell_in_(NULL),
    cell_out_(NULL),
    cell_stride_(0),
    width_(0),
    height_(0) {
  ps_context_ = PSContext2DAllocate(PP_IMAGEDATAFORMAT_BGRA_PREMUL);
  // Query system for number of processors via sysconf()
  int num_threads = sysconf(_SC_NPROCESSORS_ONLN);
  if (num_threads < 2)
    num_threads = 2;
  workers_ = new ThreadPool(num_threads);
  PSEventSetFilter(PSE_ALL);
}

Life::~Life() {
  delete workers_;
  PSContext2DFree(ps_context_);
}

void Life::UpdateContext() {
  cell_stride_ = (ps_context_->width + kCellAlignment - 1) &
      ~(kCellAlignment - 1);
  size_t size = cell_stride_ * ps_context_->height;

  if (ps_context_->width != width_ || ps_context_->height != height_) {
    free(cell_in_);
    free(cell_out_);

    // Create a new context
    void* in_buffer = NULL;
    void* out_buffer = NULL;
    // alloc buffers aligned on 16 bytes
    posix_memalign(&in_buffer, kCellAlignment, size);
    posix_memalign(&out_buffer, kCellAlignment, size);
    cell_in_ = (uint8_t*) in_buffer;
    cell_out_ = (uint8_t*) out_buffer;

    memset(cell_out_, 0, size);
    for (size_t index = 0; index < size; index++) {
      cell_in_[index] = rand() & 1;
    }
    width_ = ps_context_->width;
    height_ = ps_context_->height;
  }
}

void Life::DrawCell(int32_t x, int32_t y) {
  if (!cell_in_) return;
  if (x > 0 && x < ps_context_->width - 1 &&
      y > 0 && y < ps_context_->height - 1) {
    cell_in_[x - 1 + y * cell_stride_] = 1;
    cell_in_[x + 1 + y * cell_stride_] = 1;
    cell_in_[x + (y - 1) * cell_stride_] = 1;
    cell_in_[x + (y + 1) * cell_stride_] = 1;
  }
}

void Life::ProcessTouchEvent(const pp::TouchInputEvent& touches) {
  uint32_t count = touches.GetTouchCount(PP_TOUCHLIST_TYPE_TOUCHES);
  uint32_t i, j;
  for (i = 0; i < count; i++) {
    pp::TouchPoint touch =
        touches.GetTouchByIndex(PP_TOUCHLIST_TYPE_TOUCHES, i);
    int radius = (int)(touch.radii().x());
    int x = (int)(touch.position().x());
    int y = (int)(touch.position().y());
    // num = 1/100th the area of touch point
    uint32_t num = (uint32_t)(M_PI * radius * radius / 100.0f);
    for (j = 0; j < num; j++) {
      int dx = rand() % (radius * 2) - radius;
      int dy = rand() % (radius * 2) - radius;
      // only plot random cells within the touch area
      if (dx * dx + dy * dy <= radius * radius)
        DrawCell(x + dx, y + dy);
    }
  }
}

void Life::PostUpdateMessage(const char* message_name, double value) {
  pp::VarDictionary message;
  message.Set("message", message_name);
  message.Set("value", value);
  PSInterfaceMessaging()->PostMessage(PSGetInstanceId(), message.pp_var());
}

void Life::StartBenchmark() {
  printf("Running benchmark... (SIMD: %s, multi-threading: %s, size: %dx%d)\n",
      simd_ ? "enabled" : "disabled",
      multithread_ ? "enabled" : "disabled",
      ps_context_->width,
      ps_context_->height);
  benchmarking_ = true;
  bench_start_time_ = getseconds();
  benchmark_frame_counter_ = kFramesToBenchmark;
}

void Life::EndBenchmark() {
  double total_time;
  bench_end_time_ = getseconds();
  benchmarking_ = false;
  total_time = bench_end_time_ - bench_start_time_;
  printf("Finished - benchmark took %f seconds\n", total_time);
  // Send benchmark result to JS.
  PostUpdateMessage("benchmark_result", total_time);
}

void Life::HandleEvent(PSEvent* ps_event) {
  // Give the 2D context a chance to process the event.
  if (0 != PSContext2DHandleEvent(ps_context_, ps_event)) {
    UpdateContext();
    return;
  }

  switch(ps_event->type) {

    case PSE_INSTANCE_HANDLEINPUT: {
      pp::InputEvent event(ps_event->as_resource);

      switch(event.GetType()) {
        case PP_INPUTEVENT_TYPE_MOUSEDOWN:
        case PP_INPUTEVENT_TYPE_MOUSEMOVE: {
          pp::MouseInputEvent mouse = pp::MouseInputEvent(event);
          // If the button is down, draw
          if (mouse.GetModifiers() & PP_INPUTEVENT_MODIFIER_LEFTBUTTONDOWN) {
            PP_Point location = mouse.GetPosition();
            DrawCell(location.x, location.y);
          }
          break;
        }

        case PP_INPUTEVENT_TYPE_TOUCHSTART:
        case PP_INPUTEVENT_TYPE_TOUCHMOVE: {
          pp::TouchInputEvent touches = pp::TouchInputEvent(event);
          ProcessTouchEvent(touches);
          break;
        }

        case PP_INPUTEVENT_TYPE_KEYDOWN: {
          pp::Fullscreen fullscreen((pp::InstanceHandle(PSGetInstanceId())));
          bool isFullscreen = fullscreen.IsFullscreen();
          fullscreen.SetFullscreen(!isFullscreen);
          break;
        }

        default:
          break;
      }
      break;  // case PSE_INSTANCE_HANDLEINPUT
    }

    case PSE_INSTANCE_HANDLEMESSAGE: {
      // Convert Pepper Simple message to PPAPI C++ vars
      pp::Var var(ps_event->as_var);
      if (var.is_dictionary()) {
        pp::VarDictionary dictionary(var);
        std::string message = dictionary.Get("message").AsString();
        if (message == "run_benchmark" && !benchmarking_) {
          StartBenchmark();
        } else if (message == "set_simd") {
          simd_ = dictionary.Get("value").AsBool();
        } else if (message == "set_threading") {
          multithread_ = dictionary.Get("value").AsBool();
        }
      }
      break;  // case PSE_INSTANCE_HANDLEMESSAGE
    }

    default:
      break;
  }
}

void Life::Stir() {
  int32_t width = ps_context_->width;
  int32_t height = ps_context_->height;
  int32_t stride = cell_stride_;
  int32_t i;
  if (cell_in_ == NULL || cell_out_ == NULL)
    return;

  for (i = 0; i < width; ++i) {
    cell_in_[i] = rand() & 1;
    cell_in_[i + (height - 1) * stride] = rand() & 1;
  }
  for (i = 0; i < height; ++i) {
    cell_in_[i * stride] = rand() & 1;
    cell_in_[i * stride + (width - 1)] = rand() & 1;
  }
}

void Life::wSimulate(int y) {
  // Don't run simulation on top and bottom borders
  if (y < 1 || y >= ps_context_->height - 1)
    return;

  // Do neighbor summation; apply rules, output pixel color. Note that a 1 cell
  // wide perimeter is excluded from the simulation update; only cells from
  // x = 1 to x < width - 1 and y = 1 to y < height - 1 are updated.
  uint8_t *src0 = (cell_in_ + (y - 1) * cell_stride_);
  uint8_t *src1 = src0 + cell_stride_;
  uint8_t *src2 = src1 + cell_stride_;
  uint8_t *dst = (cell_out_ + y * cell_stride_) + 1;
  uint32_t *pixels = static_cast<uint32_t *>(ps_context_->data);
  uint32_t *pixel_line = // static_cast<uint32_t*>
      (pixels + y * ps_context_->stride / sizeof(uint32_t));
  int32_t x = 1;

  if (simd_) {
    const u8x16_t kOne = broadcast(1);
    const u8x16_t kFour = broadcast(4);
    const u8x16_t kEight = broadcast(8);
    const u8x16_t kZero255 = {0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    // Prime the src
    u8x16_t src00 = *reinterpret_cast<u8x16_t*>(&src0[0]);
    u8x16_t src01 = *reinterpret_cast<u8x16_t*>(&src0[16]);
    u8x16_t src10 = *reinterpret_cast<u8x16_t*>(&src1[0]);
    u8x16_t src11 = *reinterpret_cast<u8x16_t*>(&src1[16]);
    u8x16_t src20 = *reinterpret_cast<u8x16_t*>(&src2[0]);
    u8x16_t src21 = *reinterpret_cast<u8x16_t*>(&src2[16]);

    // This inner loop is SIMD - each loop iteration will process 16 cells.
    for (; (x + 15) < (ps_context_->width - 1); x += 16) {

      // Construct jittered source temps, using __builtin_shufflevector(..) to
      // extract a shifted 16 element vector from the 32 element concatenation
      // of two source vectors.
      u8x16_t src0j0 = src00;
      u8x16_t src0j1 = __builtin_shufflevector(src00, src01,
          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
      u8x16_t src0j2 = __builtin_shufflevector(src00, src01,
          2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
      u8x16_t src1j0 = src10;
      u8x16_t src1j1 = __builtin_shufflevector(src10, src11,
          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
      u8x16_t src1j2 = __builtin_shufflevector(src10, src11,
          2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
      u8x16_t src2j0 = src20;
      u8x16_t src2j1 = __builtin_shufflevector(src20, src21,
          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
      u8x16_t src2j2 = __builtin_shufflevector(src20, src21,
          2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);

      // Sum the jittered sources to construct neighbor count.
      u8x16_t count = src0j0 + src0j1 +  src0j2 +
                      src1j0 +        +  src1j2 +
                      src2j0 + src2j1 +  src2j2;
      // Add the center cell.
      count = count + count + src1j1;
      // If count > 4 and < 8, center cell will be alive in the next frame.
      u8x16_t alive1 = count > kFour;
      u8x16_t alive2 = count < kEight;
      // Intersect the two comparisons from above.
      u8x16_t alive = alive1 & alive2;

      // At this point, alive[x] will be one of two values:
      //   0x00 for a dead cell
      //   0xFF for an alive cell.
      //
      // Next, convert alive cells to green pixel color.
      // Use __builtin_shufflevector(..) to construct output pixels from
      // concantination of alive vector and kZero255 const vector.
      //   Indices 0..15 select the 16 cells from alive vector.
      //   Index 16 is zero constant from kZero255 constant vector.
      //   Index 17 is 255 constant from kZero255 constant vector.
      //   Output pixel color values are in BGRABGRABGRABGRA order.
      // Since each pixel needs 4 bytes of color information, 16 cells will
      // need to expand to 4 separate 16 byte pixel splats.
      u8x16_t pixel0_3 = __builtin_shufflevector(alive, kZero255,
        16, 0, 16, 17, 16, 1, 16, 17, 16, 2, 16, 17, 16, 3, 16, 17);
      u8x16_t pixel4_7 = __builtin_shufflevector(alive, kZero255,
        16, 4, 16, 17, 16, 5, 16, 17, 16, 6, 16, 17, 16, 7, 16, 17);
      u8x16_t pixel8_11 = __builtin_shufflevector(alive, kZero255,
        16, 8, 16, 17, 16, 9, 16, 17, 16, 10, 16, 17, 16, 11, 16, 17);
      u8x16_t pixel12_15 = __builtin_shufflevector(alive, kZero255,
        16, 12, 16, 17, 16, 13, 16, 17, 16, 14, 16, 17, 16, 15, 16, 17);

      // Write 16 pixels to output pixel buffer.
      *reinterpret_cast<u8x16_t*>(pixel_line + 0) = pixel0_3;
      *reinterpret_cast<u8x16_t*>(pixel_line + 4) = pixel4_7;
      *reinterpret_cast<u8x16_t*>(pixel_line + 8) = pixel8_11;
      *reinterpret_cast<u8x16_t*>(pixel_line + 12) = pixel12_15;

      // Convert alive mask to 1 or 0 and store in destination cell array.
      *reinterpret_cast<u8x16_t*>(dst) = alive & kOne;

      // Increment pointers.
      pixel_line += 16;
      dst += 16;
      src0 += 16;
      src1 += 16;
      src2 += 16;

      // Shift source over by 16 cells and read the next 16 cells.
      src00 = src01;
      src01 = *reinterpret_cast<u8x16_t*>(&src0[16]);
      src10 = src11;
      src11 = *reinterpret_cast<u8x16_t*>(&src1[16]);
      src20 = src21;
      src21 = *reinterpret_cast<u8x16_t*>(&src2[16]);
    }
  }

  // The SIMD loop above does 16 cells at a time.  The loop below is the
  // regular version which processes one cell at a time.  It is used to
  // finish the remainder of the scanline not handled by the SIMD loop.
  for (; x < (ps_context_->width - 1); ++x) {
    // Sum the jittered sources to construct neighbor count.
    int count = src0[0] + src0[1] + src0[2] +
                src1[0] +         + src1[2] +
                src2[0] + src2[1] + src2[2];
    // Add the center cell.
    count = count + count + src1[1];
    // Use table lookup indexed by count to determine pixel & alive state.
    uint32_t color = kNeighborColors[count];
    *pixel_line++ = color;
    *dst++ = kIsAlive[count];
    ++src0;
    ++src1;
    ++src2;
  }
}

// Static entry point for worker thread.
void Life::wSimulateEntry(int slice, void* thiz) {
  static_cast<Life*>(thiz)->wSimulate(slice);
}

void Life::Simulate() {
  // Stir up the edges to prevent the simulation from reaching steady state.
  Stir();

  if (multithread_) {
    // If multi-threading enabled, dispatch tasks to pool of worker threads.
    workers_->Dispatch(ps_context_->height, wSimulateEntry, this);
  } else {
    // Else manually simulate each line on this thread.
    for (int y = 0; y < ps_context_->height; y++) {
      wSimulateEntry(y, this);
    }
  }
  std::swap(cell_in_, cell_out_);
}

void Life::Update() {

  PSContext2DGetBuffer(ps_context_);
  if (NULL == ps_context_->data)
    return;

  // If we somehow have not allocated these pointers yet, skip this frame.
  if (!cell_in_ || !cell_out_) return;

  // Simulate one (or more if benchmarking) frames
  do {
    Simulate();
    if (!benchmarking_)
      break;
    --benchmark_frame_counter_;
  } while(benchmark_frame_counter_ > 0);
  if (benchmarking_)
    EndBenchmark();

 PSContext2DSwapBuffer(ps_context_);
}

// Starting point for the module.  We do not use main since it would
// collide with main in libppapi_cpp.
int main(int argc, char* argv[]) {
  Life life;
  while (true) {
    PSEvent* ps_event;
    // Consume all available events
    while ((ps_event = PSEventTryAcquire()) != NULL) {
      life.HandleEvent(ps_event);
      PSEventRelease(ps_event);
    }
    // Do simulation, render and present.
    life.Update();
  }
  return 0;
}