llvm/flang/runtime/utf.cpp

//===-- runtime/utf.cpp ---------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "utf.h"

namespace Fortran::runtime {

#ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
// clang-format off
RT_OFFLOAD_VAR_GROUP_BEGIN
const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{
  /* 00 - 7F:  7 bit payload in single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 80 - BF: invalid first byte, valid later byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* C0 - DF: 11 bit payload */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* E0 - EF: 16 bit payload */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4,
  /* F8 - FB: 26 bit payload */ 5, 5, 5, 5,
  /* FC - FD: 31 bit payload */ 6, 6,
  /* FE:      32 bit payload */ 7,
  /* FF:      invalid */ 0
};
RT_OFFLOAD_VAR_GROUP_END
// clang-format on
#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS

RT_OFFLOAD_API_GROUP_BEGIN

std::size_t MeasurePreviousUTF8Bytes(const char *end, std::size_t limit) {
  // Scan back over UTF-8 continuation bytes, if any
  for (std::size_t n{1}; n <= limit; ++n) {
    if ((end[-n] & 0xc0) != 0x80) {
      return n;
    }
  }
  return limit;
}

// Non-minimal encodings are accepted.
Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) {
  const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
  std::size_t bytes{MeasureUTF8Bytes(*p0)};
  if (bytes == 1) {
    return char32_t{*p};
  } else if (bytes > 1) {
    std::uint64_t result{char32_t{*p} & (0x7f >> bytes)};
    for (std::size_t j{1}; j < bytes; ++j) {
      std::uint8_t next{p[j]};
      if (next < 0x80 || next > 0xbf) {
        return Fortran::common::nullopt;
      }
      result = (result << 6) | (next & 0x3f);
    }
    if (result <= 0xffffffff) {
      return static_cast<char32_t>(result);
    }
  }
  return Fortran::common::nullopt;
}

std::size_t EncodeUTF8(char *p0, char32_t ucs) {
  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
  if (ucs <= 0x7f) {
    p[0] = ucs;
    return 1;
  } else if (ucs <= 0x7ff) {
    p[0] = 0xc0 | (ucs >> 6);
    p[1] = 0x80 | (ucs & 0x3f);
    return 2;
  } else if (ucs <= 0xffff) {
    p[0] = 0xe0 | (ucs >> 12);
    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
    p[2] = 0x80 | (ucs & 0x3f);
    return 3;
  } else if (ucs <= 0x1fffff) {
    p[0] = 0xf0 | (ucs >> 18);
    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
    p[3] = 0x80 | (ucs & 0x3f);
    return 4;
  } else if (ucs <= 0x3ffffff) {
    p[0] = 0xf8 | (ucs >> 24);
    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
    p[4] = 0x80 | (ucs & 0x3f);
    return 5;
  } else if (ucs <= 0x7ffffff) {
    p[0] = 0xf8 | (ucs >> 30);
    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
    p[5] = 0x80 | (ucs & 0x3f);
    return 6;
  } else {
    p[0] = 0xfe;
    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
    p[6] = 0x80 | (ucs & 0x3f);
    return 7;
  }
}
RT_OFFLOAD_API_GROUP_END

} // namespace Fortran::runtime