godot/thirdparty/pcre2/src/pcre2_jit_simd_inc.h

/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
                    This module by Zoltan Herczeg
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2019 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/

#if !(defined SUPPORT_VALGRIND)

#if ((defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
     || (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \
     || (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64))

vector_compare_type;

#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
{}
#else /* !SLJIT_CONFIG_X86 */
static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
return 15;
#elif PCRE2_CODE_UNIT_WIDTH == 16
return 7;
#elif PCRE2_CODE_UNIT_WIDTH == 32
return 3;
#else
#error "Unsupported unit width"
#endif
}
#endif /* SLJIT_CONFIG_X86 */

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)
{}
#endif

#endif /* SLJIT_CONFIG_X86 || SLJIT_CONFIG_S390X */

#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)

static sljit_s32 character_to_int32(PCRE2_UCHAR chr)
{}

static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  sljit_s32 reg_type, int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{}

#define JIT_HAS_FAST_FORWARD_CHAR_SIMD

static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{}

#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD

static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
{}

#ifndef _WIN64

#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD

static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{}

#endif /* !_WIN64 */

#undef SIMD_COMPARE_TYPE_INDEX

#endif /* SLJIT_CONFIG_X86 */

#if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 && (defined __ARM_NEON || defined __ARM_NEON__))

#include <arm_neon.h>

typedef union {
  unsigned int x;
  struct { unsigned char c1, c2, c3, c4; } c;
} int_char;

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
static SLJIT_INLINE int utf_continue(PCRE2_SPTR s)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
return (*s & 0xc0) == 0x80;
#elif PCRE2_CODE_UNIT_WIDTH == 16
return (*s & 0xfc00) == 0xdc00;
#else
#error "Unknown code width"
#endif
}
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define VECTOR_FACTOR
#define vect_t
#define VLD1Q
#define VCEQQ
#define VORRQ
#define VST1Q
#define VDUPQ
#define VEXTQ
#define VANDQ
typedef union {
       uint8_t mem[16];
       uint64_t dw[2];
} quad_word;
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define VECTOR_FACTOR
#define vect_t
#define VLD1Q
#define VCEQQ
#define VORRQ
#define VST1Q
#define VDUPQ
#define VEXTQ
#define VANDQ
typedef union {
       uint16_t mem[8];
       uint64_t dw[2];
} quad_word;
#else
#define VECTOR_FACTOR
#define vect_t
#define VLD1Q
#define VCEQQ
#define VORRQ
#define VST1Q
#define VDUPQ
#define VEXTQ
#define VANDQ
typedef union {
       uint32_t mem[4];
       uint64_t dw[2];
} quad_word;
#endif

#define FFCS
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
#define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCS

#define FFCS_2
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
#define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCS_2

#define FFCS_MASK
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
#define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCS_MASK

#define JIT_HAS_FAST_FORWARD_CHAR_SIMD

static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
int_char ic;
struct sljit_jump *partial_quit, *quit;
/* Save temporary registers. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP3, 0);

/* Prepare function arguments */
OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
GET_LOCAL_BASE(SLJIT_R1, 0, LOCALS0);
OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, offset);

if (char1 == char2)
  {
    ic.c.c1 = char1;
    ic.c.c2 = char2;
    OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf && offset > 0)
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_utf));
  else
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
#else
  sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                   SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
#endif
  }
else
  {
  PCRE2_UCHAR mask = char1 ^ char2;
  if (is_powerof2(mask))
    {
    ic.c.c1 = char1 | mask;
    ic.c.c2 = mask;
    OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
    if (common->utf && offset > 0)
      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask_utf));
    else
      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
#else
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
#endif
    }
  else
    {
      ic.c.c1 = char1;
      ic.c.c2 = char2;
      OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
    if (common->utf && offset > 0)
      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2_utf));
    else
      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
#else
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
#endif
    }
  }
/* Restore registers. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);

/* Check return value. */
partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit);

/* Fast forward STR_PTR to the result of memchr. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
if (common->mode != PCRE2_JIT_COMPLETE)
  {
  quit = CMP(SLJIT_NOT_ZERO, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
  JUMPHERE(partial_quit);
  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
  JUMPHERE(quit);
  }
}

typedef enum {
  compare_match1,
  compare_match1i,
  compare_match2,
} compare_type;

static inline vect_t fast_forward_char_pair_compare(compare_type ctype, vect_t dst, vect_t cmp1, vect_t cmp2)
{
if (ctype == compare_match2)
  {
  vect_t tmp = dst;
  dst = VCEQQ(dst, cmp1);
  tmp = VCEQQ(tmp, cmp2);
  dst = VORRQ(dst, tmp);
  return dst;
  }

if (ctype == compare_match1i)
  dst = VORRQ(dst, cmp2);
dst = VCEQQ(dst, cmp1);
return dst;
}

static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
return 15;
#elif PCRE2_CODE_UNIT_WIDTH == 16
return 7;
#elif PCRE2_CODE_UNIT_WIDTH == 32
return 3;
#else
#error "Unsupported unit width"
#endif
}

/* ARM doesn't have a shift left across lanes. */
static SLJIT_INLINE vect_t shift_left_n_lanes(vect_t a, sljit_u8 n)
{
vect_t zero = VDUPQ(0);
SLJIT_ASSERT(0 < n && n < VECTOR_FACTOR);
/* VEXTQ takes an immediate as last argument. */
#define C
switch (n)
  {
  C(1); C(2); C(3);
#if PCRE2_CODE_UNIT_WIDTH != 32
  C(4); C(5); C(6); C(7);
# if PCRE2_CODE_UNIT_WIDTH != 16
  C(8); C(9); C(10); C(11); C(12); C(13); C(14); C(15);
# endif
#endif
  default:
    /* Based on the ASSERT(0 < n && n < VECTOR_FACTOR) above, this won't
       happen. The return is still here for compilers to not warn. */
    return a;
  }
}

#define FFCPS
#define FFCPS_DIFF1
#define FFCPS_CHAR1A2A

#define FFCPS_0
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
#define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCPS_0

#undef FFCPS_CHAR1A2A

#define FFCPS_1
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
#define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCPS_1

#undef FFCPS_DIFF1

#define FFCPS_DEFAULT
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
#define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCPS

#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD

static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
struct sljit_jump *partial_quit;
int_char ic;
SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
SLJIT_ASSERT(compiler->scratches == 5);

/* Save temporary register STR_PTR. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);

/* Prepare arguments for the function call. */
if (common->match_end_ptr == 0)
   OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
else
  {
  OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, SLJIT_R0, 0);
  SELECT(SLJIT_LESS, SLJIT_R0, STR_END, 0, SLJIT_R0);
  }

GET_LOCAL_BASE(SLJIT_R1, 0, LOCALS0);
OP1(SLJIT_MOV_S32, SLJIT_R2, 0, SLJIT_IMM, offs1);
OP1(SLJIT_MOV_S32, SLJIT_R3, 0, SLJIT_IMM, offs2);
ic.c.c1 = char1a;
ic.c.c2 = char1b;
ic.c.c3 = char2a;
ic.c.c4 = char2b;
OP1(SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_IMM, ic.x);

if (diff == 1) {
  if (char1a == char1b && char2a == char2b) {
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
    if (common->utf)
      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0_utf));
    else
#endif
      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0));
  } else {
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
    if (common->utf)
      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1_utf));
    else
#endif
      sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                       SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1));
  }
} else {
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
  if (common->utf)
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default_utf));
  else
#endif
    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
                     SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default));
}

/* Restore STR_PTR register. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);

/* Check return value. */
partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
add_jump(compiler, &common->failed_match, partial_quit);

/* Fast forward STR_PTR to the result of memchr. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);

JUMPHERE(partial_quit);
}

#endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */

#if (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)

#if PCRE2_CODE_UNIT_WIDTH == 8
#define VECTOR_ELEMENT_SIZE
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define VECTOR_ELEMENT_SIZE
#elif PCRE2_CODE_UNIT_WIDTH == 32
#define VECTOR_ELEMENT_SIZE
#else
#error "Unsupported unit width"
#endif

static void load_from_mem_vector(struct sljit_compiler *compiler, BOOL vlbb, sljit_s32 dst_vreg,
  sljit_s32 base_reg, sljit_s32 index_reg)
{
sljit_u16 instruction[3];

instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | index_reg);
instruction[1] = (sljit_u16)(base_reg << 12);
instruction[2] = (sljit_u16)((0x8 << 8) | (vlbb ? 0x07 : 0x06));

sljit_emit_op_custom(compiler, instruction, 6);
}

#if PCRE2_CODE_UNIT_WIDTH == 32

static void replicate_imm_vector(struct sljit_compiler *compiler, int step, sljit_s32 dst_vreg,
  PCRE2_UCHAR chr, sljit_s32 tmp_general_reg)
{
sljit_u16 instruction[3];

SLJIT_ASSERT(step >= 0 && step <= 1);

if (chr < 0x7fff)
  {
  if (step == 1)
    return;

  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4));
  instruction[1] = (sljit_u16)chr;
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
  sljit_emit_op_custom(compiler, instruction, 6);
  return;
  }

if (step == 0)
  {
  OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);

  /* VLVG */
  instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(SLJIT_GP_REGISTER, tmp_general_reg));
  instruction[1] = 0;
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);
  sljit_emit_op_custom(compiler, instruction, 6);
  return;
  }

/* VREP */
instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | dst_vreg);
instruction[1] = 0;
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xc << 8) | 0x4d);
sljit_emit_op_custom(compiler, instruction, 6);
}

#endif

static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
sljit_u16 instruction[3];

SLJIT_ASSERT(step >= 0 && step <= 2);

if (step == 1)
  {
  /* VCEQ */
  instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
  instruction[1] = (sljit_u16)(cmp1_ind << 12);
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
  sljit_emit_op_custom(compiler, instruction, 6);
  return;
  }

if (compare_type != vector_compare_match2)
  {
  if (step == 0 && compare_type == vector_compare_match1i)
    {
    /* VO */
    instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
    instruction[1] = (sljit_u16)(cmp2_ind << 12);
    instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);
    sljit_emit_op_custom(compiler, instruction, 6);
    }
  return;
  }

switch (step)
  {
  case 0:
  /* VCEQ */
  instruction[0] = (sljit_u16)(0xe700 | (tmp_ind << 4) | dst_ind);
  instruction[1] = (sljit_u16)(cmp2_ind << 12);
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
  sljit_emit_op_custom(compiler, instruction, 6);
  return;

  case 2:
  /* VO */
  instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
  instruction[1] = (sljit_u16)(tmp_ind << 12);
  instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);
  sljit_emit_op_custom(compiler, instruction, 6);
  return;
  }
}

#define JIT_HAS_FAST_FORWARD_CHAR_SIMD

static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
sljit_u16 instruction[3];
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *quit;
struct sljit_jump *partial_quit[2];
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
sljit_s32 cmp2_ind = 3;
sljit_s32 zero_ind = 4;
sljit_u32 bit = 0;
int i;

SLJIT_UNUSED_ARG(offset);

if (char1 != char2)
  {
  bit = char1 ^ char2;
  compare_type = vector_compare_match1i;

  if (!is_powerof2(bit))
    {
    bit = 0;
    compare_type = vector_compare_match2;
    }
  }

partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[0]);

/* First part (unaligned start) */

OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);

#if PCRE2_CODE_UNIT_WIDTH != 32

/* VREPI */
instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
instruction[1] = (sljit_u16)(char1 | bit);
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
sljit_emit_op_custom(compiler, instruction, 6);

if (char1 != char2)
  {
  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
  instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
  /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
  sljit_emit_op_custom(compiler, instruction, 6);
  }

#else /* PCRE2_CODE_UNIT_WIDTH == 32 */

for (int i = 0; i < 2; i++)
  {
  replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP1);

  if (char1 != char2)
    replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP1);
  }

#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */

if (compare_type == vector_compare_match2)
  {
  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
  instruction[1] = 0;
  instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
  sljit_emit_op_custom(compiler, instruction, 6);
  }

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);

if (compare_type != vector_compare_match2)
  {
  if (compare_type == vector_compare_match1i)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFEE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
  sljit_emit_op_custom(compiler, instruction, 6);
  }
else
  {
  for (i = 0; i < 3; i++)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFENE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
  sljit_emit_op_custom(compiler, instruction, 6);
  }

/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);

OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[1]);

load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);

if (compare_type != vector_compare_match2)
  {
  if (compare_type == vector_compare_match1i)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFEE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
  sljit_emit_op_custom(compiler, instruction, 6);
  }
else
  {
  for (i = 0; i < 3; i++)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFENE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
  sljit_emit_op_custom(compiler, instruction, 6);
  }

sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
JUMPTO(SLJIT_OVERFLOW, start);

/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

JUMPHERE(quit);

if (common->mode != PCRE2_JIT_COMPLETE)
  {
  JUMPHERE(partial_quit[0]);
  JUMPHERE(partial_quit[1]);
  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
  }
else
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
  {
  SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));

  quit = jump_if_utf_char_start(compiler, TMP1);

  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

  OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
  JUMPTO(SLJIT_JUMP, restart);

  JUMPHERE(quit);
  }
#endif
}

#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD

static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
{
DEFINE_COMPILER;
sljit_u16 instruction[3];
struct sljit_label *start;
struct sljit_jump *quit;
jump_list *not_found = NULL;
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 tmp3_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP3);
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
sljit_s32 cmp2_ind = 3;
sljit_s32 zero_ind = 4;
sljit_u32 bit = 0;
int i;

if (char1 != char2)
  {
  bit = char1 ^ char2;
  compare_type = vector_compare_match1i;

  if (!is_powerof2(bit))
    {
    bit = 0;
    compare_type = vector_compare_match2;
    }
  }

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));

/* First part (unaligned start) */

OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, 16);

#if PCRE2_CODE_UNIT_WIDTH != 32

/* VREPI */
instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
instruction[1] = (sljit_u16)(char1 | bit);
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
sljit_emit_op_custom(compiler, instruction, 6);

if (char1 != char2)
  {
  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
  instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
  /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
  sljit_emit_op_custom(compiler, instruction, 6);
  }

#else /* PCRE2_CODE_UNIT_WIDTH == 32 */

for (int i = 0; i < 2; i++)
  {
  replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP3);

  if (char1 != char2)
    replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP3);
  }

#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */

if (compare_type == vector_compare_match2)
  {
  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
  instruction[1] = 0;
  instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
  sljit_emit_op_custom(compiler, instruction, 6);
  }

load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);

if (compare_type != vector_compare_match2)
  {
  if (compare_type == vector_compare_match1i)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFEE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
  sljit_emit_op_custom(compiler, instruction, 6);
  }
else
  {
  for (i = 0; i < 3; i++)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFENE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
  sljit_emit_op_custom(compiler, instruction, 6);
  }

/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
quit = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);

OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 16);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 16);

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));

load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);

if (compare_type != vector_compare_match2)
  {
  if (compare_type == vector_compare_match1i)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFEE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
  sljit_emit_op_custom(compiler, instruction, 6);
  }
else
  {
  for (i = 0; i < 3; i++)
    fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

  /* VFENE */
  instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
  instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
  instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
  sljit_emit_op_custom(compiler, instruction, 6);
  }

sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
JUMPTO(SLJIT_OVERFLOW, start);

/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);

JUMPHERE(quit);
add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));

return not_found;
}

#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD

static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
sljit_u16 instruction[3];
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *quit;
struct sljit_jump *jump[2];
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
sljit_s32 diff = IN_UCHARS(offs2 - offs1);
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data1_ind = 0;
sljit_s32 data2_ind = 1;
sljit_s32 tmp1_ind = 2;
sljit_s32 tmp2_ind = 3;
sljit_s32 cmp1a_ind = 4;
sljit_s32 cmp1b_ind = 5;
sljit_s32 cmp2a_ind = 6;
sljit_s32 cmp2b_ind = 7;
sljit_s32 zero_ind = 8;
int i;

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
SLJIT_ASSERT(-diff <= (sljit_s32)IN_UCHARS(max_fast_forward_char_pair_offset()));
SLJIT_ASSERT(tmp1_reg_ind != 0 && tmp2_reg_ind != 0);

if (char1a != char1b)
  {
  bit1 = char1a ^ char1b;
  compare1_type = vector_compare_match1i;

  if (!is_powerof2(bit1))
    {
    bit1 = 0;
    compare1_type = vector_compare_match2;
    }
  }

if (char2a != char2b)
  {
  bit2 = char2a ^ char2b;
  compare2_type = vector_compare_match1i;

  if (!is_powerof2(bit2))
    {
    bit2 = 0;
    compare2_type = vector_compare_match2;
    }
  }

/* Initialize. */
if (common->match_end_ptr != 0)
  {
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
  }

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);

#if PCRE2_CODE_UNIT_WIDTH != 32

OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);

/* VREPI */
instruction[0] = (sljit_u16)(0xe700 | (cmp1a_ind << 4));
instruction[1] = (sljit_u16)(char1a | bit1);
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
sljit_emit_op_custom(compiler, instruction, 6);

if (char1a != char1b)
  {
  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (cmp1b_ind << 4));
  instruction[1] = (sljit_u16)(bit1 != 0 ? bit1 : char1b);
  /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
  sljit_emit_op_custom(compiler, instruction, 6);
  }

/* VREPI */
instruction[0] = (sljit_u16)(0xe700 | (cmp2a_ind << 4));
instruction[1] = (sljit_u16)(char2a | bit2);
/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
sljit_emit_op_custom(compiler, instruction, 6);

if (char2a != char2b)
  {
  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (cmp2b_ind << 4));
  instruction[1] = (sljit_u16)(bit2 != 0 ? bit2 : char2b);
  /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
  sljit_emit_op_custom(compiler, instruction, 6);
  }

#else /* PCRE2_CODE_UNIT_WIDTH == 32 */

for (int i = 0; i < 2; i++)
  {
  replicate_imm_vector(compiler, i, cmp1a_ind, char1a | bit1, TMP1);

  if (char1a != char1b)
    replicate_imm_vector(compiler, i, cmp1b_ind, bit1 != 0 ? bit1 : char1b, TMP1);

  replicate_imm_vector(compiler, i, cmp2a_ind, char2a | bit2, TMP1);

  if (char2a != char2b)
    replicate_imm_vector(compiler, i, cmp2b_ind, bit2 != 0 ? bit2 : char2b, TMP1);
  }

OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);

#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */

/* VREPI */
instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
instruction[1] = 0;
instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
sljit_emit_op_custom(compiler, instruction, 6);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

jump[0] = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
load_from_mem_vector(compiler, TRUE, data2_ind, tmp1_reg_ind, 0);
jump[1] = JUMP(SLJIT_JUMP);
JUMPHERE(jump[0]);
load_from_mem_vector(compiler, FALSE, data2_ind, tmp1_reg_ind, 0);
JUMPHERE(jump[1]);

load_from_mem_vector(compiler, TRUE, data1_ind, str_ptr_reg_ind, 0);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 16);

for (i = 0; i < 3; i++)
  {
  fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
  fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
  }

/* VN */
instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
instruction[1] = (sljit_u16)(data2_ind << 12);
instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
sljit_emit_op_custom(compiler, instruction, 6);

/* VFENE */
instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
sljit_emit_op_custom(compiler, instruction, 6);

/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data1_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);

OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, diff);

/* Main loop. */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

load_from_mem_vector(compiler, FALSE, data1_ind, str_ptr_reg_ind, 0);
load_from_mem_vector(compiler, FALSE, data2_ind, str_ptr_reg_ind, tmp1_reg_ind);

for (i = 0; i < 3; i++)
  {
  fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
  fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
  }

/* VN */
instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
instruction[1] = (sljit_u16)(data2_ind << 12);
instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
sljit_emit_op_custom(compiler, instruction, 6);

/* VFENE */
instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
sljit_emit_op_custom(compiler, instruction, 6);

sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
JUMPTO(SLJIT_OVERFLOW, start);

/* VLGVB */
instruction[0] = (sljit_u16)(0xe700 | (tmp2_reg_ind << 4) | data1_ind);
instruction[1] = 7;
instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
sljit_emit_op_custom(compiler, instruction, 6);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

JUMPHERE(quit);

add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
  {
  SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

  quit = jump_if_utf_char_start(compiler, TMP1);

  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

  /* TMP1 contains diff. */
  OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
  OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
  JUMPTO(SLJIT_JUMP, restart);

  JUMPHERE(quit);
  }
#endif

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}

#endif /* SLJIT_CONFIG_S390X */

#if (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64)

#ifdef __linux__
/* Using getauxval(AT_HWCAP) under Linux for detecting whether LSX is available */
#include <sys/auxv.h>
#define LOONGARCH_HWCAP_LSX
#define HAS_LSX_SUPPORT
#else
#define HAS_LSX_SUPPORT
#endif

typedef sljit_ins sljit_u32;

#define SI12_IMM_MASK
#define UI5_IMM_MASK
#define UI2_IMM_MASK

#define VD
#define VJ
#define VK
#define RD_V
#define RJ_V

#define IMM_SI12
#define IMM_UI5
#define IMM_UI2

// LSX OPCODES:
#define VLD
#define VOR_V
#define VAND_V
#define VBSLL_V
#define VMSKLTZ_B
#define VPICKVE2GR_WU

#if PCRE2_CODE_UNIT_WIDTH == 8
#define VREPLGR2VR
#define VSEQ
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define VREPLGR2VR
#define VSEQ
#else
#define VREPLGR2VR
#define VSEQ
#endif

static void fast_forward_char_pair_lsx_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
  sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
if (compare_type != vector_compare_match2)
  {
  if (compare_type == vector_compare_match1i)
    {
    /* VOR.V vd, vj, vk */
    push_inst(compiler, VOR_V | VD(dst_ind) | VJ(cmp2_ind) | VK(dst_ind));
    }

  /* VSEQ.B/H/W vd, vj, vk */
  push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));
  return;
  }

/* VBSLL.V vd, vj, ui5 */
push_inst(compiler, VBSLL_V | VD(tmp_ind) | VJ(dst_ind) | IMM_UI5(0));

/* VSEQ.B/H/W vd, vj, vk */
push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));

/* VSEQ.B/H/W vd, vj, vk */
push_inst(compiler, VSEQ | VD(tmp_ind) | VJ(tmp_ind) | VK(cmp2_ind));

/* VOR vd, vj, vk */
push_inst(compiler, VOR_V | VD(dst_ind) | VJ(tmp_ind) | VK(dst_ind));
return;
}

#define JIT_HAS_FAST_FORWARD_CHAR_SIMD

static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *quit;
struct sljit_jump *partial_quit[2];
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
sljit_s32 cmp2_ind = 3;
sljit_u32 bit = 0;

SLJIT_UNUSED_ARG(offset);

if (char1 != char2)
  {
  bit = char1 ^ char2;
  compare_type = vector_compare_match1i;

  if (!is_powerof2(bit))
    {
    bit = 0;
    compare_type = vector_compare_match2;
    }
  }

partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[0]);

/* First part (unaligned start) */

OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);

/* VREPLGR2VR.B/H/W vd, rj */
push_inst(compiler, VREPLGR2VR | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));

if (char1 != char2)
  {
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);

  /* VREPLGR2VR.B/H/W vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
  }

OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
  add_jump(compiler, &common->failed_match, partial_quit[1]);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(quit);

/* CTZ.W rd, rj */
push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

if (common->mode != PCRE2_JIT_COMPLETE)
  {
  JUMPHERE(partial_quit[0]);
  JUMPHERE(partial_quit[1]);
  OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
  SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
  }
else
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
  {
  SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);

  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));

  quit = jump_if_utf_char_start(compiler, TMP1);

  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
  OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
  JUMPTO(SLJIT_JUMP, restart);

  JUMPHERE(quit);
  }
#endif
}

#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD

static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
{
DEFINE_COMPILER;
struct sljit_label *start;
struct sljit_jump *quit;
jump_list *not_found = NULL;
vector_compare_type compare_type = vector_compare_match1;
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
sljit_s32 cmp2_ind = 3;
sljit_u32 bit = 0;

if (char1 != char2)
  {
  bit = char1 ^ char2;
  compare_type = vector_compare_match1i;

  if (!is_powerof2(bit))
    {
    bit = 0;
    compare_type = vector_compare_match2;
    }
  }

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);

/* First part (unaligned start) */

OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);

/* VREPLGR2VR vd, rj */
push_inst(compiler, VREPLGR2VR | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));

if (char1 != char2)
  {
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);
  /* VREPLGR2VR vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
  }

OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Second part (aligned) */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);

add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(quit);

/* CTZ.W rd, rj */
push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));

OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));

OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
return not_found;
}

#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD

static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
  PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data1_ind = 0;
sljit_s32 data2_ind = 1;
sljit_s32 tmp1_ind = 2;
sljit_s32 tmp2_ind = 3;
sljit_s32 cmp1a_ind = 4;
sljit_s32 cmp1b_ind = 5;
sljit_s32 cmp2a_ind = 6;
sljit_s32 cmp2b_ind = 7;
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
#endif
struct sljit_jump *jump[2];

SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));

/* Initialize. */
if (common->match_end_ptr != 0)
  {
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
  OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);

  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
  SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
  }

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

if (char1a == char1b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
else
  {
  bit1 = char1a ^ char1b;
  if (is_powerof2(bit1))
    {
    compare1_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a | bit1);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit1);
    }
  else
    {
    compare1_type = vector_compare_match2;
    bit1 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char1b);
    }
  }

/* VREPLGR2VR vd, rj */
push_inst(compiler, VREPLGR2VR | VD(cmp1a_ind) | RJ_V(tmp1_reg_ind));

if (char1a != char1b)
  {
  /* VREPLGR2VR vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(cmp1b_ind) | RJ_V(tmp2_reg_ind));
  }

if (char2a == char2b)
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
else
  {
  bit2 = char2a ^ char2b;
  if (is_powerof2(bit2))
    {
    compare2_type = vector_compare_match1i;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a | bit2);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit2);
    }
  else
    {
    compare2_type = vector_compare_match2;
    bit2 = 0;
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
    OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char2b);
    }
  }

/* VREPLGR2VR vd, rj */
push_inst(compiler, VREPLGR2VR | VD(cmp2a_ind) | RJ_V(tmp1_reg_ind));

if (char2a != char2b)
  {
  /* VREPLGR2VR vd, rj */
  push_inst(compiler, VREPLGR2VR | VD(cmp2b_ind) | RJ_V(tmp2_reg_ind));
  }

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif

OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));

jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));
jump[1] = JUMP(SLJIT_JUMP);

JUMPHERE(jump[0]);

/* VBSLL.V vd, vj, ui5 */
push_inst(compiler, VBSLL_V | VD(data2_ind) | VJ(data1_ind) | IMM_UI5(diff));

JUMPHERE(jump[1]);

fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);

/* VAND vd, vj, vk */
push_inst(compiler, VOR_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));

/* Ignore matches before the first STR_PTR. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);

jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

/* Main loop. */
start = LABEL();

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

/* VLD vd, rj, si12 */
push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));

fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);

/* VAND.V vd, vj, vk */
push_inst(compiler, VAND_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));

/* VMSKLTZ.B vd, vj */
push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));

/* VPICKVE2GR.WU rd, vj, ui2 */
push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));

CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);

JUMPHERE(jump[0]);

/* CTZ.W rd, rj */
push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);

add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
  {
  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));

  jump[0] = jump_if_utf_char_start(compiler, TMP1);

  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
  CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);

  add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));

  JUMPHERE(jump[0]);
  }
#endif

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));

if (common->match_end_ptr != 0)
  OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
}

#endif /* SLJIT_CONFIG_LOONGARCH_64 */

#endif /* !SUPPORT_VALGRIND */