llvm/libc/AOR_v20.02/networking/test/chksum.c

/*
 * Ones' complement checksum test & benchmark
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

#define _GNU_SOURCE
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>
#include <unistd.h>
#include "../include/networking.h"

#if WANT_ASSERT
#undef NDEBUG
#include <assert.h>
#define Assert(exp) assert(exp)
#else
#define Assert(exp) (void) (exp)
#endif

#ifdef __GNUC__
#define may_alias __attribute__((__may_alias__))
#else
#define may_alias
#endif

#define CACHE_LINE 64
#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))

/* Reference implementation - do not modify! */
static uint16_t
checksum_simple(const void *ptr, uint32_t nbytes)
{
    const uint16_t *may_alias hptr = ptr;
    uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */

    /* Sum all halfwords, assume misaligned accesses are handled in HW */
    for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--)
    {
	sum += *hptr++;
    }

    /* Add any trailing odd byte */
    if ((nbytes & 0x01) != 0)
    {
	sum += *(uint8_t *) hptr;
    }

    /* Fold 64-bit sum to 32 bits */
    sum = (sum & 0xffffffff) + (sum >> 32);
    sum = (sum & 0xffffffff) + (sum >> 32);
    Assert(sum == (uint32_t) sum);

    /* Fold 32-bit sum to 16 bits */
    sum = (sum & 0xffff) + (sum >> 16);
    sum = (sum & 0xffff) + (sum >> 16);
    Assert(sum == (uint16_t) sum);

    return (uint16_t) sum;
}

static struct
{
    uint16_t (*cksum_fp)(const void *, uint32_t);
    const char *name;
} implementations[] =
{
    { checksum_simple, "simple"},
    { __chksum, "scalar"},
#if __arm__
    { __chksum_arm_simd, "simd" },
#elif __aarch64__
    { __chksum_aarch64_simd, "simd" },
#endif
    { NULL, NULL}
};

static int
find_impl(const char *name)
{
    for (int i = 0; implementations[i].name != NULL; i++)
    {
	if (strcmp(implementations[i].name, name) == 0)
	{
	    return i;
	}
    }
    return -1;
}

static uint16_t (*CKSUM_FP)(const void *, uint32_t);
static volatile uint16_t SINK;

static bool
verify(const void *data, uint32_t offset, uint32_t size)
{

    uint16_t csum_expected = checksum_simple(data, size);
    uint16_t csum_actual = CKSUM_FP(data, size);
    if (csum_actual != csum_expected)
    {
	fprintf(stderr, "\nInvalid checksum for offset %u size %u: "
		"actual %04x expected %04x (valid)",
		offset, size, csum_actual, csum_expected);
	if (size < 65536)
	{
	    /* Fatal error */
	    exit(EXIT_FAILURE);
	}
	/* Else some implementations only support sizes up to 2^16 */
	return false;
    }
    return true;
}

static uint64_t
clock_get_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
}

static void
benchmark(const uint8_t *base,
	  size_t poolsize,
	  uint32_t blksize,
	  uint32_t numops,
	  uint64_t cpufreq)
{
    printf("%11u ", (unsigned int) blksize); fflush(stdout);

    uint64_t start = clock_get_ns();
    for (uint32_t i = 0; i < numops; i ++)
    {
	/* Read a random value from the pool */
	uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)];
	/* Generate a random starting address */
	const void *data = &base[random % (poolsize - blksize)];
	SINK = CKSUM_FP(data, blksize);
    }
    uint64_t end = clock_get_ns();

#define MEGABYTE 1000000 /* Decimal megabyte (MB) */
    uint64_t elapsed_ns = end - start;
    uint64_t elapsed_ms = elapsed_ns / 1000000;
    uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000);
    uint64_t accbytes = (uint64_t) numops * blksize;
    printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE);
    unsigned int cyc_per_blk = cpufreq / blks_per_s;
    printf("%11u ", cyc_per_blk);
    if (blksize != 0)
    {
	unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize;
	printf("%7u.%03u ",
		cyc_per_byte / 1000, cyc_per_byte % 1000);
    }
    printf("\n");
}

int main(int argc, char *argv[])
{
    int c;
    bool DUMP = false;
    uint32_t IMPL = 0;/* Simple implementation */
    uint64_t CPUFREQ = 0;
    uint32_t BLKSIZE = 0;
    uint32_t NUMOPS = 1000000;
    uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */

    setvbuf(stdout, NULL, _IOLBF, 160);
    while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1)
    {
	switch (c)
	{
	    case 'b' :
		{
		    int blksize = atoi(optarg);
		    if (blksize < 1 || blksize > POOLSIZE / 2)
		    {
			fprintf(stderr, "Invalid block size %d\n", blksize);
			exit(EXIT_FAILURE);
		    }
		    BLKSIZE = (unsigned) blksize;
		    break;
		}
	    case 'd' :
		DUMP = true;
		break;
	    case 'f' :
		{
		    int64_t cpufreq = atoll(optarg);
		    if (cpufreq < 1)
		    {
			fprintf(stderr, "Invalid CPU frequency %"PRId64"\n",
				cpufreq);
			exit(EXIT_FAILURE);
		    }
		    CPUFREQ = cpufreq;
		    break;
		}
	    case 'i' :
		{
		    int impl = find_impl(optarg);
		    if (impl < 0)
		    {
			fprintf(stderr, "Invalid implementation %s\n", optarg);
			goto usage;
		    }
		    IMPL = (unsigned) impl;
		    break;
		}
	    case 'n' :
		{
		    int numops = atoi(optarg);
		    if (numops < 1)
		    {
			fprintf(stderr, "Invalid number of operations %d\n", numops);
			exit(EXIT_FAILURE);
		    }
		    NUMOPS = (unsigned) numops;
		    break;
		}
	    case 'p' :
		{
		    int poolsize = atoi(optarg);
		    if (poolsize < 4096)
		    {
			fprintf(stderr, "Invalid pool size %d\n", poolsize);
			exit(EXIT_FAILURE);
		    }
		    char c = optarg[strlen(optarg) - 1];
		    if (c == 'M')
		    {
			POOLSIZE = (unsigned) poolsize * 1024 * 1024;
		    }
		    else if (c == 'K')
		    {
			POOLSIZE = (unsigned) poolsize * 1024;
		    }
		    else
		    {
			POOLSIZE = (unsigned) poolsize;
		    }
		    break;
		}
	    default :
usage :
		fprintf(stderr, "Usage: checksum <options>\n"
			"-b <blksize>    Block size\n"
			"-d              Dump first 96 bytes of data\n"
			"-f <cpufreq>    CPU frequency (Hz)\n"
			"-i <impl>       Implementation\n"
			"-n <numops>     Number of operations\n"
			"-p <poolsize>   Pool size (K or M suffix)\n"
		       );
		printf("Implementations:");
		for (int i = 0; implementations[i].name != NULL; i++)
		{
		    printf(" %s", implementations[i].name);
		}
		printf("\n");
		exit(EXIT_FAILURE);
	}
    }
    if (optind > argc)
    {
	goto usage;
    }

    CKSUM_FP = implementations[IMPL].cksum_fp;
    POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE);
    uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE,
			MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (base == MAP_FAILED)
    {
	perror("aligned_alloc"), exit(EXIT_FAILURE);
    }
    for (size_t i = 0; i < POOLSIZE / 4; i++)
    {
	((uint32_t *) base)[i] = rand();
    }

    printf("Implementation: %s\n", implementations[IMPL].name);
    printf("numops %u, poolsize ", NUMOPS);
    if (POOLSIZE % (1024 * 1024) == 0)
    {
	printf("%uMiB", POOLSIZE / (1024 * 1024));
    }
    else if (POOLSIZE % 1024 == 0)
    {
	printf("%uKiB", POOLSIZE / 1024);
    }
    else
    {
	printf("%uB", POOLSIZE);
    }
    printf(", blocksize %u, CPU frequency %juMHz\n",
	   BLKSIZE, (uintmax_t) (CPUFREQ / 1000000));
#if WANT_ASSERT
    printf("Warning: assertions are enabled\n");
#endif

    if (DUMP)
    {
	/* Print out first 96 bytes of data for human debugging */
	for (int i = 0; i < 96; i++)
	{
	    if (i % 8 == 0)
		printf("%2u:", i);
	    printf(" %02x", base[i]);
	    if (i % 8 == 7)
		printf("\n");
	}
    }

    /* Verify that chosen algorithm handles all combinations of offsets and sizes */
    printf("Verifying..."); fflush(stdout);
    bool success = true;
    /* Check all (relevant) combinations of size and offset */
    for (int size = 0; size <= 256; size++)
    {
	for (int offset = 0; offset < 255; offset++)
	{
	    /* Check at start of mapped memory */
	    success &= verify(&base[offset], offset, size);
	    /* Check at end of mapped memory */
	    uint8_t *p = base + POOLSIZE - (size + offset);
	    success &= verify(p, (uintptr_t) p % 64, size);
	}
    }
    /* Check increasingly larger sizes */
    for (size_t size = 1; size < POOLSIZE; size *= 2)
    {
	success &= verify(base, 0, size);
    }
    /* Check the full size, this can detect accumulator overflows */
    success &= verify(base, 0, POOLSIZE);
    printf("%s\n", success ? "OK" : "failure");

    /* Print throughput in decimal megabyte (1000000B) per second */
    if (CPUFREQ != 0)
    {
	printf("%11s %11s %11s %11s\n",
	       "block size", "MB/s", "cycles/blk", "cycles/byte");
    }
    else
    {
	printf("%11s %11s %11s %11s\n",
	       "block size", "MB/s", "ns/blk", "ns/byte");
	CPUFREQ = 1000000000;
    }
    if (BLKSIZE != 0)
    {
	benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ);
    }
    else
    {
	static const uint16_t sizes[] =
	    { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };
	for (int i = 0; sizes[i] != 0; i++)
	{
	    uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]);
	    benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ);
	}
    }

    if (munmap(base, POOLSIZE) != 0)
    {
	perror("munmap"), exit(EXIT_FAILURE);
    }

    return success ? EXIT_SUCCESS : EXIT_FAILURE;
}