/*  Written in 2019 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide.

Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#include <stdint.h>
#include <string.h>

/* This is xoroshiro1024* 1.0, our large-state generator for
   floating-point numbers. We suggest to use its upper bits for
   floating-point generation, as it is slightly faster than
   xoroshiro1024++/xoroshiro1024**.  Its state however is too large--in
   general, the xoshiro256 family should be preferred. It is
   a better replacement for xorshift1024*.

   It passes all tests we are aware of except for the lowest three bits,
   which might fail linearity tests (and just those), so if low linear
   complexity is not considered an issue (as it is usually the case) it
   can be used to generate 64-bit outputs, too.

   We suggest to use a sign test to extract a random Boolean value, and
   right shifts to extract subsets of bits.

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. */


static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}


static int p;
static uint64_t s[16];

uint64_t next(void) {
	const int q = p;
	const uint64_t s0 = s[p = (p + 1) & 15];
	uint64_t s15 = s[q];
	const uint64_t result = s0 * 0x9e3779b97f4a7c13;

	s15 ^= s0;
	s[q] = rotl(s0, 25) ^ s15 ^ (s15 << 27);
	s[p] = rotl(s15, 36);

	return result;
}


/* This is the jump function for the generator. It is equivalent
   to 2^512 calls to next(); it can be used to generate 2^512
   non-overlapping subsequences for parallel computations. */

void jump() {
	static const uint64_t JUMP[] = { 0x931197d8e3177f17,
		0xb59422e0b9138c5f, 0xf06a6afb49d668bb, 0xacb8a6412c8a1401,
		0x12304ec85f0b3468, 0xb7dfe7079209891e, 0x405b7eec77d9eb14,
		0x34ead68280c44e4a, 0xe0e4ba3e0ac9e366, 0x8f46eda8348905b7,
		0x328bf4dbad90d6ff, 0xc8fd6fb31c9effc3, 0xe899d452d4b67652,
		0x45f387286ade3205, 0x03864f454a8920bd, 0xa68fa28725b1b384 };

	uint64_t t[sizeof s / sizeof *s];
	memset(t, 0, sizeof t);
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & UINT64_C(1) << b)
				for(int j = 0; j < sizeof s / sizeof *s; j++)
					t[j] ^= s[(j + p) & sizeof s / sizeof *s - 1];
			next();
		}

	for(int i = 0; i < sizeof s / sizeof *s; i++) {
		s[(i + p) & sizeof s / sizeof *s - 1] = t[i];
	}
}

/* This is the long-jump function for the generator. It is equivalent to
   2^768 calls to next(); it can be used to generate 2^256 starting points,
   from each of which jump() will generate 2^256 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0x7374156360bbf00f,
		0x4630c2efa3b3c1f6, 0x6654183a892786b1, 0x94f7bfcbfb0f1661,
		0x27d8243d3d13eb2d, 0x9701730f3dfb300f, 0x2f293baae6f604ad,
		0xa661831cb60cd8b6, 0x68280c77d9fe008c, 0x50554160f5ba9459,
		0x2fc20b17ec7b2a9a, 0x49189bbdc8ec9f8f, 0x92a65bca41852cc1,
		0xf46820dd0509c12a, 0x52b00c35fbf92185, 0x1e5b3b7f589e03c1 };

	uint64_t t[sizeof s / sizeof *s];
	memset(t, 0, sizeof t);
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (LONG_JUMP[i] & UINT64_C(1) << b)
				for(int j = 0; j < sizeof s / sizeof *s; j++)
					t[j] ^= s[(j + p) & sizeof s / sizeof *s - 1];
			next();
		}

	for(int i = 0; i < sizeof s / sizeof *s; i++) {
		s[(i + p) & sizeof s / sizeof *s - 1] = t[i];
	}
}


/* The following arbitrary-jump function uses a minimal library to compute at
   run time the jump polynomial x^(c * 2^e) mod p(x), where p(x) is the
   characteristic polynomial of the generator; the polynomial is then applied
   to the state with the same accumulate-and-step loop used by jump(). */

#define POLY_DEG 1024
static const uint64_t charpoly[] = {
	0x5cfeb8cc48ddb211, 0xb73e379d035a06dd, 0x17d5100a20a0350e, 0x7550223f68f98cac,
	0x29d373b5c5ed3459, 0x3689b412ef70de48, 0xa1d3b6ee079a7cc6, 0x9bf0b669abd100f8,
	0x955c84e105f60997, 0x6ca140c61889cddd, 0xabaf68c5fc3a0e4a, 0xa46134526b83adc5,
	0x0710704d05683d63, 0x580d080b44b606a2, 0x008040a0580158a1, 0x0000000000800081 };
#include "f2x.c"

/* Applies the precomputed jump polynomial poly (= x^n mod charpoly for the
   desired distance n) to the state, using the same accumulate-and-step loop
   as jump(). Shared by jump_ce() and jump_n(). */

static void jump_apply(const uint64_t *const poly) {
	uint64_t t[sizeof s / sizeof *s];
	memset(t, 0, sizeof t);
	for(int i = 0; i < POLY_WORDS; i++)
		for(int b = 0; b < 64; b++) {
			if (poly[i] & UINT64_C(1) << b)
				for(int j = 0; j < sizeof s / sizeof *s; j++)
					t[j] ^= s[(j + p) & sizeof s / sizeof *s - 1];
			next();
		}

	for(int i = 0; i < sizeof s / sizeof *s; i++)
		s[(i + p) & sizeof s / sizeof *s - 1] = t[i];
}

/* This is the arbitrary-jump function for the generator expressed as c * 2^e.
   It is equivalent to c * 2^e calls to next(); for example, jump_ce(1, 512) is
   equivalent to jump() and jump_ce(1, 768) is equivalent to long_jump().
   Expressing the distance as c * 2^e makes it possible to request both ordinary
   counts (jump_ce(k, 0)) and multiples of power-of-two jumps without handling
   multiple-precision integers. For the jump to be meaningful, c * 2^e should be
   smaller than the period (2^1024 - 1). */

void jump_ce(uint64_t c, uint32_t e) {
	uint64_t poly[POLY_WORDS];
	f2x_jumppoly_ce(c, e, poly);
	jump_apply(poly);
}

/* This is the general arbitrary-jump function for the generator. It is
   equivalent to n calls to next(), where n = jump[0] + jump[1] * 2^64 + ... is
   the little-endian integer held in the POLY_WORDS words of jump. Unlike
   jump_ce(), it can express any jump distance. For the jump to be meaningful, n
   should be smaller than the period (2^1024 - 1). */

void jump_n(const uint64_t jump[POLY_WORDS]) {
	uint64_t poly[POLY_WORDS];
	f2x_jumppoly_n(jump, POLY_WORDS, poly);
	jump_apply(poly);
}