// -*- Mode: C -*-

//
// Use field-element code generated by fiat-crypto to implement ecdsa with curve secp256r1.
//

#include <stddef.h>
#include <stdint.h>
#include <inttypes.h>

// generated with `fiat-crypto/src/ExtractOCaml/word_by_word_montgomery`
//  [see files for details of the invocation]
#include "p256_64.c"
#include "n256_64.c"

// four uint64_t limbs, LITTLE endian.
typedef fiat_p256_montgomery_domain_field_element fe;
// elliptic curve point in homogeneous coordinates. [i.e. (X : Y : Z) form]
typedef fe ecph[3];

#define FE_ADD fiat_p256_add
#define FE_SUB fiat_p256_sub
#define FE_MUL fiat_p256_mul

// these are in the montgomery domain, limbs in little-endian order.
// see irken-compiler/demo/ec/redc.scm
static const fe c_a  = {0xfffffffffffffffc, 0x00000003ffffffff, 0x0000000000000000, 0xfffffffc00000004};
static const fe c_b3 = {0x89d69e267d4e399f, 0x06d01166698c91b2, 0xb0e66203e5638c84, 0x949012590d95d89c}; // b * 3
static const fe c_n  = {0x9ad169483335f568, 0x89b1054463fe9931, 0x9c0166ce652e96b7, 0x98648c200fe30feb}; // n

// the point at infinity, homogeneous coords, montgomery domain.
static const ecph p_inf = {
  {0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}, // (0
  {0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe}, // :1
  {0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}  // :0)
};

// the generator, homogeneous coords, montgomery domain.
static const ecph p_g   = {
  {0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510, 0x18905f76a53755c6}, // g.x
  {0xddf25357ce95560a, 0x8b4ab8e4ba19e45c, 0xd2e88688dd21f325, 0x8571ff1825885d85}, // g.y
  {0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe}  // 1
};

// copy a field element
static
void
fe_copy (fe out1, const fe arg1)
{
  for (size_t i = 0; i < 4; i++) {
    out1[i] = arg1[i];
  }
}

// copy a curve point
static void
ecph_copy (ecph out1, const ecph arg1)
{
  for (size_t i = 0; i < 3; i++) {
    fe_copy (out1[i], arg1[i]);
  }
}

// from montgomery
static void
p256_fm (fe out, const fe arg)
{
  fe one = {1,0,0,0};
  fiat_p256_mul (out, arg, one);
}

#if 0 // not needed.
// to montgomery
static void
p256_tm (fe out, const fe arg)
{
  // fm(fm(1))
  fe r2n = {0x3, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x4fffffffd};
  fiat_p256_mul (out, arg, r2n);
}
#endif

// from montgomery
static void
n256_fm (fe out, const fe arg)
{
  fe one = {1,0,0,0};
  fiat_n256_mul (out, arg, one);
}

// to montgomery
static void
n256_tm (fe out, const fe arg)
{
  // fm(fm(1))
  fe r2n = {0x83244c95be79eea2, 0x4699799c49bd6fa6, 0x2845b2392b6bec59, 0x66e12d94f3d95620};
  fiat_n256_mul (out, arg, r2n);
}

// input: u8[32], BIG endian
// output: fe, NOT montgomery form
static void
fe_from_bytes (fe out, const uint8_t s[32])
{
  for (size_t i = 0; i < 4; i++) {
    uint64_t limb = 0;
    for (size_t j = 0; j < 8; j++) {
      uint8_t b = s[i * 8 + j];
      limb <<= 8;
      limb |= b;
    }
    out[3-i] = limb;
  }
}

// input: fe, NOT montgomery form, little-endian
// output: u8[32], BIG endian.
static void
fe_to_bytes (uint8_t out[32], const fe arg1)
{
  for (size_t i = 0; i < 4; i++) {
    uint64_t limb = arg1[3-i];
    for (size_t j = 0; j < 8; j++) {
      out[i * 8 + j] = 0xff & (limb >> ((7-j)*8));
    }
  }
}

// 2015 Renes–Costello–Batina "Complete addition formulas for prime order elliptic curves"
// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective.html#doubling-dbl-2015-rcb
static void
dbl_2015_rcb (ecph ro, const ecph ri) // ro = ri + ri
{
  fe t0,t1,t2,t3;
#define X1 ri[0]
#define Y1 ri[1]
#define Z1 ri[2]
#define X3 ro[0]
#define Y3 ro[1]
#define Z3 ro[2]
  FE_MUL (t0, X1, X1);
  FE_MUL (t1, Y1, Y1);
  FE_MUL (t2, Z1, Z1);
  FE_MUL (t3, X1, Y1);
  FE_ADD (t3, t3, t3);
  FE_MUL (Z3, X1, Z1);
  FE_ADD (Z3, Z3, Z3);
  FE_MUL (X3, c_a, Z3);
  FE_MUL (Y3, c_b3, t2);
  FE_ADD (Y3, X3, Y3);
  FE_SUB (X3, t1, Y3);
  FE_ADD (Y3, t1, Y3);
  FE_MUL (Y3, X3, Y3);
  FE_MUL (X3, t3, X3);
  FE_MUL (Z3, c_b3, Z3);
  FE_MUL (t2, c_a, t2);
  FE_SUB (t3, t0, t2);
  FE_MUL (t3, c_a, t3);
  FE_ADD (t3, t3, Z3);
  FE_ADD (Z3, t0, t0);
  FE_ADD (t0, Z3, t0);
  FE_ADD (t0, t0, t2);
  FE_MUL (t0, t0, t3);
  FE_ADD (Y3, Y3, t0);
  FE_MUL (t2, Y1, Z1);
  FE_ADD (t2, t2, t2);
  FE_MUL (t0, t2, t3);
  FE_SUB (X3, X3, t0);
  FE_MUL (Z3, t2, t1);
  FE_ADD (Z3, Z3, Z3);
  FE_ADD (Z3, Z3, Z3);
#undef X1
#undef Y1
#undef Z1
#undef X3
#undef Y3
#undef Z3
}

// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective.html#addition-add-2015-rcb
static void
add_2015_rcb (ecph ro, const ecph ra, const ecph rb) // ro = ra + rb
{
  fe t0,t1,t2,t3,t4,t5;
#define X1 ra[0]
#define Y1 ra[1]
#define Z1 ra[2]
#define X2 rb[0]
#define Y2 rb[1]
#define Z2 rb[2]
#define X3 ro[0]
#define Y3 ro[1]
#define Z3 ro[2]
  FE_MUL (t0, X1, X2);
  FE_MUL (t1, Y1, Y2);
  FE_MUL (t2, Z1, Z2);
  FE_ADD (t3, X1, Y1);
  FE_ADD (t4, X2, Y2);
  FE_MUL (t3, t3, t4);
  FE_ADD (t4, t0, t1);
  FE_SUB (t3, t3, t4);
  FE_ADD (t4, X1, Z1);
  FE_ADD (t5, X2, Z2);
  FE_MUL (t4, t4, t5);
  FE_ADD (t5, t0, t2);
  FE_SUB (t4, t4, t5);
  FE_ADD (t5, Y1, Z1);
  FE_ADD (X3, Y2, Z2);
  FE_MUL (t5, t5, X3);
  FE_ADD (X3, t1, t2);
  FE_SUB (t5, t5, X3);
  FE_MUL (Z3, c_a, t4);
  FE_MUL (X3, c_b3, t2);
  FE_ADD (Z3, X3, Z3);
  FE_SUB (X3, t1, Z3);
  FE_ADD (Z3, t1, Z3);
  FE_MUL (Y3, X3, Z3);
  FE_ADD (t1, t0, t0);
  FE_ADD (t1, t1, t0);
  FE_MUL (t2, c_a, t2);
  FE_MUL (t4, c_b3, t4);
  FE_ADD (t1, t1, t2);
  FE_SUB (t2, t0, t2);
  FE_MUL (t2, c_a, t2);
  FE_ADD (t4, t4, t2);
  FE_MUL (t0, t1, t4);
  FE_ADD (Y3, Y3, t0);
  FE_MUL (t0, t5, t4);
  FE_MUL (X3, t3, X3);
  FE_SUB (X3, X3, t0);
  FE_MUL (t0, t3, t1);
  FE_MUL (Z3, t5, Z3);
  FE_ADD (Z3, Z3, t0);
#undef X1
#undef Y1
#undef Z1
#undef X2
#undef Y2
#undef Z2
#undef X3
#undef Y3
#undef Z3
};

static void
fe_sat_sign_bit (uint8_t * out1, const uint64_t arg1[5]) {
  uint8_t x1;
  x1 = (uint8_t)((arg1[4]) >> 63);
  *out1 = x1;
}

// compute kG using the left-to-right montgomery ladder.
// input: k[32] is in big-endian form, NOT montgomery form.
//        P is a homogenous elliptic curve point, coordinates in montgomery form.
// output: x coordinate in the montgomery domain.
static void
p256_scalarmult (ecph out, const uint8_t k[32], const ecph P)
{
  ecph r0, r1, r2, r3;
  uint8_t flag = 0;
  // note: the top of the loop moves r0, r1 = r2, r3
  ecph_copy (r2, p_inf);      // r0 = (0 : 1 : 0)
  ecph_copy (r3, P);          // r1 = (x : y : 1)
  for (size_t i = 0; i < 32; i++) {
    uint8_t di = k[i];
    for (size_t j = 0; j < 8; j++) {
      uint8_t bit = (di>>(7-j)) & 1;
      // XXX use cmov here?
      if (!flag && bit) {
        // start the *actual* computation when we see the first high bit.
        ecph_copy (r0, p_inf);
        ecph_copy (r1, P);
        flag = 1;
      } else {
        ecph_copy (r0, r2);
        ecph_copy (r1, r3);
        flag = 1;
      }
      if (bit) {
        add_2015_rcb (r2, r0, r1);
        dbl_2015_rcb (r3, r1);
      } else {
        add_2015_rcb (r3, r0, r1);
        dbl_2015_rcb (r2, r0);
      }
    }
  }
  ecph_copy (out, r2);
}

// Bernstein-Yang Inversion.  divstep is generated by fiat-crypto.
// "High-assurance field inversion for curve-based cryptography"
// https://eprint.iacr.org/2021/549.pdf
// https://github.com/bshvass/fiat-crypto/

// based on `inversion/c/montgomery_template.c`,

static void
inverse (uint64_t out[4], uint64_t g[5])
{
  // NOTE: this is the precomp value from the bshvass fork, which _differs_ from the one generated by the original!
  uint64_t h[4], precomp[4] = {0x8000000, 0, 0, 0};
  // this is the prime, little-endian form
  uint64_t f1[5], g1[5], f[5] = {0xffffffffffffffff, 0xffffffff, 0x0, 0xffffffff00000001, 0x0};
  uint64_t v1[4], v[4] = {0, 0, 0, 0};
  uint64_t r1[4], r[4] = {1, 0, 0, 0};
  uint64_t d, d1;
  uint8_t s;
  d = 1;
  for (int i=0; i < 740; i += 2) { // iterations = 741
    fiat_p256_divstep (&d1, f1, g1, v1, r1, d, f, g, v, r);
    fiat_p256_divstep (&d, f, g, v, r, d1, f1, g1, v1, r1);
  }
  // the iteration count is odd, so this last round isn't optional.
  fiat_p256_divstep (&d1, f1, g1, v1, r1, d, f, g, v, r);
  fe_sat_sign_bit (&s, f1);
  fiat_p256_mul (out, v1, precomp);
  fiat_p256_opp (h, out);
  fiat_p256_selectznz (out, s, out, h);
}

// same, but for the prime secp256r1.n (the group order).
static void
inverse_n (uint64_t out[4], uint64_t g[5])
{
  // NOTE: this is the precomp value from the bshvass fork, which _differs_ from the one generated by the original!
  uint64_t h[4], precomp[4] = {0x8000000, 0, 0, 0};
  // this is the prime, little-endian form
  uint64_t f1[5], g1[5], f[5] = {0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000, 0x0};
  uint64_t v1[4], v[4] = {0, 0, 0, 0};
  uint64_t r1[4], r[4] = {1, 0, 0, 0};
  uint64_t d, d1;
  uint8_t s;
  d = 1;
  for (int i=0; i < 740; i += 2) { // iterations = 741
    fiat_n256_divstep (&d1, f1, g1, v1, r1, d, f, g, v, r);
    fiat_n256_divstep (&d, f, g, v, r, d1, f1, g1, v1, r1);
  }
  // the iteration count is odd, so this last round isn't optional.
  fiat_n256_divstep (&d1, f1, g1, v1, r1, d, f, g, v, r);
  fe_sat_sign_bit (&s, f1);
  fiat_n256_mul (out, v1, precomp);
  fiat_n256_opp (h, out);
  fiat_p256_selectznz (out, s, out, h);
}

// input: P = homogeneous point in montgomery form.
// output: (out_x, out_y) = standard elliptic curve point, coords in montgomery form.

static void
from_homogenous (fe out_x, fe out_y, const ecph P)
{
  // compute x = X/Z = X * Z^-1 and y = Y/Z = Y * Z^-1
  // extra limb need by `inverse`
  uint64_t Zsat[5] = {P[2][0], P[2][1], P[2][2], P[2][3], 0};
  fe Zinv;
  inverse (Zinv, Zsat);
  fiat_p256_mul (out_x, P[0], Zinv);
  fiat_p256_mul (out_y, P[1], Zinv);
}

static void
be_bytes_to_mont_n (fe out, const uint8_t in[32])
{
  fe out0;
  fe_from_bytes (out0, in);
  n256_tm (out, out0);
}

// inputs: arg1, arg2, NOT montgomery form.
// out = (arg1 < arg2)
void
u256_greater_than (uint8_t *out, const fe arg1, const fe arg2)
{
  uint64_t x1,x3,x5,x7;
  uint8_t x2,x4,x6,x8;
  fiat_p256_subborrowx_u64(&x1, &x2, 0x0, (arg2[0]), (arg1[0]));
  fiat_p256_subborrowx_u64(&x3, &x4, x2, (arg2[1]), (arg1[1]));
  fiat_p256_subborrowx_u64(&x5, &x6, x4, (arg2[2]), (arg1[2]));
  fiat_p256_subborrowx_u64(&x7, &x8, x6, (arg2[3]), (arg1[3]));
  *out = (x8 != 0);
}

// input: p256 montgomery in 0 <= x < p
// output: either x or x-n
static void
sub_mod_n (fe out, const fe x)
{
  fe x0, x1;
  uint8_t gt;
  static const fe N = {0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000};
  p256_fm (x0, x);
  u256_greater_than (&gt, x0, N);
  fiat_p256_sub (x1, x, c_n);
  fiat_p256_selectznz (out, gt, x, x1);
}

// inputs: all are in BIG endian form.
//     sk (secret key)
//     k (nonce/rfc6979)
//     h (hashed msg)
// outputs: in BIG endian form.
//     out_r, out_s

void
p256_ecdsa_sign (uint8_t out_r[32], uint8_t out_s[32],
                 const uint8_t sk[32], const uint8_t k[32], const uint8_t h[32])
{
  ecph kG;
  fe kG_x, kG_y;
  p256_scalarmult (kG, k, p_g);
  from_homogenous (kG_x, kG_y, kG);
  fe xm_n, xm_s;
  p256_fm (xm_s, kG_x);
  n256_tm (xm_n, xm_s);
  fe km;
  fe kinv;
  fe sk_m;
  fe zm, rm, sm;
  fe t0, t1, t2;
  fe r0, s0;
  sub_mod_n (rm, xm_n);
  be_bytes_to_mont_n (km, k);
  uint64_t ksat[5] = {km[0], km[1], km[2], km[3], 0};
  inverse_n (kinv, ksat);
  be_bytes_to_mont_n (sk_m, sk);
  be_bytes_to_mont_n (zm, h);
  fiat_n256_mul (t0, rm, sk_m);
  fiat_n256_add (t1, zm, t0);
  fiat_n256_mul (t2, t1, kinv);
  sub_mod_n (sm, t2);
  n256_fm (r0, rm);
  n256_fm (s0, sm);
  fe_to_bytes (out_r, r0);
  fe_to_bytes (out_s, s0);
}