-rw-r--r-- 6026 libmceliece-20240812/crypto_kem/348864/avx/controlbits.c raw
/* This file is for implementing the Nassimi-Sahni algorithm */
/* See David Nassimi, Sartaj Sahni "Parallel algorithms to set up the Benes permutationnetwork" */
/* See also https://cr.yp.to/papers/controlbits-20200923.pdf */
// 20240805 djb: more mask usage
// 20240508 djb: switch to crypto_sort_int32
// 20221230 djb: add linker line
// linker define controlbitsfrompermutation
#include <string.h>
#include "crypto_declassify.h"
#include "controlbits.h"
#include "crypto_sort_int32.h"
typedef int16_t int16;
typedef int32_t int32;
#include "crypto_int32.h"
#define int32_min crypto_int32_min
#include "crypto_int16.h"
#include "crypto_int8.h"
/* parameters: 1 <= w <= 14; n = 2^w */
/* input: permutation pi of {0,1,...,n-1} */
/* output: (2m-1)n/2 control bits at positions pos,pos+step,... */
/* output position pos is by definition 1&(out[pos/8]>>(pos&7)) */
/* caller must 0-initialize positions first */
/* temp must have space for int32[2*n] */
static void cbrecursion(unsigned char *out,long long pos,long long step,const int16 *pi,long long w,long long n,int32 *temp)
{
#define A temp
#define B (temp+n)
#define q ((int16 *) (temp+n+n/4))
/* q can start anywhere between temp+n and temp+n/2 */
long long x,i,j,k;
if (w == 1) {
out[pos>>3] ^= pi[0]<<(pos&7);
return;
}
for (x = 0;x < n;++x) A[x] = ((pi[x]^1)<<16)|pi[x^1];
crypto_sort_int32(A,n); /* A = (id<<16)+pibar */
for (x = 0;x < n;++x) {
int32 Ax = A[x];
int32 px = Ax&0xffff;
int32 cx = int32_min(px,x);
B[x] = (px<<16)|cx;
}
/* B = (p<<16)+c */
for (x = 0;x < n;++x) A[x] = (A[x]<<16)|x; /* A = (pibar<<16)+id */
crypto_sort_int32(A,n); /* A = (id<<16)+pibar^-1 */
for (x = 0;x < n;++x) A[x] = (A[x]<<16)+(B[x]>>16); /* A = (pibar^(-1)<<16)+pibar */
crypto_sort_int32(A,n); /* A = (id<<16)+pibar^2 */
if (w <= 10) {
for (x = 0;x < n;++x) B[x] = ((A[x]&0xffff)<<10)|(B[x]&0x3ff);
for (i = 1;i < w-1;++i) {
/* B = (p<<10)+c */
for (x = 0;x < n;++x) A[x] = ((B[x]&~0x3ff)<<6)|x; /* A = (p<<16)+id */
crypto_sort_int32(A,n); /* A = (id<<16)+p^{-1} */
for (x = 0;x < n;++x) A[x] = (A[x]<<20)|B[x]; /* A = (p^{-1}<<20)+(p<<10)+c */
crypto_sort_int32(A,n); /* A = (id<<20)+(pp<<10)+cp */
for (x = 0;x < n;++x) {
int32 ppcpx = A[x]&0xfffff;
int32 ppcx = (A[x]&0xffc00)|(B[x]&0x3ff);
B[x] = int32_min(ppcx,ppcpx);
}
}
for (x = 0;x < n;++x) B[x] &= 0x3ff;
} else {
for (x = 0;x < n;++x) B[x] = (A[x]<<16)|(B[x]&0xffff);
for (i = 1;i < w-1;++i) {
/* B = (p<<16)+c */
for (x = 0;x < n;++x) A[x] = (B[x]&~0xffff)|x;
crypto_sort_int32(A,n); /* A = (id<<16)+p^(-1) */
for (x = 0;x < n;++x) A[x] = (A[x]<<16)|(B[x]&0xffff);
/* A = p^(-1)<<16+c */
if (i < w-2) {
for (x = 0;x < n;++x) B[x] = (A[x]&~0xffff)|(B[x]>>16);
/* B = (p^(-1)<<16)+p */
crypto_sort_int32(B,n); /* B = (id<<16)+p^(-2) */
for (x = 0;x < n;++x) B[x] = (B[x]<<16)|(A[x]&0xffff);
/* B = (p^(-2)<<16)+c */
}
crypto_sort_int32(A,n);
/* A = id<<16+cp */
for (x = 0;x < n;++x) {
int32 cpx = (B[x]&~0xffff)|(A[x]&0xffff);
B[x] = int32_min(B[x],cpx);
}
}
for (x = 0;x < n;++x) B[x] &= 0xffff;
}
for (x = 0;x < n;++x) A[x] = (((int32)pi[x])<<16)+x;
crypto_sort_int32(A,n); /* A = (id<<16)+pi^(-1) */
for (j = 0;j < n/2;++j) {
long long x = 2*j;
int32 fj = crypto_int32_bottombit_01(B[x]); /* f[j] */
int32 Fx = x+fj; /* F[x] */
int32 Fx1 = Fx^1; /* F[x+1] */
out[pos>>3] ^= fj<<(pos&7);
pos += step;
B[x] = (A[x]<<16)|Fx;
B[x+1] = (A[x+1]<<16)|Fx1;
}
/* B = (pi^(-1)<<16)+F */
crypto_sort_int32(B,n); /* B = (id<<16)+F(pi) */
pos += (2*w-3)*step*(n/2);
for (k = 0;k < n/2;++k) {
long long y = 2*k;
int32 lk = crypto_int32_bottombit_01(B[y]); /* l[k] */
int32 Ly = y+lk; /* L[y] */
int32 Ly1 = Ly^1; /* L[y+1] */
out[pos>>3] ^= lk<<(pos&7);
pos += step;
A[y] = (Ly<<16)|(B[y]&0xffff);
A[y+1] = (Ly1<<16)|(B[y+1]&0xffff);
}
/* A = (L<<16)+F(pi) */
crypto_sort_int32(A,n); /* A = (id<<16)+F(pi(L)) = (id<<16)+M */
pos -= (2*w-2)*step*(n/2);
for (j = 0;j < n/2;++j) {
q[j] = (A[2*j]&0xffff)>>1;
q[j+n/2] = (A[2*j+1]&0xffff)>>1;
}
cbrecursion(out,pos,step*2,q,w-1,n/2,temp);
cbrecursion(out,pos+step,step*2,q+n/2,w-1,n/2,temp);
}
/* input: p, an array of int16 */
/* input: n, length of p */
/* input: s, meaning that stride-2^s cswaps are performed */
/* input: cb, the control bits */
/* output: the result of apply the control bits to p */
static void layer(int16_t *p, const unsigned char *cb, int s, int n)
{
int i, j;
int stride = 1 << s;
int index = 0;
int16_t d;
for (i = 0; i < n; i += stride*2)
{
for (j = 0; j < stride; j++)
{
d = p[ i+j ] ^ p[ i+j+stride ];
d &= crypto_int8_bitmod_mask(cb[ index >> 3 ], index);
p[ i+j ] ^= d;
p[ i+j+stride ] ^= d;
index++;
}
}
}
/* parameters: 1 <= w <= 14; n = 2^w */
/* input: permutation pi of {0,1,...,n-1} */
/* output: (2m-1)n/2 control bits at positions 0,1,... */
/* output position pos is by definition 1&(out[pos/8]>>(pos&7)) */
void controlbitsfrompermutation(unsigned char *out,const int16 *pi,long long w,long long n)
{
int32 temp[2*n];
int16 pi_test[n], diff;
int i;
unsigned char *ptr;
while (1)
{
memset(out,0,(((2*w-1)*n/2)+7)/8);
cbrecursion(out,0,1,pi,w,n,temp);
// check for correctness
for (i = 0; i < n; i++)
pi_test[i] = i;
ptr = out;
for (i = 0; i < w; i++)
{
layer(pi_test, ptr, i, n);
ptr += n >> 4;
}
for (i = w-2; i >= 0; i--)
{
layer(pi_test, ptr, i, n);
ptr += n >> 4;
}
diff = 0;
for (i = 0; i < n; i++)
diff |= pi[i] ^ pi_test[i];
diff = crypto_int16_nonzero_mask(diff);
crypto_declassify(&diff,sizeof diff);
if (diff == 0)
break;
}
}