-rw-r--r-- 5568 libmceliece-20240726/crypto_kem/348864/avx/benes.c
/*
  This file is for Benes network related functions

  For the implementation strategy, see
  https://eprint.iacr.org/2017/793.pdf
*/
// 20221231 djb: remove unused counter increment; tnx thom wiggers
// 20221230 djb: add linker lines

// linker define load_bits benes
// linker use transpose_64x64_asm

#include "util.h"
#include "transpose.h"
#include "params.h"
#include "benes.h"

/* layer_0 ... layer_5: one Benes-network layer each; layer_i performs       */
/* constant-time conditional swaps between words at distance 2^i in the      */
/* 64-word bit-sliced state, masked bit-by-bit by the control words in cond. */

static void layer_0(uint64_t *bs, uint64_t *cond)
{
	int x;
	uint64_t diff;

	for (x = 0; x < (1 << 6); x += 2)
	{
		diff = bs[ x ] ^ bs[ x+1 ];
		diff &= *cond++;
		bs[ x ] ^= diff;
		bs[ x+1 ] ^= diff;
	}
}

static void layer_1(uint64_t *bs, uint64_t *cond)
{
	int x;
	uint64_t diff;

	for (x = 0; x < (1 << 6); x += 4)
	{
		diff = bs[ x+0 ] ^ bs[ x+2 ];
		diff &= cond[0];
		bs[ x+0 ] ^= diff;
		bs[ x+2 ] ^= diff;

		diff = bs[ x+1 ] ^ bs[ x+3 ];
		diff &= cond[1];
		bs[ x+1 ] ^= diff;
		bs[ x+3 ] ^= diff;

		cond += 2;
	}
}

static void layer_2(uint64_t *bs, uint64_t *cond)
{
	int x;
	uint64_t diff;

	for (x = 0; x < (1 << 6); x += 8)
	{
		diff = bs[ x+0 ] ^ bs[ x+4 ];
		diff &= cond[0];
		bs[ x+0 ] ^= diff;
		bs[ x+4 ] ^= diff;

		diff = bs[ x+1 ] ^ bs[ x+5 ];
		diff &= cond[1];
		bs[ x+1 ] ^= diff;
		bs[ x+5 ] ^= diff;

		diff = bs[ x+2 ] ^ bs[ x+6 ];
		diff &= cond[2];
		bs[ x+2 ] ^= diff;
		bs[ x+6 ] ^= diff;

		diff = bs[ x+3 ] ^ bs[ x+7 ];
		diff &= cond[3];
		bs[ x+3 ] ^= diff;
		bs[ x+7 ] ^= diff;

		cond += 4;
	}
}

static void layer_3(uint64_t *bs, uint64_t *cond)
{
	int x, s;
	uint64_t diff;

	for (x = 0; x < (1 << 6); x += 16)
	for (s = x; s < x + 8; s += 4)
	{
		diff = bs[ s+0 ] ^ bs[ s+8 ];
		diff &= cond[0];
		bs[ s+0 ] ^= diff;
		bs[ s+8 ] ^= diff;

		diff = bs[ s+1 ] ^ bs[ s+9 ];
		diff &= cond[1];
		bs[ s+1 ] ^= diff;
		bs[ s+9 ] ^= diff;

		diff = bs[ s+2 ] ^ bs[ s+10 ];
		diff &= cond[2];
		bs[ s+2 ] ^= diff;
		bs[ s+10 ] ^= diff;

		diff = bs[ s+3 ] ^ bs[ s+11 ];
		diff &= cond[3];
		bs[ s+3 ] ^= diff;
		bs[ s+11 ] ^= diff;

		cond += 4;
	}
}

static void layer_4(uint64_t *bs, uint64_t *cond)
{
	int x, s;
	uint64_t diff;

	for (x = 0; x < (1 << 6); x += 32)
	for (s = x; s < x + 16; s += 4)
	{
		diff = bs[ s+0 ] ^ bs[ s+16 ];
		diff &= cond[0];
		bs[ s+0 ] ^= diff;
		bs[ s+16 ] ^= diff;

		diff = bs[ s+1 ] ^ bs[ s+17 ];
		diff &= cond[1];
		bs[ s+1 ] ^= diff;
		bs[ s+17 ] ^= diff;

		diff = bs[ s+2 ] ^ bs[ s+18 ];
		diff &= cond[2];
		bs[ s+2 ] ^= diff;
		bs[ s+18 ] ^= diff;

		diff = bs[ s+3 ] ^ bs[ s+19 ];
		diff &= cond[3];
		bs[ s+3 ] ^= diff;
		bs[ s+19 ] ^= diff;

		cond += 4;
	}
}

static void layer_5(uint64_t *bs, uint64_t *cond)
{
	int x, s;
	uint64_t diff;

	for (x = 0; x < (1 << 6); x += 64)
	for (s = x; s < x + 32; s += 4)
	{
		diff = bs[ s+0 ] ^ bs[ s+32 ];
		diff &= cond[0];
		bs[ s+0 ] ^= diff;
		bs[ s+32 ] ^= diff;

		diff = bs[ s+1 ] ^ bs[ s+33 ];
		diff &= cond[1];
		bs[ s+1 ] ^= diff;
		bs[ s+33 ] ^= diff;

		diff = bs[ s+2 ] ^ bs[ s+34 ];
		diff &= cond[2];
		bs[ s+2 ] ^= diff;
		bs[ s+34 ] ^= diff;

		diff = bs[ s+3 ] ^ bs[ s+35 ];
		diff &= cond[3];
		bs[ s+3 ] ^= diff;
		bs[ s+35 ] ^= diff;

		cond += 4;
	}
}

/* input: bits, control bits as array of bytes */
/* output: out, control bits as array of 64-bit vectors */
void load_bits(uint64_t out[][32], const unsigned char * bits)
{
	int i, low, block = 0;

	uint64_t cond[64];

	//

	for (low = 0; low <= 5; low++)
	{
		for (i = 0; i < 64; i++) cond[i] = load4(bits + block*256 + i*4);
		transpose_64x64(cond);

		for (i = 0; i < 32; i++) out[ block ][i] = cond[i];
		block++;
	}

	for (low = 0; low <= 5; low++)
	{
		for (i = 0; i < 32; i++) out[ block ][i] = load8(bits + block*256 + i*8);
		block++;
	}

	for (low = 4; low >= 0; low--)
	{
		for (i = 0; i < 32; i++) out[ block ][i] = load8(bits + block*256 + i*8);
		block++;
	}

	for (low = 5; low >= 0; low--)
	{
		for (i = 0; i < 64; i++) cond[i] = load4(bits + block*256 + i*4);
		transpose_64x64(cond);

		for (i = 0; i < 32; i++) out[ block ][i] = cond[i];
		block++;
	}
}

/* input: r, sequence of bits to be permuted */
/*        cond, control bits as array of 64-bit vectors */
/*        rev, 0 for normal application; !0 for inverse */
/* output: r, permuted bits */
void benes(uint64_t * r, uint64_t cond[][32], int rev)
{
	int block, inc;

	uint64_t *bs = r;

	//

	if (rev == 0) {block =  0; inc =  1;}
	else          {block = 22; inc = -1;}

	transpose_64x64(bs);

	layer_0(bs, cond[ block ]); block += inc;
	layer_1(bs, cond[ block ]); block += inc;
	layer_2(bs, cond[ block ]); block += inc;
	layer_3(bs, cond[ block ]); block += inc;
	layer_4(bs, cond[ block ]); block += inc;
	layer_5(bs, cond[ block ]); block += inc;

	transpose_64x64(bs);

	layer_0(bs, cond[ block ]); block += inc;
	layer_1(bs, cond[ block ]); block += inc;
	layer_2(bs, cond[ block ]); block += inc;
	layer_3(bs, cond[ block ]); block += inc;
	layer_4(bs, cond[ block ]); block += inc;
	layer_5(bs, cond[ block ]); block += inc;
	layer_4(bs, cond[ block ]); block += inc;
	layer_3(bs, cond[ block ]); block += inc;
	layer_2(bs, cond[ block ]); block += inc;
	layer_1(bs, cond[ block ]); block += inc;
	layer_0(bs, cond[ block ]); block += inc;

	transpose_64x64(bs);

	layer_5(bs, cond[ block ]); block += inc;
	layer_4(bs, cond[ block ]); block += inc;
	layer_3(bs, cond[ block ]); block += inc;
	layer_2(bs, cond[ block ]); block += inc;
	layer_1(bs, cond[ block ]); block += inc;
	layer_0(bs, cond[ block ]);

	transpose_64x64(bs);
}
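Every layer_* function above is built from the same masked conditional-swap idiom. The following standalone program (not part of benes.c; the name cswap_masked is invented for this sketch) shows the idiom in isolation: for each bit position where the control mask is 1, the corresponding bits of a and b are exchanged, with no secret-dependent branch.

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

/* Illustrative sketch, not library code: the masked conditional swap used */
/* inside each layer_* loop above. Bits of a and b are exchanged exactly   */
/* where mask has a 1 bit; where mask is 0 they are left unchanged.        */
static void cswap_masked(uint64_t *a, uint64_t *b, uint64_t mask)
{
	uint64_t diff = (*a ^ *b) & mask;
	*a ^= diff;
	*b ^= diff;
}

int main(void)
{
	uint64_t a = 0x00000000FFFFFFFFULL;
	uint64_t b = 0xFFFFFFFF00000000ULL;
	uint64_t mask = 0x0F0F0F0F0F0F0F0FULL; /* swap only the low nibble of each byte */

	cswap_masked(&a, &b, mask);
	printf("a = %016" PRIx64 "\n", a);
	printf("b = %016" PRIx64 "\n", b);
	return 0;
}

In the layer functions the mask comes from the precomputed control words, so the fixed sequence of layers applies exactly the permutation encoded by those control bits while executing the same instructions regardless of their values.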
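For orientation, here is a hedged sketch of how load_bits and benes fit together, inferred only from the loops above: load_bits consumes 23 blocks of 256 control-bit bytes (23*256 = 5888 bytes) and produces 23 rows of 32 condition words, which benes then applies to 64 words of bit-sliced data, with rev selecting forward or inverse application. The function name permute_example, the array bounds, and the assumption that benes.h declares both entry points are mine, not taken from the library headers.

#include <stdint.h>

#include "benes.h"   /* assumed to declare load_bits() and benes() */

/* Hypothetical caller; sizes are inferred from the loops above, not params.h. */
void permute_example(uint64_t r[64], const unsigned char bits[23 * 256])
{
	uint64_t cond[23][32];

	load_bits(cond, bits);   /* unpack and transpose the packed control bits */

	benes(r, cond, 0);       /* rev == 0: apply the permutation              */
	benes(r, cond, 1);       /* rev != 0: apply the inverse, restoring r     */
}

Keeping the unpacked control bits in 64-bit words means every swap in the network costs only a few full-word XORs and ANDs, which is what makes the bit-sliced implementation both fast and constant-time.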