-rw-r--r-- 2542 libmceliece-20240726/crypto_xof/bitwrite16/256x4/write.c raw
// 20240504 djb
//
// Bit-writing XOF, 256-bit vector variant.
//
// The input m is consumed as a sequence of 2-byte position values. The
// output h is first cleared and then, for every position, one bit of h is
// set. Every 32-byte word of the output is visited for every position and
// the position only influences mask values, never an address or a branch;
// presumably this is to keep timing and memory access independent of m.
//
// NOTE(review): the crypto_int64 / crypto_uint64 / crypto_uint16 typedefs
// and the helpers crypto_uint16_load and crypto_uint64_shlmod come from the
// project headers included below and are not visible here; comments about
// them are assumptions to be confirmed against those headers.
// NOTE(review): & | ~ are applied directly to __m128i / __m256i values,
// which relies on compiler vector-operator support (GCC/Clang extension).

#include <string.h>
#include <immintrin.h>
#include "crypto_int64.h"
#include "crypto_uint64.h"
#include "crypto_uint16.h"
#include "crypto_xof.h"

// Branch-free equality test, broadcast to a 256-bit mask.
//
// Returns a vector whose four 64-bit lanes are all-ones when x == y and
// all-zeros otherwise.
//
// How it works: x^y is 0 exactly when x == y. Subtracting 1 makes the
// value negative only in that case (assumes crypto_uint16 holds at most
// 16 bits, so x^y - 1 is otherwise in 0..65534). The right shift then
// spreads the sign: -1 when equal, 0 when different (assumes crypto_int64
// is a signed 64-bit type and >> on it is an arithmetic shift -- confirm).
static inline __m256i vec256_equal_mask(crypto_uint16 x,crypto_uint16 y)
{
  crypto_int64 mask = x^y;
  mask -= 1;
  // Any shift count above 16 clears the low bits of a non-negative value
  // here; 37 is an arbitrary such count, chosen by the author to make it
  // harder for "smart" compilers to recognise the idiom and rewrite it.
  mask >>= 37; // above 16; try to dodge "smart" compilers
  return _mm256_set1_epi64x(mask);
}

// Builds a 256-bit vector with a single bit set, selected by the low
// 8 bits of pos (i.e. 1 << (pos mod 256), seen as a 256-bit integer).
//
// Bits 0-5 of pos pick the bit inside a 64-bit lane, bit 6 picks the lane
// inside a 128-bit half, and bit 7 picks the 128-bit half. All selection
// is done with masks, not branches.
static inline __m256i vec256_1shlmod(crypto_uint16 pos)
{
  // presumably 1 << (pos mod 64); crypto_uint64_shlmod is defined in
  // crypto_uint64.h -- confirm
  crypto_uint64 word64 = crypto_uint64_shlmod(1,pos);
  // all-ones if bit 6 of pos is set, else zero
  crypto_uint64 mask6 = -(1&(pos>>6));
  // _mm_set_epi64x takes (high lane, low lane): word64 goes to the high
  // lane when bit 6 is set and to the low lane otherwise
  __m128i word128 = _mm_set_epi64x(word64&mask6,word64&~mask6);
  // all-ones in both lanes if bit 7 of pos is set, else zero
  __m128i mask7 = _mm_set1_epi64x(-(1&(pos>>7)));
  // _mm256_set_m128i takes (high half, low half): word128 goes to the high
  // half when bit 7 is set and to the low half otherwise
  __m256i word256 = _mm256_set_m128i(word128&mask7,word128&~mask7);
  return word256;
}

// Fills h[0..hlen-1] with the bit pattern selected by the positions in m.
//
//   h, hlen: output buffer and its length in bytes. The caller in this
//            file passes 0 < hlen <= 8192. Since pos>>8 is at most 255 for
//            a 16-bit pos, only the first 256 32-byte words (8192 bytes)
//            could ever receive a bit.
//   m, mlen: input bytes, read two at a time as position values via
//            crypto_uint16_load (byte order is decided by that helper --
//            confirm). A trailing odd byte is ignored.
//
// The output is handled as `words` full 32-byte words plus an optional
// partial word of hlen - 32*words bytes. The partial word is accumulated
// in the register hlast and copied out at the end, so no vector access
// goes past h + 32*words.
static void atmost8192(
  unsigned char *h,long long hlen,
  const unsigned char *m,long long mlen
)
{
  long long i;
  long long words = hlen>>5; // number of complete 32-byte words in h
  __m256i hlast = _mm256_set1_epi64x(0); // accumulator for the partial word

  // clear all complete words; the partial tail is overwritten by the
  // final memcpy
  for (i = 0;i < 32*words;++i) h[i] = 0;

  while (mlen >= 2) {
    crypto_uint16 pos = crypto_uint16_load(m);
    // single-bit vector for the bit inside the target word (pos mod 256);
    // pos>>8 is the index of the target word
    __m256i word256 = vec256_1shlmod(pos);

    // Main loop: four words per iteration. pos>>10 selects the group of
    // four, bits 9 and 8 select the word inside the group.
    for (i = 0;i + 4 <= words;i += 4) {
      __m256i hi0 = _mm256_loadu_si256((__m256i *) (h+32*i));
      __m256i hi1 = _mm256_loadu_si256((__m256i *) (h+32*i+32));
      __m256i hi2 = _mm256_loadu_si256((__m256i *) (h+32*i+64));
      __m256i hi3 = _mm256_loadu_si256((__m256i *) (h+32*i+96));
      // word256 if this group is the target group, zero otherwise
      __m256i base = word256 & vec256_equal_mask(i>>2,pos>>10);
      __m256i mask9 = _mm256_set1_epi64x(-(1 & (pos>>9)));
      __m256i base90 = base & ~mask9; // bit 9 clear: words 0 and 1
      __m256i base91 = base & mask9;  // bit 9 set:   words 2 and 3
      __m256i mask8 = _mm256_set1_epi64x(-(1 & (pos>>8)));
      // at most one of the four values below is non-zero
      hi0 |= base90 & ~mask8;
      hi1 |= base90 & mask8;
      hi2 |= base91 & ~mask8;
      hi3 |= base91 & mask8;
      _mm256_storeu_si256((__m256i *) (h+32*i),hi0);
      _mm256_storeu_si256((__m256i *) (h+32*i+32),hi1);
      _mm256_storeu_si256((__m256i *) (h+32*i+64),hi2);
      _mm256_storeu_si256((__m256i *) (h+32*i+96),hi3);
    }
    // Leftover complete words (fewer than four): one word per iteration,
    // compared directly against the word index pos>>8.
    for (;i < words;++i) {
      __m256i hi = _mm256_loadu_si256((__m256i *) (h+32*i));
      hi |= word256 & vec256_equal_mask(i,pos>>8);
      _mm256_storeu_si256((__m256i *) (h+32*i),hi);
    }
    // Partial word at index `words`, kept in a register. When hlen is
    // 8192, words is 256 and can never equal pos>>8, so hlast stays zero.
    hlast |= word256 & vec256_equal_mask(words,pos>>8);

    m += 2;
    mlen -= 2;
  }

  // write out the leading hlen - 32*words bytes (0..31 for hlen >= 0) of
  // the partial word
  memcpy(h+32*words,&hlast,hlen-32*words);
}

// Public entry point: writes hlen bytes of output to h, derived from the
// mlen bytes at m.
//
// The output is produced in chunks of at most 8192 bytes. Every chunk is
// computed from the same (m, mlen), so each complete 8192-byte chunk
// receives the same contents. Nothing is written when hlen <= 0.
void crypto_xof(
  unsigned char *h,long long hlen,
  const unsigned char *m,long long mlen
)
{
  while (hlen > 8192) {
    atmost8192(h,8192,m,mlen);
    h += 8192;
    hlen -= 8192;
  }
  if (hlen > 0) atmost8192(h,hlen,m,mlen);
}