-rw-r--r-- 6180 libntruprime-20240910/crypto_encode/1277x7879/avx/encode.c raw
/* auto-generated; do not edit */ #include <immintrin.h> #include "crypto_encode.h" #include "crypto_int16.h" #include "crypto_uint16.h" #include "crypto_uint32.h" #define int16 crypto_int16 #define uint16 crypto_uint16 #define uint32 crypto_uint32 void crypto_encode(unsigned char *out,const void *v) { const int16 *R0 = v; /* XXX: caller could overlap R with input */ uint16 R[639]; long i; const uint16 *reading; uint16 *writing; uint16 r0,r1; uint32 r2; reading = (uint16 *) R0; writing = R; i = 40; while (i > 0) { __m256i x,x2,y,y2; --i; if (!i) { reading -= 4; writing -= 2; out -= 4; } x = _mm256_loadu_si256((__m256i *) (reading+0)); x2 = _mm256_loadu_si256((__m256i *) (reading+16)); x = _mm256_add_epi16(x,_mm256_set1_epi16(3939)); x2 = _mm256_add_epi16(x2,_mm256_set1_epi16(3939)); x &= _mm256_set1_epi16(16383); x2 &= _mm256_set1_epi16(16383); y = x & _mm256_set1_epi32(65535); y2 = x2 & _mm256_set1_epi32(65535); x = _mm256_srli_epi32(x,16); x2 = _mm256_srli_epi32(x2,16); x = _mm256_mullo_epi32(x,_mm256_set1_epi32(7879)); x2 = _mm256_mullo_epi32(x2,_mm256_set1_epi32(7879)); x = _mm256_add_epi32(y,x); x2 = _mm256_add_epi32(y2,x2); x = _mm256_shuffle_epi8(x,_mm256_set_epi8( 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0 )); x2 = _mm256_shuffle_epi8(x2,_mm256_set_epi8( 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0 )); x = _mm256_permute4x64_epi64(x,0xd8); x2 = _mm256_permute4x64_epi64(x2,0xd8); _mm256_storeu_si256((__m256i *) writing,_mm256_permute2f128_si256(x,x2,0x31)); _mm256_storeu_si256((__m256i *) out,_mm256_permute2f128_si256(x,x2,0x20)); reading += 32; writing += 16; out += 32; } R[638] = ((R0[1276]+3939)&16383); reading = (uint16 *) R; writing = R; i = 40; while (i > 0) { __m256i x,y; --i; if (!i) { reading -= 2; writing -= 1; out -= 1; } x = _mm256_loadu_si256((__m256i *) reading); y = x & _mm256_set1_epi32(65535); x = _mm256_srli_epi32(x,16); x = _mm256_mullo_epi32(x,_mm256_set1_epi32(948)); x = _mm256_add_epi32(y,x); x = _mm256_shuffle_epi8(x,_mm256_set_epi8( 12,8,4,0,12,8,4,0,14,13,10,9,6,5,2,1, 12,8,4,0,12,8,4,0,14,13,10,9,6,5,2,1 )); x = _mm256_permute4x64_epi64(x,0xd8); _mm_storeu_si128((__m128i *) writing,_mm256_extractf128_si256(x,0)); *((uint32 *) (out+0)) = _mm256_extract_epi32(x,4); *((uint32 *) (out+4)) = _mm256_extract_epi32(x,6); reading += 16; writing += 8; out += 8; } R[319] = R[638]; reading = (uint16 *) R; writing = R; i = 10; while (i > 0) { __m256i x,x2,y,y2; --i; x = _mm256_loadu_si256((__m256i *) (reading+0)); x2 = _mm256_loadu_si256((__m256i *) (reading+16)); y = x & _mm256_set1_epi32(65535); y2 = x2 & _mm256_set1_epi32(65535); x = _mm256_srli_epi32(x,16); x2 = _mm256_srli_epi32(x2,16); x = _mm256_mullo_epi32(x,_mm256_set1_epi32(3511)); x2 = _mm256_mullo_epi32(x2,_mm256_set1_epi32(3511)); x = _mm256_add_epi32(y,x); x2 = _mm256_add_epi32(y2,x2); x = _mm256_shuffle_epi8(x,_mm256_set_epi8( 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0 )); x2 = _mm256_shuffle_epi8(x2,_mm256_set_epi8( 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0 )); x = _mm256_permute4x64_epi64(x,0xd8); x2 = _mm256_permute4x64_epi64(x2,0xd8); _mm256_storeu_si256((__m256i *) writing,_mm256_permute2f128_si256(x,x2,0x31)); _mm256_storeu_si256((__m256i *) out,_mm256_permute2f128_si256(x,x2,0x20)); reading += 32; writing += 16; out += 32; } reading = (uint16 *) R; writing = R; i = 10; while (i > 0) { __m256i x,y; --i; x = _mm256_loadu_si256((__m256i *) reading); y = x & _mm256_set1_epi32(65535); x = _mm256_srli_epi32(x,16); x = _mm256_mullo_epi32(x,_mm256_set1_epi32(189)); x = _mm256_add_epi32(y,x); x = _mm256_shuffle_epi8(x,_mm256_set_epi8( 12,8,4,0,12,8,4,0,14,13,10,9,6,5,2,1, 12,8,4,0,12,8,4,0,14,13,10,9,6,5,2,1 )); x = _mm256_permute4x64_epi64(x,0xd8); _mm_storeu_si128((__m128i *) writing,_mm256_extractf128_si256(x,0)); *((uint32 *) (out+0)) = _mm256_extract_epi32(x,4); *((uint32 *) (out+4)) = _mm256_extract_epi32(x,6); reading += 16; writing += 8; out += 8; } reading = (uint16 *) R; writing = R; i = 5; while (i > 0) { __m256i x,y; --i; x = _mm256_loadu_si256((__m256i *) reading); y = x & _mm256_set1_epi32(65535); x = _mm256_srli_epi32(x,16); x = _mm256_mullo_epi32(x,_mm256_set1_epi32(140)); x = _mm256_add_epi32(y,x); x = _mm256_shuffle_epi8(x,_mm256_set_epi8( 12,8,4,0,12,8,4,0,14,13,10,9,6,5,2,1, 12,8,4,0,12,8,4,0,14,13,10,9,6,5,2,1 )); x = _mm256_permute4x64_epi64(x,0xd8); _mm_storeu_si128((__m128i *) writing,_mm256_extractf128_si256(x,0)); *((uint32 *) (out+0)) = _mm256_extract_epi32(x,4); *((uint32 *) (out+4)) = _mm256_extract_epi32(x,6); reading += 16; writing += 8; out += 8; } for (i = 0;i < 20;++i) { r0 = R[2*i]; r1 = R[2*i+1]; r2 = r0+r1*(uint32)77; R[i] = r2; } for (i = 0;i < 10;++i) { r0 = R[2*i]; r1 = R[2*i+1]; r2 = r0+r1*(uint32)5929; *out++ = r2; r2 >>= 8; *out++ = r2; r2 >>= 8; R[i] = r2; } for (i = 0;i < 5;++i) { r0 = R[2*i]; r1 = R[2*i+1]; r2 = r0+r1*(uint32)537; *out++ = r2; r2 >>= 8; R[i] = r2; } for (i = 0;i < 2;++i) { r0 = R[2*i]; r1 = R[2*i+1]; r2 = r0+r1*(uint32)1127; *out++ = r2; r2 >>= 8; R[i] = r2; } R[2] = R[4]; r0 = R[0]; r1 = R[1]; r2 = r0+r1*(uint32)4962; *out++ = r2; r2 >>= 8; *out++ = r2; r2 >>= 8; R[0] = r2; R[1] = R[2]; r0 = R[0]; r1 = R[1]; r2 = r0+r1*(uint32)376; *out++ = r2; r2 >>= 8; R[0] = r2; r0 = R[0]; *out++ = r0; r0 >>= 8; *out++ = r0; r0 >>= 8; }