/* -rw-r--r-- 14324 libntruprime-20240825/crypto_decode/1013x7177/avx/decode.c raw */
/* auto-generated; do not edit */
/* 20240812 djb: more cryptoint usage */
#include <immintrin.h>
#include "crypto_decode.h"
#include "crypto_int16.h"
#include "crypto_int32.h"
#define int16 crypto_int16
#define int32 crypto_int32
/*
 * Scalar low-half multiply: x*y is evaluated after the usual integer
 * promotions and the result is converted back to int16.
 * (int16 is crypto_int16; presumably a 16-bit signed type, so this keeps
 * the low 16 bits of the product -- confirm against crypto_int16.h.)
 */
static inline int16 mullo(int16 x,int16 y)
{
  int16 low = x*y;
  return low;
}
/*
 * Scalar high-half multiply: the product x*y is formed in int32, shifted
 * right by 16 bits, and the result is converted to int16.
 */
static inline int16 mulhi(int16 x,int16 y)
{
  int32 product = x;
  product *= y;
  product >>= 16;
  return product;
}
/* Lane-wise sum of the sixteen 16-bit lanes of x and y. */
static inline __m256i add(__m256i x,__m256i y)
{
  const __m256i sum = _mm256_add_epi16(x,y);
  return sum;
}
/* Lane-wise difference x-y over the sixteen 16-bit lanes. */
static inline __m256i sub(__m256i x,__m256i y)
{
  const __m256i diff = _mm256_sub_epi16(x,y);
  return diff;
}
/* Shift every 16-bit lane of x left by y bits. */
static inline __m256i shiftleftconst(__m256i x,int16 y)
{
  const __m256i shifted = _mm256_slli_epi16(x,y);
  return shifted;
}
/* Arithmetic (sign-propagating) right shift of every 16-bit lane of x by y bits. */
static inline __m256i signedshiftrightconst(__m256i x,int16 y)
{
  const __m256i shifted = _mm256_srai_epi16(x,y);
  return shifted;
}
/* Subtract the scalar y from every 16-bit lane of x. */
static inline __m256i subconst(__m256i x,int16 y)
{
  const __m256i splat = _mm256_set1_epi16(y);
  return _mm256_sub_epi16(x,splat);
}
/* Multiply every 16-bit lane of x by the scalar y, keeping the low 16 bits of each product. */
static inline __m256i mulloconst(__m256i x,int16 y)
{
  const __m256i splat = _mm256_set1_epi16(y);
  return _mm256_mullo_epi16(x,splat);
}
/* Multiply every signed 16-bit lane of x by the scalar y, keeping the high 16 bits of each 32-bit product. */
static inline __m256i mulhiconst(__m256i x,int16 y)
{
  const __m256i splat = _mm256_set1_epi16(y);
  return _mm256_mulhi_epi16(x,splat);
}
/*
 * Branch-free conditional subtraction: every signed 16-bit lane of x that
 * is greater than y-1 has y subtracted; all other lanes pass through.
 */
static inline __m256i ifgesubconst(__m256i x,int16 y)
{
  const __m256i limit = _mm256_set1_epi16(y-1);
  /* all-ones in lanes where x > y-1, zero elsewhere */
  const __m256i selected = _mm256_cmpgt_epi16(x,limit);
  const __m256i delta = _mm256_and_si256(selected,_mm256_set1_epi16(y));
  return _mm256_sub_epi16(x,delta);
}
/*
 * Branch-free conditional addition: every negative 16-bit lane of x has y
 * added; non-negative lanes pass through.
 */
static inline __m256i ifnegaddconst(__m256i x,int16 y)
{
  /* arithmetic shift by 15 spreads the sign bit: all-ones for negative lanes */
  const __m256i negative = _mm256_srai_epi16(x,15);
  const __m256i delta = _mm256_and_si256(negative,_mm256_set1_epi16(y));
  return _mm256_add_epi16(x,delta);
}
/*
 * Decode the byte string s into 1013 int16 values stored at v[0..1012].
 *
 * v: output buffer, used as an int16 array; entries 0..1012 are written.
 * s: input bytes. The pointer is first advanced by crypto_decode_STRBYTES
 *    and the data is then consumed from the end toward the start; the
 *    decrements below add up to 1623 bytes (presumably the value of
 *    crypto_decode_STRBYTES -- confirm against crypto_decode.h).
 *
 * The work proceeds through levels R10, R9, ..., R1, R0. Each step
 * "Rk ------> Rk-1" turns every value a2 of level k (combined with zero,
 * one or two input bytes) into two values of level k-1:
 *   a0: the combined value reduced into 0...m-1 (see the range comments),
 *   a1: (combined value - a0) multiplied by a constant that is the inverse
 *       modulo 2^16 of the odd part of m (e.g. 91*12243 = 17*2^16+1); any
 *       power-of-two factor of m is removed beforehand by a right shift.
 * Afterwards a1 gets a single conditional subtraction of the modulus named
 * in the "invalid inputs might need reduction" comment.
 * The last entry of a level can use a different modulus for a1 (the
 * "n*[m]+[m']" notation in the stage comments) and is handled separately
 * before the loop or vector loop of that stage.
 *
 * The trailing "lo...hi" comments appear to be generator-computed bounds on
 * the value just assigned.
 */
void crypto_decode(void *v,const unsigned char *s)
{
int16 *R0 = v;
/* Scratch storage for levels 1..10; level k holds roughly half as many values as level k-1. */
int16 R1[507],R2[254],R3[127],R4[64],R5[32],R6[16],R7[8],R8[4],R9[2],R10[1];
long long i;
int16 a0,a1,a2;
__m256i A0,A1,A2,S0,S1,B0,B1,C0,C1;
/* Start one past the last input byte; everything below reads backwards. */
s += crypto_decode_STRBYTES;
/* Top level: fold the final two bytes into the single value R10[0] in 0...273. */
a1 = 0;
a1 += *--s; /* 0...255 */
a1 = mulhi(a1,-78)-mulhi(mullo(a1,4305),274);
a1 += *--s; /* -137...391 */
a1 -= 274; /* -411...117 */
a1 += 274&crypto_int16_negative_mask(a1); /* -137...273 */
a1 += 274&crypto_int16_negative_mask(a1); /* 0...273 */
R10[0] = a1;
/* R10 ------> R9: reconstruct mod 1*[91]+[769] */
i = 0;
s -= 1;
a2 = a0 = R10[0];
a0 = mulhi(a0,1)-mulhi(mullo(a0,12243),91); /* -46...45 */
a0 += s[1*i+0]; /* -46...300 */
a0 = mulhi(a0,16)-mulhi(mullo(a0,-720),91); /* -46...45 */
a0 += 91&crypto_int16_negative_mask(a0); /* 0...90 */
/* (256*a2 + byte - a0) times the inverse of 91 modulo 2^16 */
a1 = (a2<<8)+s[i]-a0;
a1 = mullo(a1,12243);
/* invalid inputs might need reduction mod 769 */
a1 -= 769;
a1 += 769&crypto_int16_negative_mask(a1);
R9[0] = a0;
R9[1] = a1;
s -= 0;
/* R9 ------> R8: reconstruct mod 3*[152]+[1294] */
/* Last pair (R8[2],R8[3]) first: its a1 is reduced by 1294 instead of 152. */
i = 0;
s -= 1;
a2 = a0 = R9[1];
a0 = mulhi(a0,64)-mulhi(mullo(a0,20696),152); /* -76...92 */
a0 += s[1*i+0]; /* -76...347 */
a0 = mulhi(a0,24)-mulhi(mullo(a0,-431),152); /* -77...76 */
a0 += 152&crypto_int16_negative_mask(a0); /* 0...151 */
/* 152 = 8*19: shift out the factor 8, then multiply by the inverse of 19 modulo 2^16 */
a1 = (a2<<5)+((s[i]-a0)>>3);
a1 = mullo(a1,-13797);
/* invalid inputs might need reduction mod 1294 */
a1 -= 1294;
a1 += 1294&crypto_int16_negative_mask(a1);
R8[2] = a0;
R8[3] = a1;
s -= 1;
/* Remaining pair (single iteration, i = 0): R9[0] -> R8[0],R8[1]. */
for (i = 0;i >= 0;--i) {
a2 = a0 = R9[i];
a0 = mulhi(a0,64)-mulhi(mullo(a0,20696),152); /* -76...92 */
a0 += s[1*i+0]; /* -76...347 */
a0 = mulhi(a0,24)-mulhi(mullo(a0,-431),152); /* -77...76 */
a0 += 152&crypto_int16_negative_mask(a0); /* 0...151 */
a1 = (a2<<5)+((s[i]-a0)>>3);
a1 = mullo(a1,-13797);
/* invalid inputs might need reduction mod 152 */
a1 -= 152;
a1 += 152&crypto_int16_negative_mask(a1);
R8[2*i] = a0;
R8[2*i+1] = a1;
}
/* R8 ------> R7: reconstruct mod 7*[197]+[1681] */
/* Last pair (R7[6],R7[7]) first: its a1 is reduced by 1681. */
i = 0;
s -= 1;
a2 = a0 = R8[3];
a0 = mulhi(a0,-92)-mulhi(mullo(a0,-19628),197); /* -122...98 */
a0 += s[1*i+0]; /* -122...353 */
a0 -= 197; /* -319..>156 */
a0 += 197&crypto_int16_negative_mask(a0); /* -122...196 */
a0 += 197&crypto_int16_negative_mask(a0); /* 0...196 */
a1 = (a2<<8)+s[i]-a0;
a1 = mullo(a1,32269);
/* invalid inputs might need reduction mod 1681 */
a1 -= 1681;
a1 += 1681&crypto_int16_negative_mask(a1);
R7[6] = a0;
R7[7] = a1;
s -= 3;
/* Pairs 2,1,0: one input byte each. */
for (i = 2;i >= 0;--i) {
a2 = a0 = R8[i];
a0 = mulhi(a0,-92)-mulhi(mullo(a0,-19628),197); /* -122...98 */
a0 += s[1*i+0]; /* -122...353 */
a0 -= 197; /* -319..>156 */
a0 += 197&crypto_int16_negative_mask(a0); /* -122...196 */
a0 += 197&crypto_int16_negative_mask(a0); /* 0...196 */
a1 = (a2<<8)+s[i]-a0;
a1 = mullo(a1,32269);
/* invalid inputs might need reduction mod 197 */
a1 -= 197;
a1 += 197&crypto_int16_negative_mask(a1);
R7[2*i] = a0;
R7[2*i+1] = a1;
}
/* R7 ------> R6: reconstruct mod 15*[3586]+[120] */
/* Last pair (R6[14],R6[15]) first: one input byte, a1 reduced by 120. */
i = 0;
s -= 1;
a2 = a0 = R7[7];
a0 = mulhi(a0,-1678)-mulhi(mullo(a0,-4679),3586); /* -2213...1793 */
a0 += s[1*i+0]; /* -2213...2048 */
a0 += 3586&crypto_int16_negative_mask(a0); /* 0...3585 */
/* 3586 = 2*1793: shift out the factor 2, then multiply by the inverse of 1793 modulo 2^16 */
a1 = (a2<<7)+((s[i]-a0)>>1);
a1 = mullo(a1,-1791);
/* invalid inputs might need reduction mod 120 */
a1 -= 120;
a1 += 120&crypto_int16_negative_mask(a1);
R6[14] = a0;
R6[15] = a1;
s -= 14;
/* Pairs 6..0: two input bytes each (s[2*i+1] is folded in first, then s[2*i]). */
for (i = 6;i >= 0;--i) {
a2 = a0 = R7[i];
a0 = mulhi(a0,-1678)-mulhi(mullo(a0,-4679),3586); /* -2213...1793 */
a0 += s[2*i+1]; /* -2213...2048 */
a0 = mulhi(a0,-1678)-mulhi(mullo(a0,-4679),3586); /* -1846...1849 */
a0 += s[2*i+0]; /* -1846...2104 */
a0 += 3586&crypto_int16_negative_mask(a0); /* 0...3585 */
a1 = (a2<<15)+(s[2*i+1]<<7)+((s[2*i]-a0)>>1);
a1 = mullo(a1,-1791);
/* invalid inputs might need reduction mod 3586 */
a1 -= 3586;
a1 += 3586&crypto_int16_negative_mask(a1);
R6[2*i] = a0;
R6[2*i+1] = a1;
}
/* R6 ------> R5: reconstruct mod 31*[958]+[8200] */
/* Last pair (R5[30],R5[31]) first: two input bytes, a1 reduced by 8200. */
i = 0;
s -= 2;
a2 = a0 = R6[15];
a0 = mulhi(a0,-238)-mulhi(mullo(a0,-17513),958); /* -539...479 */
a0 += s[2*i+1]; /* -539...734 */
a0 = mulhi(a0,-238)-mulhi(mullo(a0,-17513),958); /* -482...480 */
a0 += s[2*i+0]; /* -482...735 */
a0 += 958&crypto_int16_negative_mask(a0); /* 0...957 */
/* 958 = 2*479: shift out the factor 2, then multiply by the inverse of 479 modulo 2^16 */
a1 = (a2<<15)+(s[2*i+1]<<7)+((s[2*i]-a0)>>1);
a1 = mullo(a1,-1505);
/* invalid inputs might need reduction mod 8200 */
a1 -= 8200;
a1 += 8200&crypto_int16_negative_mask(a1);
R5[30] = a0;
R5[31] = a1;
s -= 15;
/* Pairs 14..0: one input byte each. */
for (i = 14;i >= 0;--i) {
a2 = a0 = R6[i];
a0 = mulhi(a0,-238)-mulhi(mullo(a0,-17513),958); /* -539...479 */
a0 += s[1*i+0]; /* -539...734 */
a0 += 958&crypto_int16_negative_mask(a0); /* 0...957 */
a1 = (a2<<7)+((s[i]-a0)>>1);
a1 = mullo(a1,-1505);
/* invalid inputs might need reduction mod 958 */
a1 -= 958;
a1 += 958&crypto_int16_negative_mask(a1);
R5[2*i] = a0;
R5[2*i+1] = a1;
}
/* R5 ------> R4: reconstruct mod 63*[7921]+[265] */
/* Last pair (R4[62],R4[63]) in scalar code: one input byte, a1 reduced by 265. */
i = 0;
s -= 1;
a2 = a0 = R5[31];
a0 = mulhi(a0,538)-mulhi(mullo(a0,-2118),7921); /* -3961...4095 */
a0 += s[1*i+0]; /* -3961...4350 */
a0 += 7921&crypto_int16_negative_mask(a0); /* 0...7920 */
a1 = (a2<<8)+s[i]-a0;
a1 = mullo(a1,4625);
/* invalid inputs might need reduction mod 265 */
a1 -= 265;
a1 += 265&crypto_int16_negative_mask(a1);
R4[62] = a0;
R4[63] = a1;
s -= 62;
/*
 * Vector loop: each pass handles 16 consecutive inputs R5[i..i+15] and
 * writes 32 outputs R4[2*i..2*i+31]. The first pass starts at i = 15 so
 * that it ends at R5[30]; the index update at the bottom then yields 0,
 * so the two passes overlap at R5[15] and its outputs are stored twice.
 */
i = 15;
for (;;) {
A2 = A0 = _mm256_loadu_si256((__m256i *) &R5[i]);
/* 32 input bytes = 16 little-endian byte pairs: S0 gets the low bytes, S1 the high bytes */
S0 = _mm256_loadu_si256((__m256i *) (s+2*i));
S1 = _mm256_srli_epi16(S0,8);
S0 &= _mm256_set1_epi16(255);
A0 = sub(mulhiconst(A0,538),mulhiconst(mulloconst(A0,-2118),7921)); /* -3961...4095 */
A0 = add(A0,S1); /* -3961...4350 */
A0 = sub(mulhiconst(A0,538),mulhiconst(mulloconst(A0,-2118),7921)); /* -3994...3996 */
A0 = add(A0,S0); /* -3994...4251 */
A0 = ifnegaddconst(A0,7921); /* 0...7920 */
/* A2 does not appear here: its contribution would be shifted left by 16 and vanish in 16-bit lanes */
A1 = add(shiftleftconst(S1,8),sub(S0,A0));
A1 = mulloconst(A1,4625);
/* invalid inputs might need reduction mod 7921 */
A1 = ifgesubconst(A1,7921);
/* Interleave A0 (even-index outputs) with A1 (odd-index outputs) into memory order. */
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */
B0 = _mm256_unpacklo_epi16(A0,A1);
B1 = _mm256_unpackhi_epi16(A0,A1);
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */
C0 = _mm256_permute2x128_si256(B0,B1,0x20);
C1 = _mm256_permute2x128_si256(B0,B1,0x31);
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */
_mm256_storeu_si256((__m256i *) (&R4[2*i]),C0);
_mm256_storeu_si256((__m256i *) (16+&R4[2*i]),C1);
if (!i) break;
/* next i = (i rounded up to a multiple of 16) - 16 */
i = -16-((~15)&-i);
}
/* R4 ------> R3: reconstruct mod 126*[89]+[265] */
/* This stage consumes no input bytes; the last value is copied through unchanged. */
R3[126] = R4[63];
s -= 0;
/* Vector loop over R4[i..i+15] -> R3[2*i..2*i+31]; i takes the values 47, 32, 16, 0. */
i = 47;
for (;;) {
A2 = A0 = _mm256_loadu_si256((__m256i *) &R4[i]);
A0 = sub(mulhiconst(A0,32),mulhiconst(mulloconst(A0,-736),89)); /* -45...52 */
A0 = ifnegaddconst(A0,89); /* 0...88 */
/* shift count 0: the modulus 89 is odd, so no power of two has to be removed */
A1 = signedshiftrightconst(sub(A2,A0),0);
A1 = mulloconst(A1,18409);
/* invalid inputs might need reduction mod 89 */
A1 = ifgesubconst(A1,89);
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */
B0 = _mm256_unpacklo_epi16(A0,A1);
B1 = _mm256_unpackhi_epi16(A0,A1);
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */
C0 = _mm256_permute2x128_si256(B0,B1,0x20);
C1 = _mm256_permute2x128_si256(B0,B1,0x31);
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */
_mm256_storeu_si256((__m256i *) (&R3[2*i]),C0);
_mm256_storeu_si256((__m256i *) (16+&R3[2*i]),C1);
if (!i) break;
/* next i = (i rounded up to a multiple of 16) - 16 */
i = -16-((~15)&-i);
}
/* R3 ------> R2: reconstruct mod 253*[2414]+[7177] */
/* Last pair (R2[252],R2[253]) in scalar code: two input bytes, a1 reduced by 7177. */
i = 0;
s -= 2;
a2 = a0 = R3[126];
a0 = mulhi(a0,-84)-mulhi(mullo(a0,-6950),2414); /* -1228...1207 */
a0 += s[2*i+1]; /* -1228...1462 */
a0 = mulhi(a0,-84)-mulhi(mullo(a0,-6950),2414); /* -1209...1208 */
a0 += s[2*i+0]; /* -1209...1463 */
a0 += 2414&crypto_int16_negative_mask(a0); /* 0...2413 */
/* 2414 = 2*1207: shift out the factor 2, then multiply by the inverse of 1207 modulo 2^16 */
a1 = (a2<<15)+(s[2*i+1]<<7)+((s[2*i]-a0)>>1);
a1 = mullo(a1,6407);
/* invalid inputs might need reduction mod 7177 */
a1 -= 7177;
a1 += 7177&crypto_int16_negative_mask(a1);
R2[252] = a0;
R2[253] = a1;
s -= 252;
/* Vector loop over R3[i..i+15] -> R2[2*i..2*i+31], two input bytes per value; i = 110, 96, 80, ..., 0. */
i = 110;
for (;;) {
A2 = A0 = _mm256_loadu_si256((__m256i *) &R3[i]);
S0 = _mm256_loadu_si256((__m256i *) (s+2*i));
S1 = _mm256_srli_epi16(S0,8);
S0 &= _mm256_set1_epi16(255);
A0 = sub(mulhiconst(A0,-84),mulhiconst(mulloconst(A0,-6950),2414)); /* -1228...1207 */
A0 = add(A0,S1); /* -1228...1462 */
A0 = sub(mulhiconst(A0,-84),mulhiconst(mulloconst(A0,-6950),2414)); /* -1209...1208 */
A0 = add(A0,S0); /* -1209...1463 */
A0 = ifnegaddconst(A0,2414); /* 0...2413 */
A1 = add(add(shiftleftconst(A2,15),shiftleftconst(S1,7)),signedshiftrightconst(sub(S0,A0),1));
A1 = mulloconst(A1,6407);
/* invalid inputs might need reduction mod 2414 */
A1 = ifgesubconst(A1,2414);
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */
B0 = _mm256_unpacklo_epi16(A0,A1);
B1 = _mm256_unpackhi_epi16(A0,A1);
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */
C0 = _mm256_permute2x128_si256(B0,B1,0x20);
C1 = _mm256_permute2x128_si256(B0,B1,0x31);
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */
_mm256_storeu_si256((__m256i *) (&R2[2*i]),C0);
_mm256_storeu_si256((__m256i *) (16+&R2[2*i]),C1);
if (!i) break;
/* next i = (i rounded up to a multiple of 16) - 16 */
i = -16-((~15)&-i);
}
/* R2 ------> R1: reconstruct mod 506*[786]+[7177] */
/* The last value is copied through unchanged; the other 253 use one input byte each. */
R1[506] = R2[253];
s -= 253;
/* Vector loop over R2[i..i+15] -> R1[2*i..2*i+31]; i = 237, 224, 208, ..., 0. */
i = 237;
for (;;) {
A2 = A0 = _mm256_loadu_si256((__m256i *) &R2[i]);
/* 16 input bytes, zero-extended to one 16-bit lane each */
S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s+i)));
A0 = sub(mulhiconst(A0,46),mulhiconst(mulloconst(A0,-21345),786)); /* -393...404 */
A0 = add(A0,S0); /* -393...659 */
A0 = ifnegaddconst(A0,786); /* 0...785 */
/* 786 = 2*393: shift out the factor 2, then multiply by the inverse of 393 modulo 2^16 */
A1 = add(shiftleftconst(A2,7),signedshiftrightconst(sub(S0,A0),1));
A1 = mulloconst(A1,-15175);
/* invalid inputs might need reduction mod 786 */
A1 = ifgesubconst(A1,786);
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */
B0 = _mm256_unpacklo_epi16(A0,A1);
B1 = _mm256_unpackhi_epi16(A0,A1);
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */
C0 = _mm256_permute2x128_si256(B0,B1,0x20);
C1 = _mm256_permute2x128_si256(B0,B1,0x31);
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */
_mm256_storeu_si256((__m256i *) (&R1[2*i]),C0);
_mm256_storeu_si256((__m256i *) (16+&R1[2*i]),C1);
if (!i) break;
/* next i = (i rounded up to a multiple of 16) - 16 */
i = -16-((~15)&-i);
}
/* R1 ------> R0: reconstruct mod 1013*[7177] */
/* Final level: every output has 3588 subtracted before it is stored in R0. */
R0[1012] = R1[506]-3588;
s -= 1012;
/* Vector loop over R1[i..i+15] -> R0[2*i..2*i+31], two input bytes per value; i = 490, 480, 464, ..., 0. */
i = 490;
for (;;) {
A2 = A0 = _mm256_loadu_si256((__m256i *) &R1[i]);
S0 = _mm256_loadu_si256((__m256i *) (s+2*i));
S1 = _mm256_srli_epi16(S0,8);
S0 &= _mm256_set1_epi16(255);
A0 = sub(mulhiconst(A0,-2610),mulhiconst(mulloconst(A0,-2338),7177)); /* -4241...3588 */
A0 = add(A0,S1); /* -4241...3843 */
A0 = sub(mulhiconst(A0,-2610),mulhiconst(mulloconst(A0,-2338),7177)); /* -3742...3757 */
A0 = add(A0,S0); /* -3742...4012 */
A0 = ifnegaddconst(A0,7177); /* 0...7176 */
A1 = add(shiftleftconst(S1,8),sub(S0,A0));
A1 = mulloconst(A1,12857);
/* invalid inputs might need reduction mod 7177 */
A1 = ifgesubconst(A1,7177);
/* shift both halves down by 3588 before storing */
A0 = subconst(A0,3588);
A1 = subconst(A1,3588);
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */
B0 = _mm256_unpacklo_epi16(A0,A1);
B1 = _mm256_unpackhi_epi16(A0,A1);
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */
C0 = _mm256_permute2x128_si256(B0,B1,0x20);
C1 = _mm256_permute2x128_si256(B0,B1,0x31);
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */
_mm256_storeu_si256((__m256i *) (&R0[2*i]),C0);
_mm256_storeu_si256((__m256i *) (16+&R0[2*i]),C1);
if (!i) break;
/* next i = (i rounded up to a multiple of 16) - 16 */
i = -16-((~15)&-i);
}
}