// libntruprime-20240825/src/core/multsntrupP/avx/ntt.c
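// Overview (added for exposition; the generated code below is authoritative):
// this file implements forward and inverse size-512 NTTs modulo q = 7681 and
// q = 10753 (entry points ntt512_7681, ntt512_10753, invntt512_7681,
// invntt512_10753), vectorized over 16-bit lanes with AVX2. The qdata_7681 and
// qdata_10753 tables share one layout, addressed through the precomp_* macros:
// Montgomery-scaled powers of roots of unity (precomp_*, scaledzeta_*), the
// same values premultiplied by q^-1 mod 2^16 (qinvprecomp_*,
// qinvscaledzeta_*), the modulus q_x16, and the rounding constant
// qround32_x16 used by reduce_x16.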
// linker define ntt512_7681 // linker define ntt512_10753 // linker define invntt512_7681 // linker define invntt512_10753 #include "ntt.h" // auto-generated; do not edit #include <immintrin.h> #define _mm256_permute2x128_si256_lo(f0,f1) _mm256_permute2x128_si256(f0,f1,0x20) #define _mm256_permute2x128_si256_hi(f0,f1) _mm256_permute2x128_si256(f0,f1,0x31) #define int16x16 __m256i typedef int16_t int16; typedef int32_t int32; static const int16 __attribute((aligned(32))) qdata_7681[] = { #define precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+0) -3593,-3593,-3593,-3593,-3625,-3625,-3625,-3625,-3593,-3593,-3593,-3593,-3625,-3625,-3625,-3625, #define precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+16) -3777,-3777,-3777,-3777,3182,3182,3182,3182,-3777,-3777,-3777,-3777,3182,3182,3182,3182, #define precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+32) -3593,-3593,-3593,-3593,-3182,-3182,-3182,-3182,-3593,-3593,-3593,-3593,-3182,-3182,-3182,-3182, #define precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+48) 3777,3777,3777,3777,3625,3625,3625,3625,3777,3777,3777,3777,3625,3625,3625,3625, #define precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+64) -3593,-3593,-3593,-3593,2194,2194,2194,2194,-3593,-3593,-3593,-3593,2194,2194,2194,2194, #define precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+80) -3625,-3625,-3625,-3625,-1100,-1100,-1100,-1100,-3625,-3625,-3625,-3625,-1100,-1100,-1100,-1100, #define precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+96) -3593,-3593,-3593,-3593,3696,3696,3696,3696,-3593,-3593,-3593,-3593,3696,3696,3696,3696, #define precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+112) -3182,-3182,-3182,-3182,-2456,-2456,-2456,-2456,-3182,-3182,-3182,-3182,-2456,-2456,-2456,-2456, #define precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+128) -3593,1701,2194,834,-3625,2319,-1100,121,-3593,1701,2194,834,-3625,2319,-1100,121, #define precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+144) -3777,1414,2456,2495,3182,2876,-3696,2250,-3777,1414,2456,2495,3182,2876,-3696,2250, #define precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+160) -3593,-2250,3696,-2876,-3182,-2495,-2456,-1414,-3593,-2250,3696,-2876,-3182,-2495,-2456,-1414, #define precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+176) 3777,-121,1100,-2319,3625,-834,-2194,-1701,3777,-121,1100,-2319,3625,-834,-2194,-1701, #define precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+192) -3593,3364,1701,-1599,2194,2557,834,-2816,-3593,3364,1701,-1599,2194,2557,834,-2816, #define precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+208) -3625,617,2319,2006,-1100,-1296,121,1986,-3625,617,2319,2006,-1100,-1296,121,1986, #define precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+224) -3593,2237,-2250,-1483,3696,3706,-2876,1921,-3593,2237,-2250,-1483,3696,3706,-2876,1921, #define precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+240) -3182,2088,-2495,-1525,-2456,1993,-1414,2830,-3182,2088,-2495,-1525,-2456,1993,-1414,2830, #define precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+256) -3593,514,3364,438,1701,2555,-1599,-1738,2194,103,2557,1881,834,-549,-2816,638, #define precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 
*(const int16x16 *)(qdata+272) -3625,-1399,617,-1760,2319,2535,2006,3266,-1100,-1431,-1296,3174,121,3153,1986,-810, #define precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+288) -3777,2956,-2830,-679,1414,2440,-1993,-3689,2456,2804,1525,3555,2495,1535,-2088,-7, #define precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+304) 3182,-1321,-1921,-1305,2876,-3772,-3706,3600,-3696,-2043,1483,-396,2250,-2310,-2237,1887, #define precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+320) -3593,-1887,2237,2310,-2250,396,-1483,2043,3696,-3600,3706,3772,-2876,1305,1921,1321, #define precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+336) -3182,7,2088,-1535,-2495,-3555,-1525,-2804,-2456,3689,1993,-2440,-1414,679,2830,-2956, #define precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+352) 3777,810,-1986,-3153,-121,-3174,1296,1431,1100,-3266,-2006,-2535,-2319,1760,-617,1399, #define precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+368) 3625,-638,2816,549,-834,-1881,-2557,-103,-2194,1738,1599,-2555,-1701,-438,-3364,-514, #define precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+384) -3593,-1532,514,-373,3364,-3816,438,-3456,1701,783,2555,2883,-1599,727,-1738,-2385, #define precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+400) 2194,-2160,103,-2391,2557,2762,1881,-2426,834,3310,-549,-1350,-2816,1386,638,-194, #define precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+416) -3625,404,-1399,-3692,617,-2764,-1760,-1054,2319,1799,2535,-3588,2006,1533,3266,2113, #define precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+432) -1100,-2579,-1431,-1756,-1296,1598,3174,-2,121,-3480,3153,-2572,1986,2743,-810,2919, #define precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+448) -3593,2789,-1887,-921,2237,-1497,2310,-2133,-2250,-915,396,1390,-1483,3135,2043,-859, #define precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+464) 3696,2732,-3600,-1464,3706,2224,3772,-2665,-2876,1698,1305,2835,1921,730,1321,486, #define precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+480) -3182,3417,7,-3428,2088,-3145,-1535,1168,-2495,-3831,-3555,-3750,-1525,660,-2804,2649, #define precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+496) -2456,3405,3689,-1521,1993,1681,-2440,1056,-1414,1166,679,-2233,2830,2175,-2956,-1919, #define precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+512) -3593,-1404,-1532,451,514,-402,-373,1278,3364,-509,-3816,-3770,438,-2345,-3456,-226, #define precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+528) 1701,-1689,783,-1509,2555,2963,2883,1242,-1599,1669,727,2719,-1738,642,-2385,-436, #define precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+544) 2194,3335,-2160,1779,103,3745,-2391,17,2557,2812,2762,-1144,1881,83,-2426,-1181, #define precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+560) 834,-1519,3310,3568,-549,-796,-1350,2072,-2816,-2460,1386,2891,638,-2083,-194,-715, #define precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+576) 
-3593,-402,-3816,-226,2555,1669,-2385,1779,2557,83,3310,2072,638,1012,-3692,1295, #define precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+592) 2319,-3208,1533,-2071,-1431,-2005,-2,1586,1986,-293,1919,-929,-679,777,-1681,-3461, #define precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+608) 2456,3366,3750,-1203,1535,-3657,-3417,-1712,-1921,2515,2665,-1070,3600,2532,-3135,-2589, #define precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+624) 2250,-2258,921,-658,-514,509,3456,1509,1599,-642,2160,-17,-1881,1519,1350,-2891, #define precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+640) -3593,-3434,-1497,893,396,-2422,-859,2965,3706,-2339,1698,-2937,1321,-670,-3428,-3163, #define precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+656) -2495,-1072,660,1084,3689,-179,1056,-1338,2830,2786,-2919,-3677,-3153,-151,-1598,3334, #define precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+672) 1100,-3314,3588,2262,1760,-2230,-404,2083,2816,-3568,2426,-2812,-103,436,-727,-2963, #define precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+688) -1701,3770,373,1404,1887,-1649,2133,-826,1483,434,-2732,3287,-3772,-2378,-2835,3723, #define precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+704) -3593,658,2789,370,-1887,-3434,-921,-3752,2237,1649,-1497,2258,2310,3581,-2133,893, #define precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+720) -2250,3794,-915,826,396,2589,1390,592,-1483,-2422,3135,3214,2043,-434,-859,-2532, #define precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+736) 3696,1121,2732,2965,-3600,2998,-1464,-3287,3706,1070,2224,-589,3772,-2339,-2665,2070, #define precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+752) -2876,2378,1698,-2515,1305,-2815,2835,-2937,1921,-1348,730,-3723,1321,1712,486,2130, #define q_x16 *(const int16x16 *)(qdata+768) 7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,7681, #define qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+784) -9,-9,-9,-9,-16425,-16425,-16425,-16425,-9,-9,-9,-9,-16425,-16425,-16425,-16425, #define qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+800) -28865,-28865,-28865,-28865,10350,10350,10350,10350,-28865,-28865,-28865,-28865,10350,10350,10350,10350, #define qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+816) -9,-9,-9,-9,-10350,-10350,-10350,-10350,-9,-9,-9,-9,-10350,-10350,-10350,-10350, #define qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+832) 28865,28865,28865,28865,16425,16425,16425,16425,28865,28865,28865,28865,16425,16425,16425,16425, #define qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+848) -9,-9,-9,-9,-4974,-4974,-4974,-4974,-9,-9,-9,-9,-4974,-4974,-4974,-4974, #define qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+864) -16425,-16425,-16425,-16425,-7244,-7244,-7244,-7244,-16425,-16425,-16425,-16425,-7244,-7244,-7244,-7244, #define qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+880) -9,-9,-9,-9,-4496,-4496,-4496,-4496,-9,-9,-9,-9,-4496,-4496,-4496,-4496, #define qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 
*)(qdata+896) -10350,-10350,-10350,-10350,-14744,-14744,-14744,-14744,-10350,-10350,-10350,-10350,-14744,-14744,-14744,-14744, #define qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+912) -9,-20315,-4974,18242,-16425,18191,-7244,-11655,-9,-20315,-4974,18242,-16425,18191,-7244,-11655, #define qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+928) -28865,20870,14744,-22593,10350,828,4496,23754,-28865,20870,14744,-22593,10350,828,4496,23754, #define qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+944) -9,-23754,-4496,-828,-10350,22593,-14744,-20870,-9,-23754,-4496,-828,-10350,22593,-14744,-20870, #define qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+960) 28865,11655,7244,-18191,16425,-18242,4974,20315,28865,11655,7244,-18191,16425,-18242,4974,20315, #define qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+976) -9,-10972,-20315,23489,-4974,25597,18242,-2816,-9,-10972,-20315,23489,-4974,25597,18242,-2816, #define qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+992) -16425,-19351,18191,-3114,-7244,-9488,-11655,19394,-16425,-19351,18191,-3114,-7244,-9488,-11655,19394, #define qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+1008) -9,-7491,-23754,-15307,-4496,-15750,-828,-5759,-9,-7491,-23754,-15307,-4496,-15750,-828,-5759, #define qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1024) -10350,22568,22593,-20469,-14744,31177,-20870,26382,-10350,22568,22593,-20469,-14744,31177,-20870,26382, #define qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1040) -9,-14846,-10972,-21066,-20315,-24581,23489,-23242,-4974,-4505,25597,-26279,18242,21467,-2816,15998, #define qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1056) -16425,-4983,-19351,14624,18191,-2073,-3114,20674,-7244,-21399,-9488,6246,-11655,-29103,19394,-5930, #define qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1072) -28865,-23668,-26382,-28839,20870,6536,-31177,16279,14744,29428,20469,29667,-22593,9215,-22568,-11783, #define qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1088) 10350,-14121,5759,-5913,828,-1724,15750,11792,4496,25093,15307,26228,23754,-21766,7491,-6817, #define qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1104) -9,6817,-7491,21766,-23754,-26228,-15307,-25093,-4496,-11792,-15750,1724,-828,5913,-5759,14121, #define qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1120) -10350,11783,22568,-9215,22593,-29667,-20469,-29428,-14744,-16279,31177,-6536,-20870,28839,26382,23668, #define qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1136) 28865,5930,-19394,29103,11655,-6246,9488,21399,7244,-20674,3114,2073,-18191,-14624,19351,4983, #define qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1152) 16425,-15998,2816,-21467,-18242,26279,-25597,4505,4974,23242,-23489,24581,20315,21066,10972,14846, #define qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1168) -9,-32252,-14846,-19317,-10972,8472,-21066,-3456,-20315,16655,-24581,12611,23489,-12073,-23242,29871, #define 
qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1184) -4974,6032,-4505,10409,25597,24266,-26279,17030,18242,10478,21467,11962,-2816,-26262,15998,-17602, #define qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1200) -16425,-22124,-4983,-26220,-19351,-8908,14624,32738,18191,13575,-2073,27132,-3114,24573,20674,27201, #define qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1216) -7244,12269,-21399,-16092,-9488,-15810,6246,15358,-11655,-15768,-29103,24052,19394,-26441,-5930,-1689, #define qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1232) -9,13541,6817,-5529,-7491,26663,21766,-4693,-23754,13933,-26228,8558,-15307,-21953,-25093,-22875, #define qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1248) -4496,-7508,-11792,-30136,-15750,26800,1724,17303,-828,2722,5913,-12013,-5759,30426,14121,3558, #define qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1264) -10350,-24743,11783,-21860,22568,-32329,-9215,9360,22593,-7415,-29667,25946,-20469,-21868,-29428,-25511, #define qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1280) -14744,1869,-16279,14351,31177,2193,-6536,17440,-20870,24718,28839,-23225,26382,9855,23668,-9599, #define qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1296) -9,-32124,-32252,10179,-14846,6766,-19317,16638,-10972,-23549,8472,-17082,-21066,-15145,-3456,31518, #define qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1312) -20315,-6297,16655,-12261,-24581,-11885,12611,30938,23489,28805,-12073,26783,-23242,-14718,29871,5708, #define qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1328) -4974,15111,6032,-29453,-4505,12449,10409,529,25597,-32004,24266,2952,-26279,18003,17030,24931, #define qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1344) 18242,-1007,10478,-4624,21467,17636,11962,14360,-2816,15972,-26262,16715,15998,4573,-17602,-14539, #define qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1360) -9,6766,8472,31518,-24581,28805,29871,-29453,25597,18003,10478,14360,15998,27636,-26220,17167, #define qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1376) 18191,-7304,24573,-22039,-21399,-4565,15358,10802,19394,21723,9599,-9633,-28839,-2807,-2193,-30597, #define qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1392) 14744,-26330,-25946,-2739,9215,32695,24743,-26288,5759,20435,-17303,24530,11792,20964,21953,23523, #define qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1408) 23754,-27858,5529,6510,14846,23549,3456,12261,-23489,14718,-6032,-529,26279,1007,-11962,-16715, #define qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1424) -9,24214,26663,23933,-26228,-13686,-22875,-27243,-15750,4317,2722,8839,14121,-32414,-21860,-25179, #define qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1440) 22593,-25648,-21868,-964,-16279,-1715,17440,-14650,26382,-28958,1689,-10333,29103,-20119,15810,22790, #define qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 
*)(qdata+1456) 7244,20238,-27132,-2858,-14624,19274,22124,-4573,2816,4624,-17030,32004,4505,-5708,12073,11885, #define qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1472) 20315,17082,19317,32124,-6817,14223,4693,-14138,15307,9650,7508,-9513,-1724,-23882,12013,-15221, #define qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1488) -9,-6510,13541,-23182,6817,24214,-5529,-24232,-7491,-14223,26663,27858,21766,26621,-4693,23933, #define qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1504) -23754,29394,13933,14138,-26228,-23523,8558,-23984,-15307,-13686,-21953,26766,-25093,-9650,-22875,-20964, #define qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1520) -4496,-22943,-7508,-27243,-11792,-18506,-30136,9513,-15750,-24530,26800,947,1724,4317,17303,29718, #define qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1536) -828,23882,2722,-20435,5913,-10495,-12013,8839,-5759,-3396,30426,15221,14121,26288,3558,27730, #define qinvscaledzeta_x16_4_1 *(const int16x16 *)(qdata+1552) -28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865, #define qinvscaledzeta_x16_4_3 *(const int16x16 *)(qdata+1568) 28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,28865, #define qinvscaledzeta_x16_8_1 *(const int16x16 *)(qdata+1584) -16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425, #define qinvscaledzeta_x16_8_7 *(const int16x16 *)(qdata+1600) -10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350, #define qround32_x16 *(const int16x16 *)(qdata+1616) 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, #define scaledzeta_x16_4_1 *(const int16x16 *)(qdata+1632) -3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777, #define scaledzeta_x16_4_3 *(const int16x16 *)(qdata+1648) 3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,3777, #define scaledzeta_x16_8_1 *(const int16x16 *)(qdata+1664) -3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625, #define scaledzeta_x16_8_7 *(const int16x16 *)(qdata+1680) -3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182, } ; static const int16 __attribute((aligned(32))) qdata_10753[] = { // precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 1018,1018,1018,1018,3688,3688,3688,3688,1018,1018,1018,1018,3688,3688,3688,3688, // precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 -223,-223,-223,-223,-4188,-4188,-4188,-4188,-223,-223,-223,-223,-4188,-4188,-4188,-4188, // precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 1018,1018,1018,1018,4188,4188,4188,4188,1018,1018,1018,1018,4188,4188,4188,4188, // precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 223,223,223,223,-3688,-3688,-3688,-3688,223,223,223,223,-3688,-3688,-3688,-3688, // precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 1018,1018,1018,1018,-376,-376,-376,-376,1018,1018,1018,1018,-376,-376,-376,-376, // precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 3688,3688,3688,3688,-3686,-3686,-3686,-3686,3688,3688,3688,3688,-3686,-3686,-3686,-3686, // precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 1018,1018,1018,1018,-2413,-2413,-2413,-2413,1018,1018,1018,1018,-2413,-2413,-2413,-2413, // 
precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 4188,4188,4188,4188,-357,-357,-357,-357,4188,4188,4188,4188,-357,-357,-357,-357, // precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 1018,-3364,-376,4855,3688,425,-3686,2695,1018,-3364,-376,4855,3688,425,-3686,2695, // precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 -223,-3784,357,-2236,-4188,4544,2413,730,-223,-3784,357,-2236,-4188,4544,2413,730, // precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 1018,-730,-2413,-4544,4188,2236,-357,3784,1018,-730,-2413,-4544,4188,2236,-357,3784, // precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 223,-2695,3686,-425,-3688,-4855,376,3364,223,-2695,3686,-425,-3688,-4855,376,3364, // precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 1018,-5175,-3364,2503,-376,1341,4855,-4875,1018,-5175,-3364,2503,-376,1341,4855,-4875, // precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 3688,-2629,425,-4347,-3686,3823,2695,-4035,3688,-2629,425,-4347,-3686,3823,2695,-4035, // precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 1018,5063,-730,341,-2413,-3012,-4544,-5213,1018,5063,-730,341,-2413,-3012,-4544,-5213, // precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 4188,1520,2236,1931,-357,918,3784,4095,4188,1520,2236,1931,-357,918,3784,4095, // precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018,3085,-5175,2982,-3364,-4744,2503,-4129,-376,-2576,1341,-193,4855,3062,-4875,4, // precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 3688,2388,-2629,-4513,425,4742,-4347,2935,-3686,-544,3823,-2178,2695,847,-4035,268, // precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -223,-1299,-4095,-1287,-3784,-4876,-918,3091,357,-4189,-1931,4616,-2236,2984,-1520,-3550, // precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -4188,-1009,5213,-205,4544,-4102,3012,2790,2413,-1085,-341,-2565,730,-4379,-5063,-1284, // precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018,1284,5063,4379,-730,2565,341,1085,-2413,-2790,-3012,4102,-4544,205,-5213,1009, // precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 4188,3550,1520,-2984,2236,-4616,1931,4189,-357,-3091,918,4876,3784,1287,4095,1299, // precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 223,-268,4035,-847,-2695,2178,-3823,544,3686,-2935,4347,-4742,-425,4513,2629,-2388, // precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -3688,-4,4875,-3062,-4855,193,-1341,2576,376,4129,-2503,4744,3364,-2982,5175,-3085, // precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018,5116,3085,-3615,-5175,400,2982,3198,-3364,2234,-4744,-4828,2503,326,-4129,-512, // precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -376,1068,-2576,-4580,1341,3169,-193,-2998,4855,-635,3062,-4808,-4875,-2740,4,675, // precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 3688,-1324,2388,5114,-2629,5294,-4513,-794,425,-864,4742,-886,-4347,336,2935,-2045, // precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -3686,-3715,-544,4977,3823,-2737,-2178,3441,2695,467,847,454,-4035,-779,268,2213, // precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018,1615,1284,2206,5063,5064,4379,472,-730,-5341,2565,-4286,341,2981,1085,-1268, // precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -2413,-3057,-2790,-2884,-3012,-1356,4102,-3337,-4544,5023,205,-636,-5213,909,1009,-2973, // precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 4188,2271,3550,-1572,1520,1841,-2984,970,2236,-4734,-4616,578,1931,-116,4189,1586, // precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 
-357,-2774,-3091,-1006,918,-5156,4876,4123,3784,-567,1287,151,4095,1458,1299,2684, // precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018,-3260,5116,-1722,3085,5120,-3615,3760,-5175,73,400,4254,2982,2788,3198,-2657, // precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -3364,569,2234,1930,-4744,-2279,-4828,5215,2503,-4403,326,1639,-4129,5068,-512,-5015, // precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -376,-4859,1068,-40,-2576,4003,-4580,-4621,1341,2487,3169,-2374,-193,2625,-2998,4784, // precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 4855,825,-635,2118,3062,-2813,-4808,-4250,-4875,-2113,-2740,-4408,4,-1893,675,458, // precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018,5120,400,-2657,-4744,-4403,-512,-40,1341,2625,-635,-4250,4,-3360,5114,-5313, // precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 425,-2151,336,-2662,-544,5334,3441,2117,-4035,2205,-2684,-3570,-1287,-4973,5156,2419, // precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 357,1204,-578,1635,2984,-1111,-2271,4359,5213,-2449,3337,3453,2790,554,-2981,-1409, // precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 730,-279,-2206,3524,-3085,-73,-3198,-1930,-2503,-5068,-1068,4621,193,-825,4808,4408, // precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018,4428,5064,-4000,2565,573,-1268,3125,-3012,-4144,5023,1927,1009,-2139,-1572,3535, // precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 2236,663,-116,4967,-3091,-854,4123,1160,4095,-1349,-2213,1782,-847,2062,2737,624, // precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 3686,-2283,886,4889,4513,-4601,1324,1893,4875,-2118,2998,-2487,2576,5015,-326,2279, // precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 3364,-4254,3615,3260,-1284,-1381,-472,-3891,-341,2087,3057,4720,-4102,3410,636,1689, // precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018,-3524,1615,5268,1284,4428,2206,-834,5063,1381,5064,279,4379,2439,472,-4000, // precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -730,-2015,-5341,3891,2565,1409,-4286,2605,341,573,2981,5356,1085,-2087,-1268,-554, // precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -2413,3135,-3057,3125,-2790,-778,-2884,-4720,-3012,-3453,-1356,-355,4102,-4144,-3337,-152, // precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -4544,-3410,5023,2449,205,-97,-636,1927,-5213,2624,909,-1689,1009,-4359,-2973,-3419, // q_x16 10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,10753, // qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 -6,-6,-6,-6,-408,-408,-408,-408,-6,-6,-6,-6,-408,-408,-408,-408, // qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 -27359,-27359,-27359,-27359,1956,1956,1956,1956,-27359,-27359,-27359,-27359,1956,1956,1956,1956, // qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 -6,-6,-6,-6,-1956,-1956,-1956,-1956,-6,-6,-6,-6,-1956,-1956,-1956,-1956, // qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 27359,27359,27359,27359,408,408,408,408,27359,27359,27359,27359,408,408,408,408, // qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 -6,-6,-6,-6,-20856,-20856,-20856,-20856,-6,-6,-6,-6,-20856,-20856,-20856,-20856, // qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 -408,-408,-408,-408,-21094,-21094,-21094,-21094,-408,-408,-408,-408,-21094,-21094,-21094,-21094, // qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 -6,-6,-6,-6,-10093,-10093,-10093,-10093,-6,-6,-6,-6,-10093,-10093,-10093,-10093, // qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 
-1956,-1956,-1956,-1956,-28517,-28517,-28517,-28517,-1956,-1956,-1956,-1956,-28517,-28517,-28517,-28517, // qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 -6,-9508,-20856,-29449,-408,18345,-21094,-7033,-6,-9508,-20856,-29449,-408,18345,-21094,-7033, // qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 -27359,-16072,28517,-12476,1956,-28224,10093,16090,-27359,-16072,28517,-12476,1956,-28224,10093,16090, // qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 -6,-16090,-10093,28224,-1956,12476,-28517,16072,-6,-16090,-10093,28224,-1956,12476,-28517,16072, // qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 27359,7033,21094,-18345,408,29449,20856,9508,27359,7033,21094,-18345,408,29449,20856,9508, // qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 -6,-3639,-9508,25543,-20856,829,-29449,-17675,-6,-3639,-9508,25543,-20856,829,-29449,-17675, // qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 -408,18363,18345,7429,-21094,-10001,-7033,-4547,-408,18363,18345,7429,-21094,-10001,-7033,-4547, // qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 -6,28103,-16090,3925,-10093,7228,28224,11683,-6,28103,-16090,3925,-10093,7228,28224,11683, // qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 -1956,-23056,12476,14731,-28517,26518,16072,14847,-1956,-23056,12476,14731,-28517,26518,16072,14847, // qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6,-5619,-3639,-12378,-9508,15736,25543,23007,-20856,-27152,829,-22209,-29449,-20490,-17675,22532, // qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -408,16724,18363,22623,18345,5766,7429,-31369,-21094,15840,-10001,19326,-7033,3407,-4547,2316, // qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -27359,6381,-14847,8441,-16072,-6924,-26518,-4589,28517,12707,-14731,-15864,-12476,31656,23056,24098, // qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 1956,-31217,-11683,-24269,-28224,-5126,-7228,20198,10093,-573,-3925,-14341,16090,23781,-28103,-23812, // qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6,23812,28103,-23781,-16090,14341,3925,573,-10093,-20198,7228,5126,28224,24269,11683,31217, // qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -1956,-24098,-23056,-31656,12476,15864,14731,-12707,-28517,4589,26518,6924,16072,-8441,14847,-6381, // qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 27359,-2316,4547,-3407,7033,-19326,10001,-15840,21094,31369,-7429,-5766,-18345,-22623,-18363,-16724, // qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 408,-22532,17675,20490,29449,22209,-829,27152,20856,-23007,-25543,-15736,9508,12378,3639,5619, // qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6,-17412,-5619,2017,-3639,24976,-12378,24702,-9508,-31558,15736,1316,25543,-31418,23007,-512, // qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -20856,-13268,-27152,22044,829,8801,-22209,-12214,-29449,11141,-20490,-17096,-17675,32076,22532,17571, // qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -408,13012,16724,4090,18363,-30546,22623,16614,18345,-17248,5766,22666,7429,-7856,-31369,31235, // qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -21094,28541,15840,-30351,-10001,-177,19326,-31887,-7033,25555,3407,-31290,-4547,-13579,2316,-2395, // qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6,4175,23812,7326,28103,17352,-23781,-28200,-16090,11555,14341,6978,3925,-1627,573,780, // qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 
-10093,32271,-20198,7356,7228,29364,5126,27895,28224,-609,24269,21892,11683,-7795,31217,-18845, // qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -1956,29407,-24098,-7716,-23056,-719,-31656,-8246,12476,-26238,15864,11842,14731,1932,-12707,-11726, // qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -28517,4394,4589,2066,26518,-11300,6924,-24037,16072,969,-8441,14999,14847,-11854,-6381,-19844, // qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6,-13500,-17412,32070,-5619,5120,2017,11952,-3639,1609,24976,9374,-12378,-23836,24702,-8289, // qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -9508,-22471,-31558,25482,15736,-8935,1316,32351,25543,19661,-31418,8295,23007,-25652,-512,-19863, // qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -20856,6917,-13268,-28712,-27152,20899,22044,4083,829,951,8801,29370,-22209,24641,-12214,12976, // qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -29449,-22215,11141,-29626,-20490,30467,-17096,13158,-17675,-24129,32076,7880,22532,-30053,17571,-8758, // qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6,5120,24976,-8289,15736,19661,-512,-28712,829,24641,11141,13158,22532,13024,4090,-27329, // qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 18345,-8807,-7856,-20070,15840,-1834,-31887,-18875,-4547,18077,19844,-23026,8441,-12653,11300,11123, // qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 28517,31924,-11842,-14237,31656,16809,-29407,-5369,-11683,-16273,-27895,-29827,20198,7722,1627,9343, // qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 16090,-15127,-7326,-6716,5619,-1609,-24702,-25482,-25543,25652,13268,-4083,22209,22215,17096,-7880, // qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6,-26292,17352,12384,14341,61,780,23093,7228,-12336,-609,-7801,31217,-6747,-7716,6095, // qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 12476,15511,1932,11623,4589,6314,-24037,-19320,14847,19643,2395,-21770,-3407,-17394,177,-23952, // qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 21094,-31467,-22666,-1767,-22623,-14329,-13012,30053,17675,29626,12214,-951,27152,19863,31418,8935, // qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 9508,-9374,-2017,13500,-23812,-29541,28200,20173,-3925,-24025,-32271,-19856,-5126,-26286,-21892,-4967, // qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6,6716,4175,-13164,23812,-26292,7326,-12098,28103,29541,17352,15127,-23781,-7289,-28200,12384, // qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -16090,-29151,11555,-20173,14341,-9343,6978,-22483,3925,61,-1627,23788,573,24025,780,-7722, // qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -10093,-18881,32271,23093,-20198,-24330,7356,19856,7228,29827,29364,15517,5126,-12336,27895,-4248, // qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 28224,26286,-609,16273,24269,-5729,21892,-7801,11683,-30144,-7795,4967,31217,5369,-18845,-8027, // qinvscaledzeta_x16_4_1 -27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359, // qinvscaledzeta_x16_4_3 27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,27359, // qinvscaledzeta_x16_8_1 -408,-408,-408,-408,-408,-408,-408,-408,-408,-408,-408,-408,-408,-408,-408,-408, // qinvscaledzeta_x16_8_7 
-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956, // qround32_x16 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // scaledzeta_x16_4_1 -223,-223,-223,-223,-223,-223,-223,-223,-223,-223,-223,-223,-223,-223,-223,-223, // scaledzeta_x16_4_3 223,223,223,223,223,223,223,223,223,223,223,223,223,223,223,223, // scaledzeta_x16_8_1 3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,3688, // scaledzeta_x16_8_7 4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,4188, } ; static inline int16x16 add_x16(int16x16 a,int16x16 b) { return _mm256_add_epi16(a,b); } static inline int16x16 sub_x16(int16x16 a,int16x16 b) { return _mm256_sub_epi16(a,b); } static inline int16x16 mulmod_scaled_x16(int16x16 x,int16x16 y,int16x16 yqinv,const int16 *qdata) { int16x16 b = _mm256_mulhi_epi16(x,y); int16x16 d = _mm256_mullo_epi16(x,yqinv); int16x16 e = _mm256_mulhi_epi16(d,q_x16); return sub_x16(b,e); } static inline int16x16 reduce_x16(int16x16 x,const int16 *qdata) { int16x16 y = _mm256_mulhrs_epi16(x,qround32_x16); y = _mm256_mullo_epi16(y,q_x16); return sub_x16(x,y); } // ----- codegen pass 1 // // startntt 512 // startbatch 512 // // ----- PRECONDITIONS // physical_map (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // transform size 512 // // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8] // // transforms per batch 1 // // batch indexing [] // // total batch size 512 // // // modulus x^512-1 pos 0:512 q 7681,10753 bound 512*(5629,5800) // assertranges ... // // // ----- LAYER 1 // // // butterfly(0,256,1,256,1,0) // butterfly 0 256 1 256 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // ----- POSTCONDITIONS AFTER LAYER 1 // // transform size 512 // // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8] // // transforms per batch 1 // // batch indexing [] // // total batch size 512 // // // modulus x^256-1 pos 0:256 q 7681,10753 bound 256*(11258,11600) // assertranges ... // // // modulus x^256+1 pos 256:512 q 7681,10753 bound 256*(11258,11600) // assertranges ... // // // ----- LAYER 2 // // // reduce_ifreverse(0,64,1) // reduce_ifreverse 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // reduce_ifreverse(256,320,1) // reduce_ifreverse 256 320 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // butterfly(0,128,1,128,1,0) // butterfly 0 128 1 128 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // butterfly(256,384,1,128,4,1) // butterfly 256 384 1 128 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // ----- POSTCONDITIONS AFTER LAYER 2 // // transform size 512 // // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8] // // transforms per batch 1 // // batch indexing [] // // total batch size 512 // // // modulus x^128-1 pos 0:128 q 7681,10753 bound 128*(22516,23200) // assertranges ... // // // modulus x^128+1 pos 128:256 q 7681,10753 bound 128*(22516,23200) // assertranges ... // // // modulus x^128-zeta4 pos 256:384 q 7681,10753 bound 128*(15747,17016) // assertranges ... // // // modulus x^128+zeta4 pos 384:512 q 7681,10753 bound 128*(15747,17016) // assertranges ... 
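// Scalar models (added for exposition; not generated, and not called by the
// NTT code): mulmod_scaled_x16 above is a Montgomery-style multiplication.
// Each table constant y holds zeta*2^16 mod q centered in (-q/2,q/2), and
// yqinv holds y*q^-1 mod 2^16; per lane the vector code returns exactly
// (x*y - ((x*yqinv) mod 2^16)*q) / 2^16, a representative of x*zeta mod q.
static inline int16 mulmod_scaled_scalar_model(int16 x,int16 y,int16 yqinv,int16 q)
{
  int16 d = (int16) ((int32) x*yqinv); // low 16 bits of x*yqinv, reinterpreted as signed
  int32 t = (int32) x*y-(int32) d*q;   // a multiple of 2^16, since d*q = x*y mod 2^16
  return (int16) (t >> 16);            // exact quotient; congruent to x*y*2^-16 mod q
}
// reduce_x16 above subtracts a rounded multiple of q: _mm256_mulhrs_epi16(x,c)
// computes round(x*c/2^15) per lane, with c = qround32_x16 = 4 for q = 7681
// and 3 for q = 10753.
static inline int16 reduce_scalar_model(int16 x,int16 q,int16 qround32)
{
  int16 t = (int16) (((int32) x*qround32+16384) >> 15); // models _mm256_mulhrs_epi16
  return (int16) (x-t*q);              // congruent to x mod q, magnitude roughly below q
}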
// // // ----- LAYER 3 // // // reduce_ifforward(64,128,1) // reduce_ifforward 64 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // butterfly(0,64,1,64,1,0) // butterfly 0 64 1 64 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // butterfly(128,192,1,64,4,1) // butterfly 128 192 1 64 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // butterfly(256,320,1,64,8,1) // butterfly 256 320 1 64 8 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // butterfly(384,448,1,64,8,-1) // butterfly 384 448 1 64 8 7 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // reduce(0,64,1) // reduce 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(64,128,1,128,1) // twist 64 128 1 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(128,192,1,256,1) // twist 128 192 1 256 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(192,256,1,256,-1) // twist 192 256 1 256 255 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(256,320,1,512,1) // twist 256 320 1 512 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(320,384,1,512,5) // twist 320 384 1 512 5 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(384,448,1,512,-1) // twist 384 448 1 512 511 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(448,512,1,512,-5) // twist 448 512 1 512 507 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // physical_permute(3,6) // physical_permute (3, 6) (0, 1, 2, 3, 4, 5, 6, 7, 8) () (0, 1, 2, 6, 4, 5, 3, 7, 8) () // // // fold(256) // physical_unmap (0, 1, 2, 6, 4, 5, 3, 7, 8) () // physical_map (0, 1, 2, 6, 4, 5, 3, 7) (8,) // // // fold(128) // physical_unmap (0, 1, 2, 6, 4, 5, 3, 7) (8,) // physical_map (0, 1, 2, 6, 4, 5, 3) (7, 8) // // // fold(64) // physical_unmap (0, 1, 2, 6, 4, 5, 3) (7, 8) // physical_map (0, 1, 2, 6, 4, 5) (3, 7, 8) // // // nextbatch() // stopbatch 512 // startbatch 512 // // // halfbatch() // physical_unmap (0, 1, 2, 6, 4, 5) (3, 7, 8) // stopbatch 512 // doublereps // startbatch 256 // physical_map (0, 1, 2, 6, 4, 5) (3, 7) // // // halfbatch() // physical_unmap (0, 1, 2, 6, 4, 5) (3, 7) // stopbatch 256 // doublereps // startbatch 128 // physical_map (0, 1, 2, 6, 4, 5) (3,) // // // ----- POSTCONDITIONS AFTER LAYER 3 // // transform size 64 // // transform indexing [0, 1, 2, 6, 4, 5] // // transforms per batch 2 // // batch indexing [3] // // total batch size 128 // // // modulus x^64-1 pos 0:64 q 7681,10753 bound 1*(5629,5827) 1*(5629,7613) 1*(5629,7666) 1*(5629,7264) 1*(5629,7639) 1*(5629,7591) 1*(5629,7291) 1*(5629,7204) ... // assertranges ... // // // ----- LAYER 4 // // // butterfly(0,32,1,32,1,0) // butterfly 0 32 1 32 1 0 (0, 1, 2, 6, 4, 5) (3,) // // // ----- POSTCONDITIONS AFTER LAYER 4 // // transform size 64 // // transform indexing [0, 1, 2, 6, 4, 5] // // transforms per batch 2 // // batch indexing [3] // // total batch size 128 // // // modulus x^32-1 pos 0:32 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14690) 1*(11258,15282) 1*(11258,14641) 1*(11258,14272) ... // assertranges ... // // // modulus x^32+1 pos 32:64 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14690) 1*(11258,15282) 1*(11258,14641) 1*(11258,14272) ... // assertranges ... 
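// Notation (added for exposition, as read from the generated code): in the
// codegen comments, "butterfly a b 1 len n k" is a Cooley-Tukey butterfly on
// blocks of len coefficients, where zeta_n is a primitive n-th root of 1 mod
// q: f[b+i] is first multiplied by zeta_n^k (skipped for n=1,k=0, since
// zeta^0 = 1), then (f[a+i],f[b+i]) = (f[a+i]+f[b+i], f[a+i]-f[b+i]).
// "twist start end 1 n k" multiplies f[start+i] by zeta_n^(k*i), converting
// arithmetic modulo x^m - zeta_n^(k*m) into arithmetic modulo y^m - 1.
// "fold" halves the transform size, and "physical_permute" permutes the
// physical index map. A scalar sketch of one butterfly, reusing the model
// helper above (zeta16 = zeta*2^16 mod q, zeta16qinv = zeta16*q^-1 mod 2^16):
static inline void butterfly_scalar_model(int16 *fa,int16 *fb,int16 zeta16,int16 zeta16qinv,int16 q)
{
  int16 t = mulmod_scaled_scalar_model(*fb,zeta16,zeta16qinv,q);
  *fb = (int16) (*fa-t); // uses the old *fa
  *fa = (int16) (*fa+t);
}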
// // // ----- LAYER 5 // // // butterfly(0,16,1,16,1,0) // butterfly 0 16 1 16 1 0 (0, 1, 2, 6, 4, 5) (3,) // // // butterfly(32,48,1,16,4,1) // butterfly 32 48 1 16 4 1 (0, 1, 2, 6, 4, 5) (3,) // // // reduce(0,16,1) // reduce 0 16 1 (0, 1, 2, 6, 4, 5) (3,) // // // twist(16,32,1,32,1) // twist 16 32 1 32 1 (0, 1, 2, 6, 4, 5) (3,) // // // twist(32,48,1,64,1) // twist 32 48 1 64 1 (0, 1, 2, 6, 4, 5) (3,) // // // twist(48,64,1,64,-1) // twist 48 64 1 64 63 (0, 1, 2, 6, 4, 5) (3,) // // // physical_permute(0,1,2,5) // physical_permute (0, 1, 2, 5) (0, 1, 2, 6, 4, 5) (3,) (1, 2, 5, 6, 4, 0) (3,) // // // fold(32) // physical_unmap (1, 2, 5, 6, 4, 0) (3,) // physical_map (1, 2, 5, 6, 4) (0, 3) // // // fold(16) // physical_unmap (1, 2, 5, 6, 4) (0, 3) // physical_map (1, 2, 5, 6) (0, 3, 4) // // // ----- POSTCONDITIONS AFTER LAYER 5 // // transform size 16 // // transform indexing [1, 2, 5, 6] // // transforms per batch 8 // // batch indexing [0, 3, 4] // // total batch size 128 // // // modulus x^16-1 pos 0:16 q 7681,10753 bound 1*(5629,5800) 1*(5629,6967) 1*(5629,6418) 1*(5629,7585) 1*(5629,7017) 1*(5629,6328) 1*(5629,7033) 1*(5629,6943) ... // assertranges ... // // // ----- LAYER 6 // // // butterfly(0,8,1,8,1,0) // butterfly 0 8 1 8 1 0 (1, 2, 5, 6) (0, 3, 4) // // // physical_permute(1,2,4) // physical_permute (1, 2, 4) (1, 2, 5, 6) (0, 3, 4) (2, 4, 5, 6) (0, 3, 1) // // // nextbatch() // stopbatch 128 // startbatch 128 // // // ----- POSTCONDITIONS AFTER LAYER 6 // // transform size 16 // // transform indexing [2, 4, 5, 6] // // transforms per batch 8 // // batch indexing [0, 3, 1] // // total batch size 128 // // // modulus x^8-1 pos 0:8 q 7681,10753 bound 1*(11258,12447) 1*(11258,14071) 1*(11258,12488) 1*(11258,14310) 1*(11258,14287) 1*(11258,13674) 1*(11258,13574) 1*(11258,13555) // assertranges ... // // // modulus x^8+1 pos 8:16 q 7681,10753 bound 1*(11258,12447) 1*(11258,14071) 1*(11258,12488) 1*(11258,14310) 1*(11258,14287) 1*(11258,13674) 1*(11258,13574) 1*(11258,13555) // assertranges ... // // // ----- LAYER 7 // // // butterfly(0,4,1,4,1,0) // butterfly 0 4 1 4 1 0 (2, 4, 5, 6) (0, 3, 1) // // // butterfly(8,12,1,4,4,1) // butterfly 8 12 1 4 4 1 (2, 4, 5, 6) (0, 3, 1) // // // reduce(0,4,1) // reduce 0 4 1 (2, 4, 5, 6) (0, 3, 1) // // // twist(4,8,1,8,1) // twist 4 8 1 8 1 (2, 4, 5, 6) (0, 3, 1) // // // twist(8,12,1,16,1) // twist 8 12 1 16 1 (2, 4, 5, 6) (0, 3, 1) // // // twist(12,16,1,16,-1) // twist 12 16 1 16 15 (2, 4, 5, 6) (0, 3, 1) // // // physical_permute(2,6) // physical_permute (2, 6) (2, 4, 5, 6) (0, 3, 1) (6, 4, 5, 2) (0, 3, 1) // // // fold(8) // physical_unmap (6, 4, 5, 2) (0, 3, 1) // physical_map (6, 4, 5) (0, 1, 2, 3) // // // fold(4) // physical_unmap (6, 4, 5) (0, 1, 2, 3) // physical_map (6, 4) (0, 1, 2, 3, 5) // // // ----- POSTCONDITIONS AFTER LAYER 7 // // transform size 4 // // transform indexing [6, 4] // // transforms per batch 32 // // batch indexing [0, 1, 2, 3, 5] // // total batch size 128 // // // modulus x^4-1 pos 0:4 q 7681,10753 bound 1*(5629,5800) 1*(5629,6938) 1*(5629,6521) 1*(5629,7157) // assertranges ... // // // ----- LAYER 8 // // // butterfly(0,2,1,2,1,0) // butterfly 0 2 1 2 1 0 (6, 4) (0, 1, 2, 3, 5) // // // ----- POSTCONDITIONS AFTER LAYER 8 // // transform size 4 // // transform indexing [6, 4] // // transforms per batch 32 // // batch indexing [0, 1, 2, 3, 5] // // total batch size 128 // // // modulus x^2-1 pos 0:2 q 7681,10753 bound 1*(11258,12321) 1*(11258,14095) // assertranges ... 
// // // modulus x^2+1 pos 2:4 q 7681,10753 bound 1*(11258,12321) 1*(11258,14095) // assertranges ... // // // ----- LAYER 9 // // // butterfly(0,1,1,1,1,0) // butterfly 0 1 1 1 1 0 (6, 4) (0, 1, 2, 3, 5) // // // butterfly(2,3,1,1,4,1) // butterfly 2 3 1 1 4 1 (6, 4) (0, 1, 2, 3, 5) // // // ----- POSTCONDITIONS AFTER LAYER 9 // // transform size 4 // // transform indexing [6, 4] // // transforms per batch 32 // // batch indexing [0, 1, 2, 3, 5] // // total batch size 128 // // // modulus x^1-1 pos 0:1 q 7681,10753 bound 1*(22516,26416) // assertranges ... // // // modulus x^1+1 pos 1:2 q 7681,10753 bound 1*(22516,26416) // assertranges ... // // // modulus x^1-zeta4 pos 2:3 q 7681,10753 bound 1*(15747,17745) // assertranges ... // // // modulus x^1+zeta4 pos 3:4 q 7681,10753 bound 1*(15747,17745) // assertranges ... // stopbatch 128 // physical_unmap (6, 4) (0, 1, 2, 3, 5) // stopntt 512 // ----- codegen pass 2 // // startntt 512 // startbatch 512 // vector_butterfly 0 256 1 0 // vector_butterfly 128 384 1 0 // vector_butterfly 64 320 1 0 // vector_butterfly 192 448 1 0 // vector_reduce_ifreverse 0 // vector_reduce_ifreverse 256 // vector_butterfly 0 128 1 0 // vector_butterfly 64 192 1 0 // vector_butterfly 256 384 4 1 // vector_butterfly 320 448 4 1 // vector_reduce_ifforward 64 // vector_butterfly 0 64 1 0 // vector_butterfly 128 192 4 1 // vector_butterfly 256 320 8 1 // vector_butterfly 384 448 8 7 // vector_reduce 0 // vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // stopbatch 512 // startbatch 512 // vector_butterfly 16 272 1 0 // vector_butterfly 144 400 1 0 // vector_butterfly 80 336 1 0 // vector_butterfly 208 464 1 0 // vector_reduce_ifreverse 16 // vector_reduce_ifreverse 272 // vector_butterfly 16 144 1 0 // vector_butterfly 80 208 1 0 // vector_butterfly 272 400 4 1 // vector_butterfly 336 464 4 1 // vector_reduce_ifforward 80 // vector_butterfly 16 80 1 0 // vector_butterfly 144 208 4 1 // vector_butterfly 272 336 8 1 // vector_butterfly 400 464 8 7 // vector_reduce 16 // vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 400 
464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // stopbatch 512 // startbatch 512 // vector_butterfly 32 288 1 0 // vector_butterfly 160 416 1 0 // vector_butterfly 96 352 1 0 // vector_butterfly 224 480 1 0 // vector_reduce_ifreverse 32 // vector_reduce_ifreverse 288 // vector_butterfly 32 160 1 0 // vector_butterfly 96 224 1 0 // vector_butterfly 288 416 4 1 // vector_butterfly 352 480 4 1 // vector_reduce_ifforward 96 // vector_butterfly 32 96 1 0 // vector_butterfly 160 224 4 1 // vector_butterfly 288 352 8 1 // vector_butterfly 416 480 8 7 // vector_reduce 32 // vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // stopbatch 512 // startbatch 512 // vector_butterfly 48 304 1 0 // vector_butterfly 176 432 1 0 // vector_butterfly 112 368 1 0 // vector_butterfly 240 496 1 0 // vector_reduce_ifreverse 48 // vector_reduce_ifreverse 304 // vector_butterfly 48 176 1 0 // vector_butterfly 112 240 1 0 // vector_butterfly 304 432 4 1 // vector_butterfly 368 496 4 1 // vector_reduce_ifforward 112 // vector_butterfly 48 112 1 0 // vector_butterfly 176 240 4 1 // vector_butterfly 304 368 8 1 // vector_butterfly 432 496 8 7 // vector_reduce 48 // vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // stopbatch 512 // doublereps // doublereps // startbatch 128 // vector_butterfly 0 32 1 0 // vector_butterfly 64 96 1 0 // vector_butterfly 16 48 1 0 // vector_butterfly 80 112 1 0 // vector_butterfly 0 16 1 0 // vector_butterfly 64 80 1 0 // vector_butterfly 32 48 4 1 // vector_butterfly 96 112 4 1 // vector_reduce 0 // vector_reduce 64 // vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 // vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 // vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 // vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 // vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 // vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 // 
vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 // vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 // vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 // vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 // vector_butterfly 0 64 1 0 // vector_butterfly 32 96 1 0 // vector_butterfly 16 80 1 0 // vector_butterfly 48 112 1 0 // vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 // vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 // vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 // vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 // stopbatch 128 // startbatch 128 // vector_butterfly 0 32 1 0 // vector_butterfly 16 48 1 0 // vector_butterfly 64 96 4 1 // vector_butterfly 80 112 4 1 // vector_reduce 0 // vector_reduce 16 // vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 // vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 // vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 // vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 // vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 // vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 // vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 // vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 // vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 // vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 // vector_butterfly 0 16 1 0 // vector_butterfly 64 80 1 0 // vector_butterfly 32 48 1 0 // vector_butterfly 96 112 1 0 // vector_butterfly 0 64 1 0 // vector_butterfly 32 96 1 0 // vector_butterfly 16 80 4 1 // vector_butterfly 48 112 4 1 // stopbatch 128 // stopntt 512 // startntt 512 static void ntt512(int16 *f,int reps,const int16 *qdata) { // startbatch 512 for (long long r = 0;r < reps;++r) { // vector_butterfly 0 256 1 0 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0)); int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f+256)); int16x16 b0 = add_x16(a0,a16); int16x16 b16 = sub_x16(a0,a16); // vector_butterfly 128 384 1 0 int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f+128)); int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f+384)); int16x16 b8 = add_x16(a8,a24); int16x16 b24 = sub_x16(a8,a24); // vector_butterfly 64 320 1 0 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64)); int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f+320)); int16x16 b4 = add_x16(a4,a20); int16x16 b20 = sub_x16(a4,a20); // vector_butterfly 192 448 1 0 int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f+192)); int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f+448)); int16x16 b12 = add_x16(a12,a28); int16x16 b28 = sub_x16(a12,a28); // vector_reduce_ifreverse 0 // vector_reduce_ifreverse 256 // vector_butterfly 0 128 1 0 int16x16 c0 = add_x16(b0,b8); int16x16 c8 = sub_x16(b0,b8); // vector_butterfly 64 192 1 0 int16x16 c4 = add_x16(b4,b12); int16x16 c12 = sub_x16(b4,b12); // vector_butterfly 256 384 4 1 b24 = mulmod_scaled_x16(b24,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 c16 = add_x16(b16,b24); int16x16 c24 = sub_x16(b16,b24); // vector_butterfly 320 448 4 1 b28 = mulmod_scaled_x16(b28,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 c20 = add_x16(b20,b28); int16x16 c28 = sub_x16(b20,b28); // vector_reduce_ifforward 64 c4 = reduce_x16(c4,qdata); // vector_butterfly 0 64 1 0 int16x16 d0 = add_x16(c0,c4); int16x16 d4 = sub_x16(c0,c4); // vector_butterfly 128 192 4 1 c12 = mulmod_scaled_x16(c12,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); 
int16x16 d8 = add_x16(c8,c12); int16x16 d12 = sub_x16(c8,c12); // vector_butterfly 256 320 8 1 c20 = mulmod_scaled_x16(c20,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata); int16x16 d16 = add_x16(c16,c20); int16x16 d20 = sub_x16(c16,c20); // vector_butterfly 384 448 8 7 c28 = mulmod_scaled_x16(c28,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata); int16x16 d24 = add_x16(c24,c28); int16x16 d28 = sub_x16(c24,c28); // vector_reduce 0 d0 = reduce_x16(d0,qdata); // vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d4 = mulmod_scaled_x16(d4,precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d8 = mulmod_scaled_x16(d8,precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d12 = mulmod_scaled_x16(d12,precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d16 = mulmod_scaled_x16(d16,precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d20 = mulmod_scaled_x16(d20,precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d24 = mulmod_scaled_x16(d24,precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d28 = mulmod_scaled_x16(d28,precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e0 = _mm256_permute2x128_si256_lo(d0,d4); int16x16 e4 = _mm256_permute2x128_si256_hi(d0,d4); // vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e8 = _mm256_permute2x128_si256_lo(d8,d12); int16x16 e12 = _mm256_permute2x128_si256_hi(d8,d12); // vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e16 = _mm256_permute2x128_si256_lo(d16,d20); int16x16 e20 = _mm256_permute2x128_si256_hi(d16,d20); // vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e24 = _mm256_permute2x128_si256_lo(d24,d28); int16x16 e28 = _mm256_permute2x128_si256_hi(d24,d28); // stopbatch 512 _mm256_storeu_si256((int16x16 *) (f+0),e0); _mm256_storeu_si256((int16x16 *) (f+64),e4); _mm256_storeu_si256((int16x16 *) (f+128),e8); _mm256_storeu_si256((int16x16 *) (f+192),e12); _mm256_storeu_si256((int16x16 *) (f+256),e16); _mm256_storeu_si256((int16x16 *) (f+320),e20); _mm256_storeu_si256((int16x16 *) (f+384),e24); _mm256_storeu_si256((int16x16 *) (f+448),e28); f += 512; } f -= 512*reps; // startbatch 512 for (long long r = 0;r < reps;++r) { // vector_butterfly 16 272 1 0 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16)); int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f+272)); int16x16 b1 = add_x16(a1,a17); int16x16 b17 = sub_x16(a1,a17); // vector_butterfly 144 400 1 0 int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f+144)); int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f+400)); int16x16 b9 = add_x16(a9,a25); int16x16 b25 = 
sub_x16(a9,a25); // vector_butterfly 80 336 1 0 int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80)); int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f+336)); int16x16 b5 = add_x16(a5,a21); int16x16 b21 = sub_x16(a5,a21); // vector_butterfly 208 464 1 0 int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f+208)); int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f+464)); int16x16 b13 = add_x16(a13,a29); int16x16 b29 = sub_x16(a13,a29); // vector_reduce_ifreverse 16 // vector_reduce_ifreverse 272 // vector_butterfly 16 144 1 0 int16x16 c1 = add_x16(b1,b9); int16x16 c9 = sub_x16(b1,b9); // vector_butterfly 80 208 1 0 int16x16 c5 = add_x16(b5,b13); int16x16 c13 = sub_x16(b5,b13); // vector_butterfly 272 400 4 1 b25 = mulmod_scaled_x16(b25,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 c17 = add_x16(b17,b25); int16x16 c25 = sub_x16(b17,b25); // vector_butterfly 336 464 4 1 b29 = mulmod_scaled_x16(b29,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 c21 = add_x16(b21,b29); int16x16 c29 = sub_x16(b21,b29); // vector_reduce_ifforward 80 c5 = reduce_x16(c5,qdata); // vector_butterfly 16 80 1 0 int16x16 d1 = add_x16(c1,c5); int16x16 d5 = sub_x16(c1,c5); // vector_butterfly 144 208 4 1 c13 = mulmod_scaled_x16(c13,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 d9 = add_x16(c9,c13); int16x16 d13 = sub_x16(c9,c13); // vector_butterfly 272 336 8 1 c21 = mulmod_scaled_x16(c21,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata); int16x16 d17 = add_x16(c17,c21); int16x16 d21 = sub_x16(c17,c21); // vector_butterfly 400 464 8 7 c29 = mulmod_scaled_x16(c29,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata); int16x16 d25 = add_x16(c25,c29); int16x16 d29 = sub_x16(c25,c29); // vector_reduce 16 d1 = reduce_x16(d1,qdata); // vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d5 = mulmod_scaled_x16(d5,precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d9 = mulmod_scaled_x16(d9,precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d13 = mulmod_scaled_x16(d13,precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d17 = mulmod_scaled_x16(d17,precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d21 = mulmod_scaled_x16(d21,precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d25 = mulmod_scaled_x16(d25,precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d29 = mulmod_scaled_x16(d29,precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e1 = 
_mm256_permute2x128_si256_lo(d1,d5); int16x16 e5 = _mm256_permute2x128_si256_hi(d1,d5); // vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e9 = _mm256_permute2x128_si256_lo(d9,d13); int16x16 e13 = _mm256_permute2x128_si256_hi(d9,d13); // vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e17 = _mm256_permute2x128_si256_lo(d17,d21); int16x16 e21 = _mm256_permute2x128_si256_hi(d17,d21); // vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e25 = _mm256_permute2x128_si256_lo(d25,d29); int16x16 e29 = _mm256_permute2x128_si256_hi(d25,d29); // stopbatch 512 _mm256_storeu_si256((int16x16 *) (f+16),e1); _mm256_storeu_si256((int16x16 *) (f+80),e5); _mm256_storeu_si256((int16x16 *) (f+144),e9); _mm256_storeu_si256((int16x16 *) (f+208),e13); _mm256_storeu_si256((int16x16 *) (f+272),e17); _mm256_storeu_si256((int16x16 *) (f+336),e21); _mm256_storeu_si256((int16x16 *) (f+400),e25); _mm256_storeu_si256((int16x16 *) (f+464),e29); f += 512; } f -= 512*reps; // startbatch 512 for (long long r = 0;r < reps;++r) { // vector_butterfly 32 288 1 0 int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32)); int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f+288)); int16x16 b2 = add_x16(a2,a18); int16x16 b18 = sub_x16(a2,a18); // vector_butterfly 160 416 1 0 int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f+160)); int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f+416)); int16x16 b10 = add_x16(a10,a26); int16x16 b26 = sub_x16(a10,a26); // vector_butterfly 96 352 1 0 int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96)); int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f+352)); int16x16 b6 = add_x16(a6,a22); int16x16 b22 = sub_x16(a6,a22); // vector_butterfly 224 480 1 0 int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f+224)); int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f+480)); int16x16 b14 = add_x16(a14,a30); int16x16 b30 = sub_x16(a14,a30); // vector_reduce_ifreverse 32 // vector_reduce_ifreverse 288 // vector_butterfly 32 160 1 0 int16x16 c2 = add_x16(b2,b10); int16x16 c10 = sub_x16(b2,b10); // vector_butterfly 96 224 1 0 int16x16 c6 = add_x16(b6,b14); int16x16 c14 = sub_x16(b6,b14); // vector_butterfly 288 416 4 1 b26 = mulmod_scaled_x16(b26,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 c18 = add_x16(b18,b26); int16x16 c26 = sub_x16(b18,b26); // vector_butterfly 352 480 4 1 b30 = mulmod_scaled_x16(b30,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 c22 = add_x16(b22,b30); int16x16 c30 = sub_x16(b22,b30); // vector_reduce_ifforward 96 c6 = reduce_x16(c6,qdata); // vector_butterfly 32 96 1 0 int16x16 d2 = add_x16(c2,c6); int16x16 d6 = sub_x16(c2,c6); // vector_butterfly 160 224 4 1 c14 = mulmod_scaled_x16(c14,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 d10 = add_x16(c10,c14); int16x16 d14 = sub_x16(c10,c14); // vector_butterfly 288 352 8 1 c22 = mulmod_scaled_x16(c22,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata); int16x16 d18 = add_x16(c18,c22); int16x16 d22 = sub_x16(c18,c22); // vector_butterfly 416 480 8 7 c30 = mulmod_scaled_x16(c30,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata); int16x16 d26 = add_x16(c26,c30); int16x16 d30 = sub_x16(c26,c30); // vector_reduce 32 d2 = reduce_x16(d2,qdata); // vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d6 = mulmod_scaled_x16(d6,precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // vector_twist 160 256 1 32 
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d10 = mulmod_scaled_x16(d10,precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d14 = mulmod_scaled_x16(d14,precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d18 = mulmod_scaled_x16(d18,precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d22 = mulmod_scaled_x16(d22,precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d26 = mulmod_scaled_x16(d26,precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d30 = mulmod_scaled_x16(d30,precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e2 = _mm256_permute2x128_si256_lo(d2,d6); int16x16 e6 = _mm256_permute2x128_si256_hi(d2,d6); // vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e10 = _mm256_permute2x128_si256_lo(d10,d14); int16x16 e14 = _mm256_permute2x128_si256_hi(d10,d14); // vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e18 = _mm256_permute2x128_si256_lo(d18,d22); int16x16 e22 = _mm256_permute2x128_si256_hi(d18,d22); // vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e26 = _mm256_permute2x128_si256_lo(d26,d30); int16x16 e30 = _mm256_permute2x128_si256_hi(d26,d30); // stopbatch 512 _mm256_storeu_si256((int16x16 *) (f+32),e2); _mm256_storeu_si256((int16x16 *) (f+96),e6); _mm256_storeu_si256((int16x16 *) (f+160),e10); _mm256_storeu_si256((int16x16 *) (f+224),e14); _mm256_storeu_si256((int16x16 *) (f+288),e18); _mm256_storeu_si256((int16x16 *) (f+352),e22); _mm256_storeu_si256((int16x16 *) (f+416),e26); _mm256_storeu_si256((int16x16 *) (f+480),e30); f += 512; } f -= 512*reps; // startbatch 512 for (long long r = 0;r < reps;++r) { // vector_butterfly 48 304 1 0 int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48)); int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f+304)); int16x16 b3 = add_x16(a3,a19); int16x16 b19 = sub_x16(a3,a19); // vector_butterfly 176 432 1 0 int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f+176)); int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f+432)); int16x16 b11 = add_x16(a11,a27); int16x16 b27 = sub_x16(a11,a27); // vector_butterfly 112 368 1 0 int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112)); int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f+368)); int16x16 b7 = add_x16(a7,a23); int16x16 b23 = sub_x16(a7,a23); // vector_butterfly 240 496 1 0 int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f+240)); int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f+496)); int16x16 b15 = add_x16(a15,a31); int16x16 b31 = sub_x16(a15,a31); // vector_reduce_ifreverse 48 // vector_reduce_ifreverse 304 // 
vector_butterfly 48 176 1 0 int16x16 c3 = add_x16(b3,b11); int16x16 c11 = sub_x16(b3,b11); // vector_butterfly 112 240 1 0 int16x16 c7 = add_x16(b7,b15); int16x16 c15 = sub_x16(b7,b15); // vector_butterfly 304 432 4 1 b27 = mulmod_scaled_x16(b27,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 c19 = add_x16(b19,b27); int16x16 c27 = sub_x16(b19,b27); // vector_butterfly 368 496 4 1 b31 = mulmod_scaled_x16(b31,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 c23 = add_x16(b23,b31); int16x16 c31 = sub_x16(b23,b31); // vector_reduce_ifforward 112 c7 = reduce_x16(c7,qdata); // vector_butterfly 48 112 1 0 int16x16 d3 = add_x16(c3,c7); int16x16 d7 = sub_x16(c3,c7); // vector_butterfly 176 240 4 1 c15 = mulmod_scaled_x16(c15,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 d11 = add_x16(c11,c15); int16x16 d15 = sub_x16(c11,c15); // vector_butterfly 304 368 8 1 c23 = mulmod_scaled_x16(c23,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata); int16x16 d19 = add_x16(c19,c23); int16x16 d23 = sub_x16(c19,c23); // vector_butterfly 432 496 8 7 c31 = mulmod_scaled_x16(c31,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata); int16x16 d27 = add_x16(c27,c31); int16x16 d31 = sub_x16(c27,c31); // vector_reduce 48 d3 = reduce_x16(d3,qdata); // vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d7 = mulmod_scaled_x16(d7,precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d11 = mulmod_scaled_x16(d11,precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d15 = mulmod_scaled_x16(d15,precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d19 = mulmod_scaled_x16(d19,precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d23 = mulmod_scaled_x16(d23,precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d27 = mulmod_scaled_x16(d27,precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d31 = mulmod_scaled_x16(d31,precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e3 = _mm256_permute2x128_si256_lo(d3,d7); int16x16 e7 = _mm256_permute2x128_si256_hi(d3,d7); // vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e11 = _mm256_permute2x128_si256_lo(d11,d15); int16x16 e15 = _mm256_permute2x128_si256_hi(d11,d15); // vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e19 = _mm256_permute2x128_si256_lo(d19,d23); int16x16 e23 = _mm256_permute2x128_si256_hi(d19,d23); // vector_permute 432 496 
_mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e27 = _mm256_permute2x128_si256_lo(d27,d31); int16x16 e31 = _mm256_permute2x128_si256_hi(d27,d31); // stopbatch 512 _mm256_storeu_si256((int16x16 *) (f+48),e3); _mm256_storeu_si256((int16x16 *) (f+112),e7); _mm256_storeu_si256((int16x16 *) (f+176),e11); _mm256_storeu_si256((int16x16 *) (f+240),e15); _mm256_storeu_si256((int16x16 *) (f+304),e19); _mm256_storeu_si256((int16x16 *) (f+368),e23); _mm256_storeu_si256((int16x16 *) (f+432),e27); _mm256_storeu_si256((int16x16 *) (f+496),e31); f += 512; } f -= 512*reps; // doublereps reps *= 2; // doublereps reps *= 2; // startbatch 128 for (long long r = 0;r < reps;++r) { // vector_butterfly 0 32 1 0 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0)); int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32)); int16x16 b0 = add_x16(a0,a2); int16x16 b2 = sub_x16(a0,a2); // vector_butterfly 64 96 1 0 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64)); int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96)); int16x16 b4 = add_x16(a4,a6); int16x16 b6 = sub_x16(a4,a6); // vector_butterfly 16 48 1 0 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16)); int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48)); int16x16 b1 = add_x16(a1,a3); int16x16 b3 = sub_x16(a1,a3); // vector_butterfly 80 112 1 0 int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80)); int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112)); int16x16 b5 = add_x16(a5,a7); int16x16 b7 = sub_x16(a5,a7); // vector_butterfly 0 16 1 0 int16x16 c0 = add_x16(b0,b1); int16x16 c1 = sub_x16(b0,b1); // vector_butterfly 64 80 1 0 int16x16 c4 = add_x16(b4,b5); int16x16 c5 = sub_x16(b4,b5); // vector_butterfly 32 48 4 1 b3 = mulmod_scaled_x16(b3,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 c2 = add_x16(b2,b3); int16x16 c3 = sub_x16(b2,b3); // vector_butterfly 96 112 4 1 b7 = mulmod_scaled_x16(b7,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 c6 = add_x16(b6,b7); int16x16 c7 = sub_x16(b6,b7); // vector_reduce 0 c0 = reduce_x16(c0,qdata); // vector_reduce 64 c4 = reduce_x16(c4,qdata); // vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 c1 = mulmod_scaled_x16(c1,precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qdata); // vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 c5 = mulmod_scaled_x16(c5,precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qdata); // vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 c2 = mulmod_scaled_x16(c2,precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qdata); // vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 c6 = mulmod_scaled_x16(c6,precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qdata); // vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 c3 = mulmod_scaled_x16(c3,precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qdata); // vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 c7 = mulmod_scaled_x16(c7,precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qdata); // vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 d0 = _mm256_unpacklo_epi16(c0,c2); int16x16 d2 = _mm256_unpackhi_epi16(c0,c2); // vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 d1 = 
_mm256_unpacklo_epi16(c1,c3); int16x16 d3 = _mm256_unpackhi_epi16(c1,c3); // vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 d4 = _mm256_unpacklo_epi16(c4,c6); int16x16 d6 = _mm256_unpackhi_epi16(c4,c6); // vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 d5 = _mm256_unpacklo_epi16(c5,c7); int16x16 d7 = _mm256_unpackhi_epi16(c5,c7); // vector_butterfly 0 64 1 0 int16x16 e0 = add_x16(d0,d4); int16x16 e4 = sub_x16(d0,d4); // vector_butterfly 32 96 1 0 int16x16 e2 = add_x16(d2,d6); int16x16 e6 = sub_x16(d2,d6); // vector_butterfly 16 80 1 0 int16x16 e1 = add_x16(d1,d5); int16x16 e5 = sub_x16(d1,d5); // vector_butterfly 48 112 1 0 int16x16 e3 = add_x16(d3,d7); int16x16 e7 = sub_x16(d3,d7); // vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 f0 = _mm256_unpacklo_epi32(e0,e1); int16x16 f1 = _mm256_unpackhi_epi32(e0,e1); // vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 f2 = _mm256_unpacklo_epi32(e2,e3); int16x16 f3 = _mm256_unpackhi_epi32(e2,e3); // vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 f4 = _mm256_unpacklo_epi32(e4,e5); int16x16 f5 = _mm256_unpackhi_epi32(e4,e5); // vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 f6 = _mm256_unpacklo_epi32(e6,e7); int16x16 f7 = _mm256_unpackhi_epi32(e6,e7); // stopbatch 128 _mm256_storeu_si256((int16x16 *) (f+0),f0); _mm256_storeu_si256((int16x16 *) (f+16),f1); _mm256_storeu_si256((int16x16 *) (f+32),f2); _mm256_storeu_si256((int16x16 *) (f+48),f3); _mm256_storeu_si256((int16x16 *) (f+64),f4); _mm256_storeu_si256((int16x16 *) (f+80),f5); _mm256_storeu_si256((int16x16 *) (f+96),f6); _mm256_storeu_si256((int16x16 *) (f+112),f7); f += 128; } f -= 128*reps; // startbatch 128 for (long long r = 0;r < reps;++r) { // vector_butterfly 0 32 1 0 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0)); int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32)); int16x16 b0 = add_x16(a0,a2); int16x16 b2 = sub_x16(a0,a2); // vector_butterfly 16 48 1 0 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16)); int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48)); int16x16 b1 = add_x16(a1,a3); int16x16 b3 = sub_x16(a1,a3); // vector_butterfly 64 96 4 1 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64)); int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96)); a6 = mulmod_scaled_x16(a6,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 b4 = add_x16(a4,a6); int16x16 b6 = sub_x16(a4,a6); // vector_butterfly 80 112 4 1 int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80)); int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112)); a7 = mulmod_scaled_x16(a7,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 b5 = add_x16(a5,a7); int16x16 b7 = sub_x16(a5,a7); // vector_reduce 0 b0 = reduce_x16(b0,qdata); // vector_reduce 16 b1 = reduce_x16(b1,qdata); // vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 b2 = mulmod_scaled_x16(b2,precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qdata); // vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 b3 = mulmod_scaled_x16(b3,precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qdata); // vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 b4 = mulmod_scaled_x16(b4,precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qdata); // vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 b5 = 
mulmod_scaled_x16(b5,precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qdata); // vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 b6 = mulmod_scaled_x16(b6,precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qdata); // vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 b7 = mulmod_scaled_x16(b7,precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qdata); // vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 c0 = _mm256_unpacklo_epi64(b0,b4); int16x16 c4 = _mm256_unpackhi_epi64(b0,b4); // vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 c1 = _mm256_unpacklo_epi64(b1,b5); int16x16 c5 = _mm256_unpackhi_epi64(b1,b5); // vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 c2 = _mm256_unpacklo_epi64(b2,b6); int16x16 c6 = _mm256_unpackhi_epi64(b2,b6); // vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 c3 = _mm256_unpacklo_epi64(b3,b7); int16x16 c7 = _mm256_unpackhi_epi64(b3,b7); // vector_butterfly 0 16 1 0 int16x16 d0 = add_x16(c0,c1); int16x16 d1 = sub_x16(c0,c1); // vector_butterfly 64 80 1 0 int16x16 d4 = add_x16(c4,c5); int16x16 d5 = sub_x16(c4,c5); // vector_butterfly 32 48 1 0 int16x16 d2 = add_x16(c2,c3); int16x16 d3 = sub_x16(c2,c3); // vector_butterfly 96 112 1 0 int16x16 d6 = add_x16(c6,c7); int16x16 d7 = sub_x16(c6,c7); // vector_butterfly 0 64 1 0 int16x16 e0 = add_x16(d0,d4); int16x16 e4 = sub_x16(d0,d4); // vector_butterfly 32 96 1 0 int16x16 e2 = add_x16(d2,d6); int16x16 e6 = sub_x16(d2,d6); // vector_butterfly 16 80 4 1 d5 = mulmod_scaled_x16(d5,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 e1 = add_x16(d1,d5); int16x16 e5 = sub_x16(d1,d5); // vector_butterfly 48 112 4 1 d7 = mulmod_scaled_x16(d7,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata); int16x16 e3 = add_x16(d3,d7); int16x16 e7 = sub_x16(d3,d7); // stopbatch 128 _mm256_storeu_si256((int16x16 *) (f+0),e0); _mm256_storeu_si256((int16x16 *) (f+16),e1); _mm256_storeu_si256((int16x16 *) (f+32),e2); _mm256_storeu_si256((int16x16 *) (f+48),e3); _mm256_storeu_si256((int16x16 *) (f+64),e4); _mm256_storeu_si256((int16x16 *) (f+80),e5); _mm256_storeu_si256((int16x16 *) (f+96),e6); _mm256_storeu_si256((int16x16 *) (f+112),e7); f += 128; } f -= 128*reps; // stopntt 512 } void ntt512_7681(int16 *f,int reps) { ntt512(f,reps,qdata_7681); } void ntt512_10753(int16 *f,int reps) { ntt512(f,reps,qdata_10753); } // inv stopntt 512 static void invntt512(int16 *f,int reps,const int16 *qdata) { reps *= 4; // inv stopbatch 128 for (long long r = 0;r < reps;++r) { // inv vector_butterfly 48 112 4 1 int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48)); int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112)); int16x16 b3 = add_x16(a3,a7); int16x16 b7 = sub_x16(a3,a7); b7 = mulmod_scaled_x16(b7,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 16 80 4 1 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16)); int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80)); int16x16 b1 = add_x16(a1,a5); int16x16 b5 = sub_x16(a1,a5); b5 = mulmod_scaled_x16(b5,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 32 96 1 0 int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32)); int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96)); int16x16 b2 = add_x16(a2,a6); int16x16 b6 = sub_x16(a2,a6); // inv vector_butterfly 0 64 1 0 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0)); 
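/* Throughout invntt512, each "// inv X" comment names the forward op of
   ntt512 being undone; the ops run in reverse order and every multiplicative
   constant is replaced by its inverse: zeta_4^1 becomes zeta_4^3, zeta_8^7
   becomes zeta_8^1, and a twist by zeta_n^k becomes a twist by
   zeta_n^(n-k). */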
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64)); int16x16 b0 = add_x16(a0,a4); int16x16 b4 = sub_x16(a0,a4); // inv vector_butterfly 96 112 1 0 int16x16 c6 = add_x16(b6,b7); int16x16 c7 = sub_x16(b6,b7); // inv vector_butterfly 32 48 1 0 int16x16 c2 = add_x16(b2,b3); int16x16 c3 = sub_x16(b2,b3); // inv vector_butterfly 64 80 1 0 int16x16 c4 = add_x16(b4,b5); int16x16 c5 = sub_x16(b4,b5); // inv vector_butterfly 0 16 1 0 int16x16 c0 = add_x16(b0,b1); int16x16 c1 = sub_x16(b0,b1); // inv vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 d3 = _mm256_unpacklo_epi64(c3,c7); int16x16 d7 = _mm256_unpackhi_epi64(c3,c7); // inv vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 d2 = _mm256_unpacklo_epi64(c2,c6); int16x16 d6 = _mm256_unpackhi_epi64(c2,c6); // inv vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 d1 = _mm256_unpacklo_epi64(c1,c5); int16x16 d5 = _mm256_unpackhi_epi64(c1,c5); // inv vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 d0 = _mm256_unpacklo_epi64(c0,c4); int16x16 d4 = _mm256_unpackhi_epi64(c0,c4); // inv vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 d7 = mulmod_scaled_x16(d7,precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qdata); // inv vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 d6 = mulmod_scaled_x16(d6,precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qdata); // inv vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 d5 = mulmod_scaled_x16(d5,precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qdata); // inv vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 d4 = mulmod_scaled_x16(d4,precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qdata); // inv vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 d3 = mulmod_scaled_x16(d3,precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qdata); // inv vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 d2 = mulmod_scaled_x16(d2,precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qdata); // inv vector_reduce 16 d1 = reduce_x16(d1,qdata); // inv vector_reduce 0 d0 = reduce_x16(d0,qdata); // inv vector_butterfly 80 112 4 1 int16x16 e5 = add_x16(d5,d7); int16x16 e7 = sub_x16(d5,d7); e7 = mulmod_scaled_x16(e7,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 64 96 4 1 int16x16 e4 = add_x16(d4,d6); int16x16 e6 = sub_x16(d4,d6); e6 = mulmod_scaled_x16(e6,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 16 48 1 0 int16x16 e1 = add_x16(d1,d3); int16x16 e3 = sub_x16(d1,d3); // inv vector_butterfly 0 32 1 0 int16x16 e0 = add_x16(d0,d2); int16x16 e2 = sub_x16(d0,d2); // inv startbatch 128 _mm256_storeu_si256((int16x16 *) (f+0),e0); _mm256_storeu_si256((int16x16 *) (f+16),e1); _mm256_storeu_si256((int16x16 *) (f+32),e2); _mm256_storeu_si256((int16x16 *) (f+48),e3); _mm256_storeu_si256((int16x16 *) (f+64),e4); _mm256_storeu_si256((int16x16 *) (f+80),e5); _mm256_storeu_si256((int16x16 *) (f+96),e6); _mm256_storeu_si256((int16x16 *) (f+112),e7); f += 128; } f -= 128*reps; // inv stopbatch 128 for (long long r = 0;r < reps;++r) { // inv vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96)); int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112)); int16x16 b6 = 
_mm256_unpacklo_epi32(a6,a7); int16x16 b7 = _mm256_unpackhi_epi32(a6,a7); int16x16 c6 = _mm256_unpacklo_epi32(b6,b7); int16x16 c7 = _mm256_unpackhi_epi32(b6,b7); // inv vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64)); int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80)); int16x16 b4 = _mm256_unpacklo_epi32(a4,a5); int16x16 b5 = _mm256_unpackhi_epi32(a4,a5); int16x16 c4 = _mm256_unpacklo_epi32(b4,b5); int16x16 c5 = _mm256_unpackhi_epi32(b4,b5); // inv vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32)); int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48)); int16x16 b2 = _mm256_unpacklo_epi32(a2,a3); int16x16 b3 = _mm256_unpackhi_epi32(a2,a3); int16x16 c2 = _mm256_unpacklo_epi32(b2,b3); int16x16 c3 = _mm256_unpackhi_epi32(b2,b3); // inv vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0)); int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16)); int16x16 b0 = _mm256_unpacklo_epi32(a0,a1); int16x16 b1 = _mm256_unpackhi_epi32(a0,a1); int16x16 c0 = _mm256_unpacklo_epi32(b0,b1); int16x16 c1 = _mm256_unpackhi_epi32(b0,b1); // inv vector_butterfly 48 112 1 0 int16x16 d3 = add_x16(c3,c7); int16x16 d7 = sub_x16(c3,c7); // inv vector_butterfly 16 80 1 0 int16x16 d1 = add_x16(c1,c5); int16x16 d5 = sub_x16(c1,c5); // inv vector_butterfly 32 96 1 0 int16x16 d2 = add_x16(c2,c6); int16x16 d6 = sub_x16(c2,c6); // inv vector_butterfly 0 64 1 0 int16x16 d0 = add_x16(c0,c4); int16x16 d4 = sub_x16(c0,c4); // inv vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 e5 = _mm256_unpacklo_epi16(d5,d7); int16x16 e7 = _mm256_unpackhi_epi16(d5,d7); int16x16 f5 = _mm256_unpacklo_epi16(e5,e7); int16x16 f7 = _mm256_unpackhi_epi16(e5,e7); int16x16 g5 = _mm256_unpacklo_epi16(f5,f7); int16x16 g7 = _mm256_unpackhi_epi16(f5,f7); // inv vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 e4 = _mm256_unpacklo_epi16(d4,d6); int16x16 e6 = _mm256_unpackhi_epi16(d4,d6); int16x16 f4 = _mm256_unpacklo_epi16(e4,e6); int16x16 f6 = _mm256_unpackhi_epi16(e4,e6); int16x16 g4 = _mm256_unpacklo_epi16(f4,f6); int16x16 g6 = _mm256_unpackhi_epi16(f4,f6); // inv vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 e1 = _mm256_unpacklo_epi16(d1,d3); int16x16 e3 = _mm256_unpackhi_epi16(d1,d3); int16x16 f1 = _mm256_unpacklo_epi16(e1,e3); int16x16 f3 = _mm256_unpackhi_epi16(e1,e3); int16x16 g1 = _mm256_unpacklo_epi16(f1,f3); int16x16 g3 = _mm256_unpackhi_epi16(f1,f3); // inv vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 e0 = _mm256_unpacklo_epi16(d0,d2); int16x16 e2 = _mm256_unpackhi_epi16(d0,d2); int16x16 f0 = _mm256_unpacklo_epi16(e0,e2); int16x16 f2 = _mm256_unpackhi_epi16(e0,e2); int16x16 g0 = _mm256_unpacklo_epi16(f0,f2); int16x16 g2 = _mm256_unpackhi_epi16(f0,f2); // inv vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 g7 = mulmod_scaled_x16(g7,precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qdata); // inv vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 g3 = mulmod_scaled_x16(g3,precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qdata); // inv vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 g6 = 
mulmod_scaled_x16(g6,precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qdata); // inv vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 g2 = mulmod_scaled_x16(g2,precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qdata); // inv vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 g5 = mulmod_scaled_x16(g5,precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qdata); // inv vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 g1 = mulmod_scaled_x16(g1,precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qdata); // inv vector_reduce 64 g4 = reduce_x16(g4,qdata); // inv vector_reduce 0 g0 = reduce_x16(g0,qdata); // inv vector_butterfly 96 112 4 1 int16x16 h6 = add_x16(g6,g7); int16x16 h7 = sub_x16(g6,g7); h7 = mulmod_scaled_x16(h7,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 32 48 4 1 int16x16 h2 = add_x16(g2,g3); int16x16 h3 = sub_x16(g2,g3); h3 = mulmod_scaled_x16(h3,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 64 80 1 0 int16x16 h4 = add_x16(g4,g5); int16x16 h5 = sub_x16(g4,g5); // inv vector_butterfly 0 16 1 0 int16x16 h0 = add_x16(g0,g1); int16x16 h1 = sub_x16(g0,g1); // inv vector_butterfly 80 112 1 0 int16x16 i5 = add_x16(h5,h7); int16x16 i7 = sub_x16(h5,h7); // inv vector_butterfly 16 48 1 0 int16x16 i1 = add_x16(h1,h3); int16x16 i3 = sub_x16(h1,h3); // inv vector_butterfly 64 96 1 0 int16x16 i4 = add_x16(h4,h6); int16x16 i6 = sub_x16(h4,h6); // inv vector_butterfly 0 32 1 0 int16x16 i0 = add_x16(h0,h2); int16x16 i2 = sub_x16(h0,h2); // inv startbatch 128 _mm256_storeu_si256((int16x16 *) (f+0),i0); _mm256_storeu_si256((int16x16 *) (f+16),i1); _mm256_storeu_si256((int16x16 *) (f+32),i2); _mm256_storeu_si256((int16x16 *) (f+48),i3); _mm256_storeu_si256((int16x16 *) (f+64),i4); _mm256_storeu_si256((int16x16 *) (f+80),i5); _mm256_storeu_si256((int16x16 *) (f+96),i6); _mm256_storeu_si256((int16x16 *) (f+112),i7); f += 128; } f -= 128*reps; // inv doublereps reps /= 2; // inv doublereps reps /= 2; // inv stopbatch 512 for (long long r = 0;r < reps;++r) { // inv vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f+432)); int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f+496)); int16x16 b27 = _mm256_permute2x128_si256_lo(a27,a31); int16x16 b31 = _mm256_permute2x128_si256_hi(a27,a31); // inv vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f+304)); int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f+368)); int16x16 b19 = _mm256_permute2x128_si256_lo(a19,a23); int16x16 b23 = _mm256_permute2x128_si256_hi(a19,a23); // inv vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f+176)); int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f+240)); int16x16 b11 = _mm256_permute2x128_si256_lo(a11,a15); int16x16 b15 = _mm256_permute2x128_si256_hi(a11,a15); // inv vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48)); int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112)); int16x16 b3 = _mm256_permute2x128_si256_lo(a3,a7); int16x16 b7 = _mm256_permute2x128_si256_hi(a3,a7); // inv vector_twist 496 512 507 48 49 
50 51 52 53 54 55 56 57 58 59 60 61 62 63 b31 = mulmod_scaled_x16(b31,precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // inv vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b27 = mulmod_scaled_x16(b27,precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // inv vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b23 = mulmod_scaled_x16(b23,precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // inv vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b19 = mulmod_scaled_x16(b19,precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // inv vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b15 = mulmod_scaled_x16(b15,precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // inv vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b11 = mulmod_scaled_x16(b11,precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // inv vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b7 = mulmod_scaled_x16(b7,precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata); // inv vector_reduce 48 b3 = reduce_x16(b3,qdata); // inv vector_butterfly 432 496 8 7 int16x16 c27 = add_x16(b27,b31); int16x16 c31 = sub_x16(b27,b31); c31 = mulmod_scaled_x16(c31,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata); // inv vector_butterfly 304 368 8 1 int16x16 c19 = add_x16(b19,b23); int16x16 c23 = sub_x16(b19,b23); c23 = mulmod_scaled_x16(c23,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata); // inv vector_butterfly 176 240 4 1 int16x16 c11 = add_x16(b11,b15); int16x16 c15 = sub_x16(b11,b15); c15 = mulmod_scaled_x16(c15,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 48 112 1 0 int16x16 c3 = add_x16(b3,b7); int16x16 c7 = sub_x16(b3,b7); // inv vector_reduce_ifforward 112 // inv vector_butterfly 368 496 4 1 int16x16 d23 = add_x16(c23,c31); int16x16 d31 = sub_x16(c23,c31); d31 = mulmod_scaled_x16(d31,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 304 432 4 1 int16x16 d19 = add_x16(c19,c27); int16x16 d27 = sub_x16(c19,c27); d27 = mulmod_scaled_x16(d27,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 112 240 1 0 int16x16 d7 = add_x16(c7,c15); int16x16 d15 = sub_x16(c7,c15); // inv vector_butterfly 48 176 1 0 int16x16 d3 = add_x16(c3,c11); int16x16 d11 = sub_x16(c3,c11); // inv vector_reduce_ifreverse 304 d19 = reduce_x16(d19,qdata); // inv vector_reduce_ifreverse 48 d3 = reduce_x16(d3,qdata); // inv vector_butterfly 240 496 1 0 int16x16 e15 = add_x16(d15,d31); int16x16 e31 = sub_x16(d15,d31); // inv vector_butterfly 112 368 1 0 int16x16 e7 = add_x16(d7,d23); int16x16 e23 = sub_x16(d7,d23); // inv vector_butterfly 176 432 1 0 int16x16 e11 = add_x16(d11,d27); int16x16 e27 = sub_x16(d11,d27); // inv vector_butterfly 48 304 1 0 int16x16 e3 = add_x16(d3,d19); int16x16 e19 = sub_x16(d3,d19); // inv startbatch 512 _mm256_storeu_si256((int16x16 *) (f+48),e3); 
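/* The reduce_if* markers are direction-dependent: the
   "inv vector_reduce_ifforward" lines above carry no code, while the
   "inv vector_reduce_ifreverse" lines do call reduce_x16, mirroring ntt512,
   where only the ifforward reductions are emitted. */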
_mm256_storeu_si256((int16x16 *) (f+112),e7); _mm256_storeu_si256((int16x16 *) (f+176),e11); _mm256_storeu_si256((int16x16 *) (f+240),e15); _mm256_storeu_si256((int16x16 *) (f+304),e19); _mm256_storeu_si256((int16x16 *) (f+368),e23); _mm256_storeu_si256((int16x16 *) (f+432),e27); _mm256_storeu_si256((int16x16 *) (f+496),e31); f += 512; } f -= 512*reps; // inv stopbatch 512 for (long long r = 0;r < reps;++r) { // inv vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f+416)); int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f+480)); int16x16 b26 = _mm256_permute2x128_si256_lo(a26,a30); int16x16 b30 = _mm256_permute2x128_si256_hi(a26,a30); // inv vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f+288)); int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f+352)); int16x16 b18 = _mm256_permute2x128_si256_lo(a18,a22); int16x16 b22 = _mm256_permute2x128_si256_hi(a18,a22); // inv vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f+160)); int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f+224)); int16x16 b10 = _mm256_permute2x128_si256_lo(a10,a14); int16x16 b14 = _mm256_permute2x128_si256_hi(a10,a14); // inv vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32)); int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96)); int16x16 b2 = _mm256_permute2x128_si256_lo(a2,a6); int16x16 b6 = _mm256_permute2x128_si256_hi(a2,a6); // inv vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b30 = mulmod_scaled_x16(b30,precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // inv vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b26 = mulmod_scaled_x16(b26,precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // inv vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b22 = mulmod_scaled_x16(b22,precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // inv vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b18 = mulmod_scaled_x16(b18,precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // inv vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b14 = mulmod_scaled_x16(b14,precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // inv vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b10 = mulmod_scaled_x16(b10,precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // inv vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b6 = mulmod_scaled_x16(b6,precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata); // inv vector_reduce 32 b2 = reduce_x16(b2,qdata); // inv vector_butterfly 416 480 8 7 int16x16 c26 = add_x16(b26,b30); int16x16 c30 = sub_x16(b26,b30); c30 = 
mulmod_scaled_x16(c30,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata); // inv vector_butterfly 288 352 8 1 int16x16 c18 = add_x16(b18,b22); int16x16 c22 = sub_x16(b18,b22); c22 = mulmod_scaled_x16(c22,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata); // inv vector_butterfly 160 224 4 1 int16x16 c10 = add_x16(b10,b14); int16x16 c14 = sub_x16(b10,b14); c14 = mulmod_scaled_x16(c14,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 32 96 1 0 int16x16 c2 = add_x16(b2,b6); int16x16 c6 = sub_x16(b2,b6); // inv vector_reduce_ifforward 96 // inv vector_butterfly 352 480 4 1 int16x16 d22 = add_x16(c22,c30); int16x16 d30 = sub_x16(c22,c30); d30 = mulmod_scaled_x16(d30,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 288 416 4 1 int16x16 d18 = add_x16(c18,c26); int16x16 d26 = sub_x16(c18,c26); d26 = mulmod_scaled_x16(d26,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 96 224 1 0 int16x16 d6 = add_x16(c6,c14); int16x16 d14 = sub_x16(c6,c14); // inv vector_butterfly 32 160 1 0 int16x16 d2 = add_x16(c2,c10); int16x16 d10 = sub_x16(c2,c10); // inv vector_reduce_ifreverse 288 d18 = reduce_x16(d18,qdata); // inv vector_reduce_ifreverse 32 d2 = reduce_x16(d2,qdata); // inv vector_butterfly 224 480 1 0 int16x16 e14 = add_x16(d14,d30); int16x16 e30 = sub_x16(d14,d30); // inv vector_butterfly 96 352 1 0 int16x16 e6 = add_x16(d6,d22); int16x16 e22 = sub_x16(d6,d22); // inv vector_butterfly 160 416 1 0 int16x16 e10 = add_x16(d10,d26); int16x16 e26 = sub_x16(d10,d26); // inv vector_butterfly 32 288 1 0 int16x16 e2 = add_x16(d2,d18); int16x16 e18 = sub_x16(d2,d18); // inv startbatch 512 _mm256_storeu_si256((int16x16 *) (f+32),e2); _mm256_storeu_si256((int16x16 *) (f+96),e6); _mm256_storeu_si256((int16x16 *) (f+160),e10); _mm256_storeu_si256((int16x16 *) (f+224),e14); _mm256_storeu_si256((int16x16 *) (f+288),e18); _mm256_storeu_si256((int16x16 *) (f+352),e22); _mm256_storeu_si256((int16x16 *) (f+416),e26); _mm256_storeu_si256((int16x16 *) (f+480),e30); f += 512; } f -= 512*reps; // inv stopbatch 512 for (long long r = 0;r < reps;++r) { // inv vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f+400)); int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f+464)); int16x16 b25 = _mm256_permute2x128_si256_lo(a25,a29); int16x16 b29 = _mm256_permute2x128_si256_hi(a25,a29); // inv vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f+272)); int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f+336)); int16x16 b17 = _mm256_permute2x128_si256_lo(a17,a21); int16x16 b21 = _mm256_permute2x128_si256_hi(a17,a21); // inv vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f+144)); int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f+208)); int16x16 b9 = _mm256_permute2x128_si256_lo(a9,a13); int16x16 b13 = _mm256_permute2x128_si256_hi(a9,a13); // inv vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16)); int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80)); int16x16 b1 = _mm256_permute2x128_si256_lo(a1,a5); int16x16 b5 = _mm256_permute2x128_si256_hi(a1,a5); // inv vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b29 = 
mulmod_scaled_x16(b29,precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // inv vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b25 = mulmod_scaled_x16(b25,precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // inv vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b21 = mulmod_scaled_x16(b21,precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // inv vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b17 = mulmod_scaled_x16(b17,precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // inv vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b13 = mulmod_scaled_x16(b13,precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // inv vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b9 = mulmod_scaled_x16(b9,precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // inv vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b5 = mulmod_scaled_x16(b5,precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata); // inv vector_reduce 16 b1 = reduce_x16(b1,qdata); // inv vector_butterfly 400 464 8 7 int16x16 c25 = add_x16(b25,b29); int16x16 c29 = sub_x16(b25,b29); c29 = mulmod_scaled_x16(c29,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata); // inv vector_butterfly 272 336 8 1 int16x16 c17 = add_x16(b17,b21); int16x16 c21 = sub_x16(b17,b21); c21 = mulmod_scaled_x16(c21,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata); // inv vector_butterfly 144 208 4 1 int16x16 c9 = add_x16(b9,b13); int16x16 c13 = sub_x16(b9,b13); c13 = mulmod_scaled_x16(c13,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 16 80 1 0 int16x16 c1 = add_x16(b1,b5); int16x16 c5 = sub_x16(b1,b5); // inv vector_reduce_ifforward 80 // inv vector_butterfly 336 464 4 1 int16x16 d21 = add_x16(c21,c29); int16x16 d29 = sub_x16(c21,c29); d29 = mulmod_scaled_x16(d29,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 272 400 4 1 int16x16 d17 = add_x16(c17,c25); int16x16 d25 = sub_x16(c17,c25); d25 = mulmod_scaled_x16(d25,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 80 208 1 0 int16x16 d5 = add_x16(c5,c13); int16x16 d13 = sub_x16(c5,c13); // inv vector_butterfly 16 144 1 0 int16x16 d1 = add_x16(c1,c9); int16x16 d9 = sub_x16(c1,c9); // inv vector_reduce_ifreverse 272 d17 = reduce_x16(d17,qdata); // inv vector_reduce_ifreverse 16 d1 = reduce_x16(d1,qdata); // inv vector_butterfly 208 464 1 0 int16x16 e13 = add_x16(d13,d29); int16x16 e29 = sub_x16(d13,d29); // inv vector_butterfly 80 336 1 0 int16x16 e5 = add_x16(d5,d21); int16x16 e21 = sub_x16(d5,d21); // inv vector_butterfly 144 400 1 0 int16x16 e9 = add_x16(d9,d25); int16x16 e25 = sub_x16(d9,d25); // inv vector_butterfly 16 272 1 0 int16x16 e1 = add_x16(d1,d17); int16x16 e17 = sub_x16(d1,d17); // inv startbatch 512 _mm256_storeu_si256((int16x16 *) (f+16),e1); _mm256_storeu_si256((int16x16 *) (f+80),e5); 
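/* The four 512-wide inverse passes are identical up to the 16-lane slice of
   each transform they touch, and they run over the slices in the order
   48, 32, 16, 0, the reverse of the 0, 16, 32, 48 order used in ntt512. */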
_mm256_storeu_si256((int16x16 *) (f+144),e9); _mm256_storeu_si256((int16x16 *) (f+208),e13); _mm256_storeu_si256((int16x16 *) (f+272),e17); _mm256_storeu_si256((int16x16 *) (f+336),e21); _mm256_storeu_si256((int16x16 *) (f+400),e25); _mm256_storeu_si256((int16x16 *) (f+464),e29); f += 512; } f -= 512*reps; // inv stopbatch 512 for (long long r = 0;r < reps;++r) { // inv vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f+384)); int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f+448)); int16x16 b24 = _mm256_permute2x128_si256_lo(a24,a28); int16x16 b28 = _mm256_permute2x128_si256_hi(a24,a28); // inv vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f+256)); int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f+320)); int16x16 b16 = _mm256_permute2x128_si256_lo(a16,a20); int16x16 b20 = _mm256_permute2x128_si256_hi(a16,a20); // inv vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f+128)); int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f+192)); int16x16 b8 = _mm256_permute2x128_si256_lo(a8,a12); int16x16 b12 = _mm256_permute2x128_si256_hi(a8,a12); // inv vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0)); int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64)); int16x16 b0 = _mm256_permute2x128_si256_lo(a0,a4); int16x16 b4 = _mm256_permute2x128_si256_hi(a0,a4); // inv vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b28 = mulmod_scaled_x16(b28,precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // inv vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b24 = mulmod_scaled_x16(b24,precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // inv vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b20 = mulmod_scaled_x16(b20,precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // inv vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b16 = mulmod_scaled_x16(b16,precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // inv vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b12 = mulmod_scaled_x16(b12,precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // inv vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b8 = mulmod_scaled_x16(b8,precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // inv vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b4 = mulmod_scaled_x16(b4,precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata); // inv vector_reduce 0 b0 = reduce_x16(b0,qdata); // inv vector_butterfly 384 448 8 7 int16x16 c24 = add_x16(b24,b28); int16x16 c28 = sub_x16(b24,b28); c28 = mulmod_scaled_x16(c28,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata); // inv vector_butterfly 256 320 8 1 int16x16 c16 = add_x16(b16,b20); int16x16 c20 = sub_x16(b16,b20); c20 = mulmod_scaled_x16(c20,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata); // inv vector_butterfly 128 192 4 1 int16x16 c8 = 
add_x16(b8,b12); int16x16 c12 = sub_x16(b8,b12); c12 = mulmod_scaled_x16(c12,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 0 64 1 0 int16x16 c0 = add_x16(b0,b4); int16x16 c4 = sub_x16(b0,b4); // inv vector_reduce_ifforward 64 // inv vector_butterfly 320 448 4 1 int16x16 d20 = add_x16(c20,c28); int16x16 d28 = sub_x16(c20,c28); d28 = mulmod_scaled_x16(d28,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 256 384 4 1 int16x16 d16 = add_x16(c16,c24); int16x16 d24 = sub_x16(c16,c24); d24 = mulmod_scaled_x16(d24,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata); // inv vector_butterfly 64 192 1 0 int16x16 d4 = add_x16(c4,c12); int16x16 d12 = sub_x16(c4,c12); // inv vector_butterfly 0 128 1 0 int16x16 d0 = add_x16(c0,c8); int16x16 d8 = sub_x16(c0,c8); // inv vector_reduce_ifreverse 256 d16 = reduce_x16(d16,qdata); // inv vector_reduce_ifreverse 0 d0 = reduce_x16(d0,qdata); // inv vector_butterfly 192 448 1 0 int16x16 e12 = add_x16(d12,d28); int16x16 e28 = sub_x16(d12,d28); // inv vector_butterfly 64 320 1 0 int16x16 e4 = add_x16(d4,d20); int16x16 e20 = sub_x16(d4,d20); // inv vector_butterfly 128 384 1 0 int16x16 e8 = add_x16(d8,d24); int16x16 e24 = sub_x16(d8,d24); // inv vector_butterfly 0 256 1 0 int16x16 e0 = add_x16(d0,d16); int16x16 e16 = sub_x16(d0,d16); // inv startbatch 512 _mm256_storeu_si256((int16x16 *) (f+0),e0); _mm256_storeu_si256((int16x16 *) (f+64),e4); _mm256_storeu_si256((int16x16 *) (f+128),e8); _mm256_storeu_si256((int16x16 *) (f+192),e12); _mm256_storeu_si256((int16x16 *) (f+256),e16); _mm256_storeu_si256((int16x16 *) (f+320),e20); _mm256_storeu_si256((int16x16 *) (f+384),e24); _mm256_storeu_si256((int16x16 *) (f+448),e28); f += 512; } f -= 512*reps; // inv startntt 512 } void invntt512_7681(int16 *f,int reps) { invntt512(f,reps,qdata_7681); } void invntt512_10753(int16 *f,int reps) { invntt512(f,reps,qdata_10753); }
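/* The vector helpers used throughout (add_x16, sub_x16, reduce_x16,
   mulmod_scaled_x16) are defined in ntt.h and are not shown in this file.
   The paired constants (scaledzeta_x16_n_k, qinvscaledzeta_x16_n_k) suggest
   the standard 16-bit signed-Montgomery multiplication by a constant.  The
   guarded scalar sketch below models one lane of that operation under this
   assumption; all names in it (NTT_SCALAR_DEMO, demo_qinv16,
   demo_montmul_const) are illustrative and not part of the library.  The
   block is self-contained and can be compiled as its own translation unit
   with -DNTT_SCALAR_DEMO. */
#ifdef NTT_SCALAR_DEMO
#include <stdint.h>
#include <stdio.h>

/* q^{-1} mod 2^16 for odd q, by Newton iteration; each step doubles the
   number of correct low bits, so four steps are ample. */
static uint16_t demo_qinv16(uint16_t q)
{
  uint32_t x = q; /* q*q == 1 mod 8 for odd q, so x is correct mod 8 */
  for (int i = 0;i < 4;++i) x = (x*(2-q*x)) & 0xffff;
  return (uint16_t) x;
}

/* One lane of the assumed mulmod_scaled semantics: the constant c is encoded
   as cR = c*65536 mod q together with cRqinv = cR*qinv mod 65536.  Then
   a*cR - lo*q is an exact multiple of 65536, and the quotient is congruent
   to a*c mod q with absolute value below q for the ranges exercised here.
   (Two's-complement wraparound on the int16_t casts is assumed.) */
static int16_t demo_montmul_const(int16_t a,int16_t cR,int16_t cRqinv,int16_t q)
{
  int16_t lo = (int16_t) (a*cRqinv);                     /* low 16 bits of a*cRqinv */
  int32_t t = ((int32_t) a*cR-(int32_t) lo*q)/65536;     /* exact division */
  return (int16_t) t;
}

/* Spot-check the identity for q = 7681; q = 10753 works the same way. */
int main(void)
{
  const int16_t q = 7681;
  uint16_t qinv = demo_qinv16((uint16_t) q);
  int ok = 1;
  for (int32_t c = -100;c <= 100;++c) {                  /* arbitrary small constants */
    int16_t cR = (int16_t) ((c*65536)%q);                /* c*2^16 mod q, in (-q,q) */
    int16_t cRqinv = (int16_t) (uint16_t) ((uint16_t) cR*(uint32_t) qinv);
    for (int32_t a = -7680;a <= 7680;a += 97) {
      int32_t t = demo_montmul_const((int16_t) a,cR,cRqinv,q);
      if ((t-a*c)%q != 0 || t <= -q || t >= q) ok = 0;   /* t == a*c mod q, |t| < q */
    }
  }
  printf(ok ? "montmul model ok\n" : "montmul model BROKEN\n");
  return !ok;
}
#endif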