xref: /OK3568_Linux_fs/buildroot/package/x265/0002-arm-asm-primitives.patch (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1Fixes "arm assembly fail to compile on 1.8"
2
3Downloaded from upstream bug report:
4https://bitbucket.org/multicoreware/x265/issues/406
5
6Signed-off-by: Bernd Kuhls <bernd.kuhls@t-online.de>
7
8--- ./source/common/arm/asm-primitives.cpp.orig	2018-05-21 02:33:10.000000000 -0600
9+++ ./source/common/arm/asm-primitives.cpp	2018-05-28 20:38:37.302378303 -0600
10@@ -48,77 +48,77 @@ void setupAssemblyPrimitives(EncoderPrim
11         p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
12
13         // addAvg
14-         p.pu[LUMA_4x4].addAvg   = PFX(addAvg_4x4_neon);
15-         p.pu[LUMA_4x8].addAvg   = PFX(addAvg_4x8_neon);
16-         p.pu[LUMA_4x16].addAvg  = PFX(addAvg_4x16_neon);
17-         p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_neon);
18-         p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_neon);
19-         p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_neon);
20-         p.pu[LUMA_8x32].addAvg  = PFX(addAvg_8x32_neon);
21-         p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_neon);
22-         p.pu[LUMA_16x4].addAvg  = PFX(addAvg_16x4_neon);
23-         p.pu[LUMA_16x8].addAvg  = PFX(addAvg_16x8_neon);
24-         p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_neon);
25-         p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_neon);
26-         p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_neon);
27-         p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_neon);
28-         p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_neon);
29-         p.pu[LUMA_32x8].addAvg  = PFX(addAvg_32x8_neon);
30-         p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_neon);
31-         p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_neon);
32-         p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_neon);
33-         p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_neon);
34-         p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_neon);
35-         p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_neon);
36-         p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_neon);
37-         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_neon);
38-         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_neon);
39+         p.pu[LUMA_4x4].addAvg[ALIGNED]   = PFX(addAvg_4x4_neon);
40+         p.pu[LUMA_4x8].addAvg[ALIGNED]   = PFX(addAvg_4x8_neon);
41+         p.pu[LUMA_4x16].addAvg[ALIGNED]  = PFX(addAvg_4x16_neon);
42+         p.pu[LUMA_8x4].addAvg[ALIGNED]   = PFX(addAvg_8x4_neon);
43+         p.pu[LUMA_8x8].addAvg[ALIGNED]   = PFX(addAvg_8x8_neon);
44+         p.pu[LUMA_8x16].addAvg[ALIGNED]  = PFX(addAvg_8x16_neon);
45+         p.pu[LUMA_8x32].addAvg[ALIGNED]  = PFX(addAvg_8x32_neon);
46+         p.pu[LUMA_12x16].addAvg[ALIGNED] = PFX(addAvg_12x16_neon);
47+         p.pu[LUMA_16x4].addAvg[ALIGNED]  = PFX(addAvg_16x4_neon);
48+         p.pu[LUMA_16x8].addAvg[ALIGNED]  = PFX(addAvg_16x8_neon);
49+         p.pu[LUMA_16x12].addAvg[ALIGNED] = PFX(addAvg_16x12_neon);
50+         p.pu[LUMA_16x16].addAvg[ALIGNED] = PFX(addAvg_16x16_neon);
51+         p.pu[LUMA_16x32].addAvg[ALIGNED] = PFX(addAvg_16x32_neon);
52+         p.pu[LUMA_16x64].addAvg[ALIGNED] = PFX(addAvg_16x64_neon);
53+         p.pu[LUMA_24x32].addAvg[ALIGNED] = PFX(addAvg_24x32_neon);
54+         p.pu[LUMA_32x8].addAvg[ALIGNED]  = PFX(addAvg_32x8_neon);
55+         p.pu[LUMA_32x16].addAvg[ALIGNED] = PFX(addAvg_32x16_neon);
56+         p.pu[LUMA_32x24].addAvg[ALIGNED] = PFX(addAvg_32x24_neon);
57+         p.pu[LUMA_32x32].addAvg[ALIGNED] = PFX(addAvg_32x32_neon);
58+         p.pu[LUMA_32x64].addAvg[ALIGNED] = PFX(addAvg_32x64_neon);
59+         p.pu[LUMA_48x64].addAvg[ALIGNED] = PFX(addAvg_48x64_neon);
60+         p.pu[LUMA_64x16].addAvg[ALIGNED] = PFX(addAvg_64x16_neon);
61+         p.pu[LUMA_64x32].addAvg[ALIGNED] = PFX(addAvg_64x32_neon);
62+         p.pu[LUMA_64x48].addAvg[ALIGNED] = PFX(addAvg_64x48_neon);
63+         p.pu[LUMA_64x64].addAvg[ALIGNED] = PFX(addAvg_64x64_neon);
64
65         // chroma addAvg
66-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg   = PFX(addAvg_4x2_neon);
67-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg   = PFX(addAvg_4x4_neon);
68-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg   = PFX(addAvg_4x8_neon);
69-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg  = PFX(addAvg_4x16_neon);
70-        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg   = PFX(addAvg_6x8_neon);
71-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   = PFX(addAvg_8x2_neon);
72-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   = PFX(addAvg_8x4_neon);
73-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   = PFX(addAvg_8x6_neon);
74-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   = PFX(addAvg_8x8_neon);
75-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  = PFX(addAvg_8x16_neon);
76-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  = PFX(addAvg_8x32_neon);
77-        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_neon);
78-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  = PFX(addAvg_16x4_neon);
79-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  = PFX(addAvg_16x8_neon);
80-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_neon);
81-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_neon);
82-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_neon);
83-        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg = PFX(addAvg_24x32_neon);
84-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg  = PFX(addAvg_32x8_neon);
85-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_neon);
86-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_neon);
87-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_neon);
88-
89-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg   = PFX(addAvg_4x8_neon);
90-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg  = PFX(addAvg_4x16_neon);
91-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg  = PFX(addAvg_4x32_neon);
92-        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg  = PFX(addAvg_6x16_neon);
93-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg   = PFX(addAvg_8x4_neon);
94-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg   = PFX(addAvg_8x8_neon);
95-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg  = PFX(addAvg_8x12_neon);
96-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg  = PFX(addAvg_8x16_neon);
97-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg  = PFX(addAvg_8x32_neon);
98-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg  = PFX(addAvg_8x64_neon);
99-        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_neon);
100-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg  = PFX(addAvg_16x8_neon);
101-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_neon);
102-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_neon);
103-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_neon);
104-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_neon);
105-        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_neon);
106-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_neon);
107-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_neon);
108-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_neon);
109-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_neon);
110+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg[ALIGNED]   = PFX(addAvg_4x2_neon);
111+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg[ALIGNED]   = PFX(addAvg_4x4_neon);
112+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg[ALIGNED]   = PFX(addAvg_4x8_neon);
113+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg[ALIGNED]  = PFX(addAvg_4x16_neon);
114+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg[ALIGNED]   = PFX(addAvg_6x8_neon);
115+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg[ALIGNED]   = PFX(addAvg_8x2_neon);
116+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg[ALIGNED]   = PFX(addAvg_8x4_neon);
117+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg[ALIGNED]   = PFX(addAvg_8x6_neon);
118+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg[ALIGNED]   = PFX(addAvg_8x8_neon);
119+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg[ALIGNED]  = PFX(addAvg_8x16_neon);
120+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg[ALIGNED]  = PFX(addAvg_8x32_neon);
121+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg[ALIGNED] = PFX(addAvg_12x16_neon);
122+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg[ALIGNED]  = PFX(addAvg_16x4_neon);
123+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg[ALIGNED]  = PFX(addAvg_16x8_neon);
124+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg[ALIGNED] = PFX(addAvg_16x12_neon);
125+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg[ALIGNED] = PFX(addAvg_16x16_neon);
126+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg[ALIGNED] = PFX(addAvg_16x32_neon);
127+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg[ALIGNED] = PFX(addAvg_24x32_neon);
128+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[ALIGNED]  = PFX(addAvg_32x8_neon);
129+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[ALIGNED] = PFX(addAvg_32x16_neon);
130+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[ALIGNED] = PFX(addAvg_32x24_neon);
131+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[ALIGNED] = PFX(addAvg_32x32_neon);
132+
133+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg[ALIGNED]   = PFX(addAvg_4x8_neon);
134+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg[ALIGNED]  = PFX(addAvg_4x16_neon);
135+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg[ALIGNED]  = PFX(addAvg_4x32_neon);
136+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg[ALIGNED]  = PFX(addAvg_6x16_neon);
137+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg[ALIGNED]   = PFX(addAvg_8x4_neon);
138+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg[ALIGNED]   = PFX(addAvg_8x8_neon);
139+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg[ALIGNED]  = PFX(addAvg_8x12_neon);
140+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg[ALIGNED]  = PFX(addAvg_8x16_neon);
141+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg[ALIGNED]  = PFX(addAvg_8x32_neon);
142+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg[ALIGNED]  = PFX(addAvg_8x64_neon);
143+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg[ALIGNED] = PFX(addAvg_12x32_neon);
144+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[ALIGNED]  = PFX(addAvg_16x8_neon);
145+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg[ALIGNED] = PFX(addAvg_16x16_neon);
146+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[ALIGNED] = PFX(addAvg_16x24_neon);
147+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg[ALIGNED] = PFX(addAvg_16x32_neon);
148+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[ALIGNED] = PFX(addAvg_16x64_neon);
149+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg[ALIGNED] = PFX(addAvg_24x64_neon);
150+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[ALIGNED] = PFX(addAvg_32x16_neon);
151+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[ALIGNED] = PFX(addAvg_32x32_neon);
152+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[ALIGNED] = PFX(addAvg_32x48_neon);
153+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[ALIGNED] = PFX(addAvg_32x64_neon);
154
155         // quant
156          p.quant = PFX(quant_neon);
157@@ -402,7 +402,7 @@ void setupAssemblyPrimitives(EncoderPrim
158         p.scale2D_64to32  = PFX(scale2D_64to32_neon);
159
160         // scale1D_128to64
161-        p.scale1D_128to64 = PFX(scale1D_128to64_neon);
162+        p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_neon);
163
164         // copy_count
165         p.cu[BLOCK_4x4].copy_cnt     = PFX(copy_cnt_4_neon);
166@@ -411,37 +411,37 @@ void setupAssemblyPrimitives(EncoderPrim
167         p.cu[BLOCK_32x32].copy_cnt   = PFX(copy_cnt_32_neon);
168
169         // filterPixelToShort
170-        p.pu[LUMA_4x4].convert_p2s   = PFX(filterPixelToShort_4x4_neon);
171-        p.pu[LUMA_4x8].convert_p2s   = PFX(filterPixelToShort_4x8_neon);
172-        p.pu[LUMA_4x16].convert_p2s  = PFX(filterPixelToShort_4x16_neon);
173-        p.pu[LUMA_8x4].convert_p2s   = PFX(filterPixelToShort_8x4_neon);
174-        p.pu[LUMA_8x8].convert_p2s   = PFX(filterPixelToShort_8x8_neon);
175-        p.pu[LUMA_8x16].convert_p2s  = PFX(filterPixelToShort_8x16_neon);
176-        p.pu[LUMA_8x32].convert_p2s  = PFX(filterPixelToShort_8x32_neon);
177-        p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);
178-        p.pu[LUMA_16x4].convert_p2s  = PFX(filterPixelToShort_16x4_neon);
179-        p.pu[LUMA_16x8].convert_p2s  = PFX(filterPixelToShort_16x8_neon);
180-        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);
181-        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);
182-        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);
183-        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);
184-        p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);
185-        p.pu[LUMA_32x8].convert_p2s  = PFX(filterPixelToShort_32x8_neon);
186-        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);
187-        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);
188-        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);
189-        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);
190-        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);
191-        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);
192-        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);
193-        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);
194-        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);
195+        p.pu[LUMA_4x4].convert_p2s[ALIGNED]   = PFX(filterPixelToShort_4x4_neon);
196+        p.pu[LUMA_4x8].convert_p2s[ALIGNED]   = PFX(filterPixelToShort_4x8_neon);
197+        p.pu[LUMA_4x16].convert_p2s[ALIGNED]  = PFX(filterPixelToShort_4x16_neon);
198+        p.pu[LUMA_8x4].convert_p2s[ALIGNED]   = PFX(filterPixelToShort_8x4_neon);
199+        p.pu[LUMA_8x8].convert_p2s[ALIGNED]   = PFX(filterPixelToShort_8x8_neon);
200+        p.pu[LUMA_8x16].convert_p2s[ALIGNED]  = PFX(filterPixelToShort_8x16_neon);
201+        p.pu[LUMA_8x32].convert_p2s[ALIGNED]  = PFX(filterPixelToShort_8x32_neon);
202+        p.pu[LUMA_12x16].convert_p2s[ALIGNED] = PFX(filterPixelToShort_12x16_neon);
203+        p.pu[LUMA_16x4].convert_p2s[ALIGNED]  = PFX(filterPixelToShort_16x4_neon);
204+        p.pu[LUMA_16x8].convert_p2s[ALIGNED]  = PFX(filterPixelToShort_16x8_neon);
205+        p.pu[LUMA_16x12].convert_p2s[ALIGNED] = PFX(filterPixelToShort_16x12_neon);
206+        p.pu[LUMA_16x16].convert_p2s[ALIGNED] = PFX(filterPixelToShort_16x16_neon);
207+        p.pu[LUMA_16x32].convert_p2s[ALIGNED] = PFX(filterPixelToShort_16x32_neon);
208+        p.pu[LUMA_16x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_16x64_neon);
209+        p.pu[LUMA_24x32].convert_p2s[ALIGNED] = PFX(filterPixelToShort_24x32_neon);
210+        p.pu[LUMA_32x8].convert_p2s[ALIGNED]  = PFX(filterPixelToShort_32x8_neon);
211+        p.pu[LUMA_32x16].convert_p2s[ALIGNED] = PFX(filterPixelToShort_32x16_neon);
212+        p.pu[LUMA_32x24].convert_p2s[ALIGNED] = PFX(filterPixelToShort_32x24_neon);
213+        p.pu[LUMA_32x32].convert_p2s[ALIGNED] = PFX(filterPixelToShort_32x32_neon);
214+        p.pu[LUMA_32x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_32x64_neon);
215+        p.pu[LUMA_48x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_48x64_neon);
216+        p.pu[LUMA_64x16].convert_p2s[ALIGNED] = PFX(filterPixelToShort_64x16_neon);
217+        p.pu[LUMA_64x32].convert_p2s[ALIGNED] = PFX(filterPixelToShort_64x32_neon);
218+        p.pu[LUMA_64x48].convert_p2s[ALIGNED] = PFX(filterPixelToShort_64x48_neon);
219+        p.pu[LUMA_64x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_64x64_neon);
220
221         // Block_fill
222-        p.cu[BLOCK_4x4].blockfill_s   = PFX(blockfill_s_4x4_neon);
223-        p.cu[BLOCK_8x8].blockfill_s   = PFX(blockfill_s_8x8_neon);
224-        p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_neon);
225-        p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_neon);
226+        p.cu[BLOCK_4x4].blockfill_s[ALIGNED]   = PFX(blockfill_s_4x4_neon);
227+        p.cu[BLOCK_8x8].blockfill_s[ALIGNED]   = PFX(blockfill_s_8x8_neon);
228+        p.cu[BLOCK_16x16].blockfill_s[ALIGNED] = PFX(blockfill_s_16x16_neon);
229+        p.cu[BLOCK_32x32].blockfill_s[ALIGNED] = PFX(blockfill_s_32x32_neon);
230
231         // Blockcopy_ss
232         p.cu[BLOCK_4x4].copy_ss   = PFX(blockcopy_ss_4x4_neon);
233@@ -495,21 +495,21 @@ void setupAssemblyPrimitives(EncoderPrim
234         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_neon);
235
236         // pixel_add_ps
237-        p.cu[BLOCK_4x4].add_ps   = PFX(pixel_add_ps_4x4_neon);
238-        p.cu[BLOCK_8x8].add_ps   = PFX(pixel_add_ps_8x8_neon);
239-        p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_neon);
240-        p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);
241-        p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_neon);
242+        p.cu[BLOCK_4x4].add_ps[ALIGNED]   = PFX(pixel_add_ps_4x4_neon);
243+        p.cu[BLOCK_8x8].add_ps[ALIGNED]   = PFX(pixel_add_ps_8x8_neon);
244+        p.cu[BLOCK_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_neon);
245+        p.cu[BLOCK_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_neon);
246+        p.cu[BLOCK_64x64].add_ps[ALIGNED] = PFX(pixel_add_ps_64x64_neon);
247
248         // chroma add_ps
249-        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps   = PFX(pixel_add_ps_4x4_neon);
250-        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps   = PFX(pixel_add_ps_8x8_neon);
251-        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = PFX(pixel_add_ps_16x16_neon);
252-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);
253-        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps   = PFX(pixel_add_ps_4x8_neon);
254-        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps  = PFX(pixel_add_ps_8x16_neon);
255-        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = PFX(pixel_add_ps_16x32_neon);
256-        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_neon);
257+        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[ALIGNED]   = PFX(pixel_add_ps_4x4_neon);
258+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[ALIGNED]   = PFX(pixel_add_ps_8x8_neon);
259+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_neon);
260+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_neon);
261+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[ALIGNED]   = PFX(pixel_add_ps_4x8_neon);
262+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[ALIGNED]  = PFX(pixel_add_ps_8x16_neon);
263+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[ALIGNED] = PFX(pixel_add_ps_16x32_neon);
264+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[ALIGNED] = PFX(pixel_add_ps_32x64_neon);
265
266         // cpy2Dto1D_shr
267         p.cu[BLOCK_4x4].cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_4x4_neon);
268@@ -518,10 +518,10 @@ void setupAssemblyPrimitives(EncoderPrim
269         p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon);
270
271         // ssd_s
272-        p.cu[BLOCK_4x4].ssd_s   = PFX(pixel_ssd_s_4x4_neon);
273-        p.cu[BLOCK_8x8].ssd_s   = PFX(pixel_ssd_s_8x8_neon);
274-        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16x16_neon);
275-        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32x32_neon);
276+        p.cu[BLOCK_4x4].ssd_s[ALIGNED]   = PFX(pixel_ssd_s_4x4_neon);
277+        p.cu[BLOCK_8x8].ssd_s[ALIGNED]   = PFX(pixel_ssd_s_8x8_neon);
278+        p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_neon);
279+        p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_neon);
280
281         // sse_ss
282         p.cu[BLOCK_4x4].sse_ss   = PFX(pixel_sse_ss_4x4_neon);
283@@ -548,10 +548,10 @@ void setupAssemblyPrimitives(EncoderPrim
284         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
285
286         // calc_Residual
287-        p.cu[BLOCK_4x4].calcresidual   = PFX(getResidual4_neon);
288-        p.cu[BLOCK_8x8].calcresidual   = PFX(getResidual8_neon);
289-        p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_neon);
290-        p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_neon);
291+        p.cu[BLOCK_4x4].calcresidual[ALIGNED]   = PFX(getResidual4_neon);
292+        p.cu[BLOCK_8x8].calcresidual[ALIGNED]   = PFX(getResidual8_neon);
293+        p.cu[BLOCK_16x16].calcresidual[ALIGNED] = PFX(getResidual16_neon);
294+        p.cu[BLOCK_32x32].calcresidual[ALIGNED] = PFX(getResidual32_neon);
295
296         // sse_pp
297         p.cu[BLOCK_4x4].sse_pp   = PFX(pixel_sse_pp_4x4_neon);
298@@ -722,31 +722,31 @@ void setupAssemblyPrimitives(EncoderPrim
299         p.pu[LUMA_64x64].sad_x4 = PFX(sad_x4_64x64_neon);
300
301         // pixel_avg_pp
302-        p.pu[LUMA_4x4].pixelavg_pp   = PFX(pixel_avg_pp_4x4_neon);
303-        p.pu[LUMA_4x8].pixelavg_pp   = PFX(pixel_avg_pp_4x8_neon);
304-        p.pu[LUMA_4x16].pixelavg_pp  = PFX(pixel_avg_pp_4x16_neon);
305-        p.pu[LUMA_8x4].pixelavg_pp   = PFX(pixel_avg_pp_8x4_neon);
306-        p.pu[LUMA_8x8].pixelavg_pp   = PFX(pixel_avg_pp_8x8_neon);
307-        p.pu[LUMA_8x16].pixelavg_pp  = PFX(pixel_avg_pp_8x16_neon);
308-        p.pu[LUMA_8x32].pixelavg_pp  = PFX(pixel_avg_pp_8x32_neon);
309-        p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_pp_12x16_neon);
310-        p.pu[LUMA_16x4].pixelavg_pp  = PFX(pixel_avg_pp_16x4_neon);
311-        p.pu[LUMA_16x8].pixelavg_pp  = PFX(pixel_avg_pp_16x8_neon);
312-        p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_pp_16x12_neon);
313-        p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_pp_16x16_neon);
314-        p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_pp_16x32_neon);
315-        p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_pp_16x64_neon);
316-        p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_pp_24x32_neon);
317-        p.pu[LUMA_32x8].pixelavg_pp  = PFX(pixel_avg_pp_32x8_neon);
318-        p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_pp_32x16_neon);
319-        p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_pp_32x24_neon);
320-        p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_pp_32x32_neon);
321-        p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_pp_32x64_neon);
322-        p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_pp_48x64_neon);
323-        p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_pp_64x16_neon);
324-        p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_pp_64x32_neon);
325-        p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_pp_64x48_neon);
326-        p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_pp_64x64_neon);
327+        p.pu[LUMA_4x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_4x4_neon);
328+        p.pu[LUMA_4x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_4x8_neon);
329+        p.pu[LUMA_4x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_4x16_neon);
330+        p.pu[LUMA_8x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_8x4_neon);
331+        p.pu[LUMA_8x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_8x8_neon);
332+        p.pu[LUMA_8x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_8x16_neon);
333+        p.pu[LUMA_8x32].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_8x32_neon);
334+        p.pu[LUMA_12x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_12x16_neon);
335+        p.pu[LUMA_16x4].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_16x4_neon);
336+        p.pu[LUMA_16x8].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_16x8_neon);
337+        p.pu[LUMA_16x12].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_16x12_neon);
338+        p.pu[LUMA_16x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_16x16_neon);
339+        p.pu[LUMA_16x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_16x32_neon);
340+        p.pu[LUMA_16x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_16x64_neon);
341+        p.pu[LUMA_24x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_24x32_neon);
342+        p.pu[LUMA_32x8].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_32x8_neon);
343+        p.pu[LUMA_32x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_32x16_neon);
344+        p.pu[LUMA_32x24].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_32x24_neon);
345+        p.pu[LUMA_32x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_32x32_neon);
346+        p.pu[LUMA_32x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_32x64_neon);
347+        p.pu[LUMA_48x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_48x64_neon);
348+        p.pu[LUMA_64x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_64x16_neon);
349+        p.pu[LUMA_64x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_64x32_neon);
350+        p.pu[LUMA_64x48].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_64x48_neon);
351+        p.pu[LUMA_64x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_64x64_neon);
352
353         // planecopy
354         p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
355