Commit ad69471ce5e1284e1cacd053bb0fe8d6175a2f9e
1 parent
8f8e3aa4
ARM TCG conversion 14/16.
git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@4151 c046a42c-6fe2-441c-8c8c-71466251a162
Showing
7 changed files
with
1779 additions
and
1718 deletions
Too many changes to show.
To preserve performance only 7 of 8 files are displayed.
Makefile.target
@@ -211,7 +211,7 @@ LIBOBJS+= op_helper.o helper.o | @@ -211,7 +211,7 @@ LIBOBJS+= op_helper.o helper.o | ||
211 | endif | 211 | endif |
212 | 212 | ||
213 | ifeq ($(TARGET_BASE_ARCH), arm) | 213 | ifeq ($(TARGET_BASE_ARCH), arm) |
214 | -LIBOBJS+= op_helper.o helper.o | 214 | +LIBOBJS+= op_helper.o helper.o neon_helper.o |
215 | endif | 215 | endif |
216 | 216 | ||
217 | ifeq ($(TARGET_BASE_ARCH), sh4) | 217 | ifeq ($(TARGET_BASE_ARCH), sh4) |
target-arm/helper.c
@@ -256,30 +256,6 @@ void cpu_arm_close(CPUARMState *env) | @@ -256,30 +256,6 @@ void cpu_arm_close(CPUARMState *env) | ||
256 | free(env); | 256 | free(env); |
257 | } | 257 | } |
258 | 258 | ||
259 | -/* Polynomial multiplication is like integer multiplication except the | ||
260 | - partial products are XORed, not added. */ | ||
261 | -uint32_t helper_neon_mul_p8(uint32_t op1, uint32_t op2) | ||
262 | -{ | ||
263 | - uint32_t mask; | ||
264 | - uint32_t result; | ||
265 | - result = 0; | ||
266 | - while (op1) { | ||
267 | - mask = 0; | ||
268 | - if (op1 & 1) | ||
269 | - mask |= 0xff; | ||
270 | - if (op1 & (1 << 8)) | ||
271 | - mask |= (0xff << 8); | ||
272 | - if (op1 & (1 << 16)) | ||
273 | - mask |= (0xff << 16); | ||
274 | - if (op1 & (1 << 24)) | ||
275 | - mask |= (0xff << 24); | ||
276 | - result ^= op2 & mask; | ||
277 | - op1 = (op1 >> 1) & 0x7f7f7f7f; | ||
278 | - op2 = (op2 << 1) & 0xfefefefe; | ||
279 | - } | ||
280 | - return result; | ||
281 | -} | ||
282 | - | ||
283 | uint32_t cpsr_read(CPUARMState *env) | 259 | uint32_t cpsr_read(CPUARMState *env) |
284 | { | 260 | { |
285 | int ZF; | 261 | int ZF; |
@@ -376,6 +352,11 @@ uint32_t HELPER(rbit)(uint32_t x) | @@ -376,6 +352,11 @@ uint32_t HELPER(rbit)(uint32_t x) | ||
376 | return x; | 352 | return x; |
377 | } | 353 | } |
378 | 354 | ||
355 | +uint32_t HELPER(abs)(uint32_t x) /* Absolute value of x read as a signed 32-bit integer. */ | ||
356 | +{ | ||
357 | + return ((int32_t)x < 0) ? -x : x; /* negation is done on the unsigned value, so 0x80000000 maps to itself */ | ||
358 | +} | ||
359 | + | ||
379 | #if defined(CONFIG_USER_ONLY) | 360 | #if defined(CONFIG_USER_ONLY) |
380 | 361 | ||
381 | void do_interrupt (CPUState *env) | 362 | void do_interrupt (CPUState *env) |
target-arm/helpers.h
@@ -84,6 +84,7 @@ DEF_HELPER_1_1(double_saturate, uint32_t, (int32_t)) | @@ -84,6 +84,7 @@ DEF_HELPER_1_1(double_saturate, uint32_t, (int32_t)) | ||
84 | DEF_HELPER_1_2(sdiv, int32_t, (int32_t, int32_t)) | 84 | DEF_HELPER_1_2(sdiv, int32_t, (int32_t, int32_t)) |
85 | DEF_HELPER_1_2(udiv, uint32_t, (uint32_t, uint32_t)) | 85 | DEF_HELPER_1_2(udiv, uint32_t, (uint32_t, uint32_t)) |
86 | DEF_HELPER_1_1(rbit, uint32_t, (uint32_t)) | 86 | DEF_HELPER_1_1(rbit, uint32_t, (uint32_t)) |
87 | +DEF_HELPER_1_1(abs, uint32_t, (uint32_t)) | ||
87 | 88 | ||
88 | #define PAS_OP(pfx) \ | 89 | #define PAS_OP(pfx) \ |
89 | DEF_HELPER_1_3(pfx ## add8, uint32_t, (uint32_t, uint32_t, uint32_t *)) \ | 90 | DEF_HELPER_1_3(pfx ## add8, uint32_t, (uint32_t, uint32_t, uint32_t *)) \ |
@@ -208,6 +209,10 @@ DEF_HELPER_1_2(rsqrte_f32, float32, (float32, CPUState *)) | @@ -208,6 +209,10 @@ DEF_HELPER_1_2(rsqrte_f32, float32, (float32, CPUState *)) | ||
208 | DEF_HELPER_1_2(recpe_u32, uint32_t, (uint32_t, CPUState *)) | 209 | DEF_HELPER_1_2(recpe_u32, uint32_t, (uint32_t, CPUState *)) |
209 | DEF_HELPER_1_2(rsqrte_u32, uint32_t, (uint32_t, CPUState *)) | 210 | DEF_HELPER_1_2(rsqrte_u32, uint32_t, (uint32_t, CPUState *)) |
210 | DEF_HELPER_1_4(neon_tbl, uint32_t, (uint32_t, uint32_t, uint32_t, uint32_t)) | 211 | DEF_HELPER_1_4(neon_tbl, uint32_t, (uint32_t, uint32_t, uint32_t, uint32_t)) |
212 | +DEF_HELPER_1_2(neon_add_saturate_u64, uint64_t, (uint64_t, uint64_t)) | ||
213 | +DEF_HELPER_1_2(neon_add_saturate_s64, uint64_t, (uint64_t, uint64_t)) | ||
214 | +DEF_HELPER_1_2(neon_sub_saturate_u64, uint64_t, (uint64_t, uint64_t)) | ||
215 | +DEF_HELPER_1_2(neon_sub_saturate_s64, uint64_t, (uint64_t, uint64_t)) | ||
211 | 216 | ||
212 | DEF_HELPER_1_2(add_cc, uint32_t, (uint32_t, uint32_t)) | 217 | DEF_HELPER_1_2(add_cc, uint32_t, (uint32_t, uint32_t)) |
213 | DEF_HELPER_1_2(adc_cc, uint32_t, (uint32_t, uint32_t)) | 218 | DEF_HELPER_1_2(adc_cc, uint32_t, (uint32_t, uint32_t)) |
@@ -223,6 +228,209 @@ DEF_HELPER_1_2(shr_cc, uint32_t, (uint32_t, uint32_t)) | @@ -223,6 +228,209 @@ DEF_HELPER_1_2(shr_cc, uint32_t, (uint32_t, uint32_t)) | ||
223 | DEF_HELPER_1_2(sar_cc, uint32_t, (uint32_t, uint32_t)) | 228 | DEF_HELPER_1_2(sar_cc, uint32_t, (uint32_t, uint32_t)) |
224 | DEF_HELPER_1_2(ror_cc, uint32_t, (uint32_t, uint32_t)) | 229 | DEF_HELPER_1_2(ror_cc, uint32_t, (uint32_t, uint32_t)) |
225 | 230 | ||
231 | +/* neon_helper.c */ | ||
232 | +DEF_HELPER_1_3(neon_qadd_u8, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
233 | +DEF_HELPER_1_3(neon_qadd_s8, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
234 | +DEF_HELPER_1_3(neon_qadd_u16, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
235 | +DEF_HELPER_1_3(neon_qadd_s16, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
236 | +DEF_HELPER_1_3(neon_qsub_u8, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
237 | +DEF_HELPER_1_3(neon_qsub_s8, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
238 | +DEF_HELPER_1_3(neon_qsub_u16, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
239 | +DEF_HELPER_1_3(neon_qsub_s16, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
240 | + | ||
241 | +DEF_HELPER_1_2(neon_hadd_s8, uint32_t, (uint32_t, uint32_t)) | ||
242 | +DEF_HELPER_1_2(neon_hadd_u8, uint32_t, (uint32_t, uint32_t)) | ||
243 | +DEF_HELPER_1_2(neon_hadd_s16, uint32_t, (uint32_t, uint32_t)) | ||
244 | +DEF_HELPER_1_2(neon_hadd_u16, uint32_t, (uint32_t, uint32_t)) | ||
245 | +DEF_HELPER_1_2(neon_hadd_s32, int32_t, (int32_t, int32_t)) | ||
246 | +DEF_HELPER_1_2(neon_hadd_u32, uint32_t, (uint32_t, uint32_t)) | ||
247 | +DEF_HELPER_1_2(neon_rhadd_s8, uint32_t, (uint32_t, uint32_t)) | ||
248 | +DEF_HELPER_1_2(neon_rhadd_u8, uint32_t, (uint32_t, uint32_t)) | ||
249 | +DEF_HELPER_1_2(neon_rhadd_s16, uint32_t, (uint32_t, uint32_t)) | ||
250 | +DEF_HELPER_1_2(neon_rhadd_u16, uint32_t, (uint32_t, uint32_t)) | ||
251 | +DEF_HELPER_1_2(neon_rhadd_s32, int32_t, (int32_t, int32_t)) | ||
252 | +DEF_HELPER_1_2(neon_rhadd_u32, uint32_t, (uint32_t, uint32_t)) | ||
253 | +DEF_HELPER_1_2(neon_hsub_s8, uint32_t, (uint32_t, uint32_t)) | ||
254 | +DEF_HELPER_1_2(neon_hsub_u8, uint32_t, (uint32_t, uint32_t)) | ||
255 | +DEF_HELPER_1_2(neon_hsub_s16, uint32_t, (uint32_t, uint32_t)) | ||
256 | +DEF_HELPER_1_2(neon_hsub_u16, uint32_t, (uint32_t, uint32_t)) | ||
257 | +DEF_HELPER_1_2(neon_hsub_s32, int32_t, (int32_t, int32_t)) | ||
258 | +DEF_HELPER_1_2(neon_hsub_u32, uint32_t, (uint32_t, uint32_t)) | ||
259 | + | ||
260 | +DEF_HELPER_1_2(neon_cgt_u8, uint32_t, (uint32_t, uint32_t)) | ||
261 | +DEF_HELPER_1_2(neon_cgt_s8, uint32_t, (uint32_t, uint32_t)) | ||
262 | +DEF_HELPER_1_2(neon_cgt_u16, uint32_t, (uint32_t, uint32_t)) | ||
263 | +DEF_HELPER_1_2(neon_cgt_s16, uint32_t, (uint32_t, uint32_t)) | ||
264 | +DEF_HELPER_1_2(neon_cgt_u32, uint32_t, (uint32_t, uint32_t)) | ||
265 | +DEF_HELPER_1_2(neon_cgt_s32, uint32_t, (uint32_t, uint32_t)) | ||
266 | +DEF_HELPER_1_2(neon_cge_u8, uint32_t, (uint32_t, uint32_t)) | ||
267 | +DEF_HELPER_1_2(neon_cge_s8, uint32_t, (uint32_t, uint32_t)) | ||
268 | +DEF_HELPER_1_2(neon_cge_u16, uint32_t, (uint32_t, uint32_t)) | ||
269 | +DEF_HELPER_1_2(neon_cge_s16, uint32_t, (uint32_t, uint32_t)) | ||
270 | +DEF_HELPER_1_2(neon_cge_u32, uint32_t, (uint32_t, uint32_t)) | ||
271 | +DEF_HELPER_1_2(neon_cge_s32, uint32_t, (uint32_t, uint32_t)) | ||
272 | + | ||
273 | +DEF_HELPER_1_2(neon_min_u8, uint32_t, (uint32_t, uint32_t)) | ||
274 | +DEF_HELPER_1_2(neon_min_s8, uint32_t, (uint32_t, uint32_t)) | ||
275 | +DEF_HELPER_1_2(neon_min_u16, uint32_t, (uint32_t, uint32_t)) | ||
276 | +DEF_HELPER_1_2(neon_min_s16, uint32_t, (uint32_t, uint32_t)) | ||
277 | +DEF_HELPER_1_2(neon_min_u32, uint32_t, (uint32_t, uint32_t)) | ||
278 | +DEF_HELPER_1_2(neon_min_s32, uint32_t, (uint32_t, uint32_t)) | ||
279 | +DEF_HELPER_1_2(neon_max_u8, uint32_t, (uint32_t, uint32_t)) | ||
280 | +DEF_HELPER_1_2(neon_max_s8, uint32_t, (uint32_t, uint32_t)) | ||
281 | +DEF_HELPER_1_2(neon_max_u16, uint32_t, (uint32_t, uint32_t)) | ||
282 | +DEF_HELPER_1_2(neon_max_s16, uint32_t, (uint32_t, uint32_t)) | ||
283 | +DEF_HELPER_1_2(neon_max_u32, uint32_t, (uint32_t, uint32_t)) | ||
284 | +DEF_HELPER_1_2(neon_max_s32, uint32_t, (uint32_t, uint32_t)) | ||
285 | +DEF_HELPER_1_2(neon_pmin_u8, uint32_t, (uint32_t, uint32_t)) | ||
286 | +DEF_HELPER_1_2(neon_pmin_s8, uint32_t, (uint32_t, uint32_t)) | ||
287 | +DEF_HELPER_1_2(neon_pmin_u16, uint32_t, (uint32_t, uint32_t)) | ||
288 | +DEF_HELPER_1_2(neon_pmin_s16, uint32_t, (uint32_t, uint32_t)) | ||
289 | +DEF_HELPER_1_2(neon_pmin_u32, uint32_t, (uint32_t, uint32_t)) | ||
290 | +DEF_HELPER_1_2(neon_pmin_s32, uint32_t, (uint32_t, uint32_t)) | ||
291 | +DEF_HELPER_1_2(neon_pmax_u8, uint32_t, (uint32_t, uint32_t)) | ||
292 | +DEF_HELPER_1_2(neon_pmax_s8, uint32_t, (uint32_t, uint32_t)) | ||
293 | +DEF_HELPER_1_2(neon_pmax_u16, uint32_t, (uint32_t, uint32_t)) | ||
294 | +DEF_HELPER_1_2(neon_pmax_s16, uint32_t, (uint32_t, uint32_t)) | ||
295 | +DEF_HELPER_1_2(neon_pmax_u32, uint32_t, (uint32_t, uint32_t)) | ||
296 | +DEF_HELPER_1_2(neon_pmax_s32, uint32_t, (uint32_t, uint32_t)) | ||
297 | + | ||
298 | +DEF_HELPER_1_2(neon_abd_u8, uint32_t, (uint32_t, uint32_t)) | ||
299 | +DEF_HELPER_1_2(neon_abd_s8, uint32_t, (uint32_t, uint32_t)) | ||
300 | +DEF_HELPER_1_2(neon_abd_u16, uint32_t, (uint32_t, uint32_t)) | ||
301 | +DEF_HELPER_1_2(neon_abd_s16, uint32_t, (uint32_t, uint32_t)) | ||
302 | +DEF_HELPER_1_2(neon_abd_u32, uint32_t, (uint32_t, uint32_t)) | ||
303 | +DEF_HELPER_1_2(neon_abd_s32, uint32_t, (uint32_t, uint32_t)) | ||
304 | + | ||
305 | +DEF_HELPER_1_2(neon_shl_u8, uint32_t, (uint32_t, uint32_t)) | ||
306 | +DEF_HELPER_1_2(neon_shl_s8, uint32_t, (uint32_t, uint32_t)) | ||
307 | +DEF_HELPER_1_2(neon_shl_u16, uint32_t, (uint32_t, uint32_t)) | ||
308 | +DEF_HELPER_1_2(neon_shl_s16, uint32_t, (uint32_t, uint32_t)) | ||
309 | +DEF_HELPER_1_2(neon_shl_u32, uint32_t, (uint32_t, uint32_t)) | ||
310 | +DEF_HELPER_1_2(neon_shl_s32, uint32_t, (uint32_t, uint32_t)) | ||
311 | +DEF_HELPER_1_2(neon_shl_u64, uint64_t, (uint64_t, uint64_t)) | ||
312 | +DEF_HELPER_1_2(neon_shl_s64, uint64_t, (uint64_t, uint64_t)) | ||
313 | +DEF_HELPER_1_2(neon_rshl_u8, uint32_t, (uint32_t, uint32_t)) | ||
314 | +DEF_HELPER_1_2(neon_rshl_s8, uint32_t, (uint32_t, uint32_t)) | ||
315 | +DEF_HELPER_1_2(neon_rshl_u16, uint32_t, (uint32_t, uint32_t)) | ||
316 | +DEF_HELPER_1_2(neon_rshl_s16, uint32_t, (uint32_t, uint32_t)) | ||
317 | +DEF_HELPER_1_2(neon_rshl_u32, uint32_t, (uint32_t, uint32_t)) | ||
318 | +DEF_HELPER_1_2(neon_rshl_s32, uint32_t, (uint32_t, uint32_t)) | ||
319 | +DEF_HELPER_1_2(neon_rshl_u64, uint64_t, (uint64_t, uint64_t)) | ||
320 | +DEF_HELPER_1_2(neon_rshl_s64, uint64_t, (uint64_t, uint64_t)) | ||
321 | +DEF_HELPER_1_3(neon_qshl_u8, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
322 | +DEF_HELPER_1_3(neon_qshl_s8, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
323 | +DEF_HELPER_1_3(neon_qshl_u16, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
324 | +DEF_HELPER_1_3(neon_qshl_s16, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
325 | +DEF_HELPER_1_3(neon_qshl_u32, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
326 | +DEF_HELPER_1_3(neon_qshl_s32, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
327 | +DEF_HELPER_1_3(neon_qshl_u64, uint64_t, (CPUState *, uint64_t, uint64_t)) | ||
328 | +DEF_HELPER_1_3(neon_qshl_s64, uint64_t, (CPUState *, uint64_t, uint64_t)) | ||
329 | +DEF_HELPER_1_3(neon_qrshl_u8, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
330 | +DEF_HELPER_1_3(neon_qrshl_s8, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
331 | +DEF_HELPER_1_3(neon_qrshl_u16, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
332 | +DEF_HELPER_1_3(neon_qrshl_s16, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
333 | +DEF_HELPER_1_3(neon_qrshl_u32, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
334 | +DEF_HELPER_1_3(neon_qrshl_s32, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
335 | +DEF_HELPER_1_3(neon_qrshl_u64, uint64_t, (CPUState *, uint64_t, uint64_t)) | ||
336 | +DEF_HELPER_1_3(neon_qrshl_s64, uint64_t, (CPUState *, uint64_t, uint64_t)) | ||
337 | + | ||
338 | +DEF_HELPER_1_2(neon_add_u8, uint32_t, (uint32_t, uint32_t)) | ||
339 | +DEF_HELPER_1_2(neon_add_u16, uint32_t, (uint32_t, uint32_t)) | ||
340 | +DEF_HELPER_1_2(neon_padd_u8, uint32_t, (uint32_t, uint32_t)) | ||
341 | +DEF_HELPER_1_2(neon_padd_u16, uint32_t, (uint32_t, uint32_t)) | ||
342 | +DEF_HELPER_1_2(neon_sub_u8, uint32_t, (uint32_t, uint32_t)) | ||
343 | +DEF_HELPER_1_2(neon_sub_u16, uint32_t, (uint32_t, uint32_t)) | ||
344 | +DEF_HELPER_1_2(neon_mul_u8, uint32_t, (uint32_t, uint32_t)) | ||
345 | +DEF_HELPER_1_2(neon_mul_u16, uint32_t, (uint32_t, uint32_t)) | ||
346 | +DEF_HELPER_1_2(neon_mul_p8, uint32_t, (uint32_t, uint32_t)) | ||
347 | + | ||
348 | +DEF_HELPER_1_2(neon_tst_u8, uint32_t, (uint32_t, uint32_t)) | ||
349 | +DEF_HELPER_1_2(neon_tst_u16, uint32_t, (uint32_t, uint32_t)) | ||
350 | +DEF_HELPER_1_2(neon_tst_u32, uint32_t, (uint32_t, uint32_t)) | ||
351 | +DEF_HELPER_1_2(neon_ceq_u8, uint32_t, (uint32_t, uint32_t)) | ||
352 | +DEF_HELPER_1_2(neon_ceq_u16, uint32_t, (uint32_t, uint32_t)) | ||
353 | +DEF_HELPER_1_2(neon_ceq_u32, uint32_t, (uint32_t, uint32_t)) | ||
354 | + | ||
355 | +DEF_HELPER_1_1(neon_abs_s8, uint32_t, (uint32_t)) | ||
356 | +DEF_HELPER_1_1(neon_abs_s16, uint32_t, (uint32_t)) | ||
357 | +DEF_HELPER_1_1(neon_clz_u8, uint32_t, (uint32_t)) | ||
358 | +DEF_HELPER_1_1(neon_clz_u16, uint32_t, (uint32_t)) | ||
359 | +DEF_HELPER_1_1(neon_cls_s8, uint32_t, (uint32_t)) | ||
360 | +DEF_HELPER_1_1(neon_cls_s16, uint32_t, (uint32_t)) | ||
361 | +DEF_HELPER_1_1(neon_cls_s32, uint32_t, (uint32_t)) | ||
362 | +DEF_HELPER_1_1(neon_cnt_u8, uint32_t, (uint32_t)) | ||
363 | + | ||
364 | +DEF_HELPER_1_3(neon_qdmulh_s16, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
365 | +DEF_HELPER_1_3(neon_qrdmulh_s16, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
366 | +DEF_HELPER_1_3(neon_qdmulh_s32, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
367 | +DEF_HELPER_1_3(neon_qrdmulh_s32, uint32_t, (CPUState *, uint32_t, uint32_t)) | ||
368 | + | ||
369 | +DEF_HELPER_1_1(neon_narrow_u8, uint32_t, (uint64_t)) | ||
370 | +DEF_HELPER_1_1(neon_narrow_u16, uint32_t, (uint64_t)) | ||
371 | +DEF_HELPER_1_2(neon_narrow_sat_u8, uint32_t, (CPUState *, uint64_t)) | ||
372 | +DEF_HELPER_1_2(neon_narrow_sat_s8, uint32_t, (CPUState *, uint64_t)) | ||
373 | +DEF_HELPER_1_2(neon_narrow_sat_u16, uint32_t, (CPUState *, uint64_t)) | ||
374 | +DEF_HELPER_1_2(neon_narrow_sat_s16, uint32_t, (CPUState *, uint64_t)) | ||
375 | +DEF_HELPER_1_2(neon_narrow_sat_u32, uint32_t, (CPUState *, uint64_t)) | ||
376 | +DEF_HELPER_1_2(neon_narrow_sat_s32, uint32_t, (CPUState *, uint64_t)) | ||
377 | +DEF_HELPER_1_1(neon_narrow_high_u8, uint32_t, (uint64_t)) | ||
378 | +DEF_HELPER_1_1(neon_narrow_high_u16, uint32_t, (uint64_t)) | ||
379 | +DEF_HELPER_1_1(neon_narrow_round_high_u8, uint32_t, (uint64_t)) | ||
380 | +DEF_HELPER_1_1(neon_narrow_round_high_u16, uint32_t, (uint64_t)) | ||
381 | +DEF_HELPER_1_1(neon_widen_u8, uint64_t, (uint32_t)) | ||
382 | +DEF_HELPER_1_1(neon_widen_s8, uint64_t, (uint32_t)) | ||
383 | +DEF_HELPER_1_1(neon_widen_u16, uint64_t, (uint32_t)) | ||
384 | +DEF_HELPER_1_1(neon_widen_s16, uint64_t, (uint32_t)) | ||
385 | + | ||
386 | +DEF_HELPER_1_2(neon_addl_u16, uint64_t, (uint64_t, uint64_t)) | ||
387 | +DEF_HELPER_1_2(neon_addl_u32, uint64_t, (uint64_t, uint64_t)) | ||
388 | +DEF_HELPER_1_2(neon_paddl_u16, uint64_t, (uint64_t, uint64_t)) | ||
389 | +DEF_HELPER_1_2(neon_paddl_u32, uint64_t, (uint64_t, uint64_t)) | ||
390 | +DEF_HELPER_1_2(neon_subl_u16, uint64_t, (uint64_t, uint64_t)) | ||
391 | +DEF_HELPER_1_2(neon_subl_u32, uint64_t, (uint64_t, uint64_t)) | ||
392 | +DEF_HELPER_1_3(neon_addl_saturate_s32, uint64_t, (CPUState *, uint64_t, uint64_t)) | ||
393 | +DEF_HELPER_1_3(neon_addl_saturate_s64, uint64_t, (CPUState *, uint64_t, uint64_t)) | ||
394 | +DEF_HELPER_1_2(neon_abdl_u16, uint64_t, (uint32_t, uint32_t)) | ||
395 | +DEF_HELPER_1_2(neon_abdl_s16, uint64_t, (uint32_t, uint32_t)) | ||
396 | +DEF_HELPER_1_2(neon_abdl_u32, uint64_t, (uint32_t, uint32_t)) | ||
397 | +DEF_HELPER_1_2(neon_abdl_s32, uint64_t, (uint32_t, uint32_t)) | ||
398 | +DEF_HELPER_1_2(neon_abdl_u64, uint64_t, (uint32_t, uint32_t)) | ||
399 | +DEF_HELPER_1_2(neon_abdl_s64, uint64_t, (uint32_t, uint32_t)) | ||
400 | +DEF_HELPER_1_2(neon_mull_u8, uint64_t, (uint32_t, uint32_t)) | ||
401 | +DEF_HELPER_1_2(neon_mull_s8, uint64_t, (uint32_t, uint32_t)) | ||
402 | +DEF_HELPER_1_2(neon_mull_u16, uint64_t, (uint32_t, uint32_t)) | ||
403 | +DEF_HELPER_1_2(neon_mull_s16, uint64_t, (uint32_t, uint32_t)) | ||
404 | + | ||
405 | +DEF_HELPER_1_1(neon_negl_u16, uint64_t, (uint64_t)) | ||
406 | +DEF_HELPER_1_1(neon_negl_u32, uint64_t, (uint64_t)) | ||
407 | +DEF_HELPER_1_1(neon_negl_u64, uint64_t, (uint64_t)) | ||
408 | + | ||
409 | +DEF_HELPER_1_2(neon_qabs_s8, uint32_t, (CPUState *, uint32_t)) | ||
410 | +DEF_HELPER_1_2(neon_qabs_s16, uint32_t, (CPUState *, uint32_t)) | ||
411 | +DEF_HELPER_1_2(neon_qabs_s32, uint32_t, (CPUState *, uint32_t)) | ||
412 | +DEF_HELPER_1_2(neon_qneg_s8, uint32_t, (CPUState *, uint32_t)) | ||
413 | +DEF_HELPER_1_2(neon_qneg_s16, uint32_t, (CPUState *, uint32_t)) | ||
414 | +DEF_HELPER_1_2(neon_qneg_s32, uint32_t, (CPUState *, uint32_t)) | ||
415 | + | ||
416 | +DEF_HELPER_0_0(neon_trn_u8, void, (void)) | ||
417 | +DEF_HELPER_0_0(neon_trn_u16, void, (void)) | ||
418 | +DEF_HELPER_0_0(neon_unzip_u8, void, (void)) | ||
419 | +DEF_HELPER_0_0(neon_zip_u8, void, (void)) | ||
420 | +DEF_HELPER_0_0(neon_zip_u16, void, (void)) | ||
421 | + | ||
422 | +DEF_HELPER_1_2(neon_min_f32, uint32_t, (uint32_t, uint32_t)) | ||
423 | +DEF_HELPER_1_2(neon_max_f32, uint32_t, (uint32_t, uint32_t)) | ||
424 | +DEF_HELPER_1_2(neon_abd_f32, uint32_t, (uint32_t, uint32_t)) | ||
425 | +DEF_HELPER_1_2(neon_add_f32, uint32_t, (uint32_t, uint32_t)) | ||
426 | +DEF_HELPER_1_2(neon_sub_f32, uint32_t, (uint32_t, uint32_t)) | ||
427 | +DEF_HELPER_1_2(neon_mul_f32, uint32_t, (uint32_t, uint32_t)) | ||
428 | +DEF_HELPER_1_2(neon_ceq_f32, uint32_t, (uint32_t, uint32_t)) | ||
429 | +DEF_HELPER_1_2(neon_cge_f32, uint32_t, (uint32_t, uint32_t)) | ||
430 | +DEF_HELPER_1_2(neon_cgt_f32, uint32_t, (uint32_t, uint32_t)) | ||
431 | +DEF_HELPER_1_2(neon_acge_f32, uint32_t, (uint32_t, uint32_t)) | ||
432 | +DEF_HELPER_1_2(neon_acgt_f32, uint32_t, (uint32_t, uint32_t)) | ||
433 | + | ||
226 | #undef DEF_HELPER | 434 | #undef DEF_HELPER |
227 | #undef DEF_HELPER_0_0 | 435 | #undef DEF_HELPER_0_0 |
228 | #undef DEF_HELPER_0_1 | 436 | #undef DEF_HELPER_0_1 |
target-arm/neon_helper.c
0 → 100644
1 | +#include <stdlib.h> | ||
2 | +#include <stdio.h> | ||
3 | + | ||
4 | +#include "cpu.h" | ||
5 | +#include "exec-all.h" | ||
6 | +#include "helpers.h" | ||
7 | + | ||
8 | +#define SIGNBIT ((uint32_t)0x80000000) /* bit 31 of a 32-bit value */ | ||
9 | +#define SIGNBIT64 ((uint64_t)1 << 63) /* bit 63 of a 64-bit value */ | ||
10 | +/* Record saturation: OR CPSR_Q into env->vfp.xregs[ARM_VFP_FPSCR] so that every other bit of that register is preserved (a plain assignment would overwrite them). Expects a variable named env in scope. */ | ||
11 | +#define SET_QC() (env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q) | ||
12 | + | ||
13 | +static float_status neon_float_status; | ||
14 | +#define NFS &neon_float_status /* NOTE(review): presumably the status argument for softfloat calls later in this file - confirm */ | ||
15 | + | ||
16 | +/* Helper routines to perform bitwise copies between float32 and uint32_t. Both go through a union, so the bit pattern is reinterpreted and no numeric conversion takes place. */ | ||
17 | +static inline float32 vfp_itos(uint32_t i) /* uint32_t bit pattern -> float32 */ | ||
18 | +{ | ||
19 | + union { | ||
20 | + uint32_t i; | ||
21 | + float32 s; | ||
22 | + } v; | ||
23 | + | ||
24 | + v.i = i; | ||
25 | + return v.s; | ||
26 | +} | ||
27 | + | ||
28 | +static inline uint32_t vfp_stoi(float32 s) /* float32 -> uint32_t bit pattern */ | ||
29 | +{ | ||
30 | + union { | ||
31 | + uint32_t i; | ||
32 | + float32 s; | ||
33 | + } v; | ||
34 | + | ||
35 | + v.s = s; | ||
36 | + return v.i; | ||
37 | +} | ||
38 | +/* NEON_TYPEn(name, type) declares struct neon_<name> holding n lanes of the given type, named v1..vn. Under WORDS_BIGENDIAN the fields are declared in reverse order, so v1 is the last member there and the first member otherwise. */ | ||
39 | +#define NEON_TYPE1(name, type) \ | ||
40 | +typedef struct \ | ||
41 | +{ \ | ||
42 | + type v1; \ | ||
43 | +} neon_##name; | ||
44 | +#ifdef WORDS_BIGENDIAN /* host is big-endian: declare lanes highest-numbered first */ | ||
45 | +#define NEON_TYPE2(name, type) \ | ||
46 | +typedef struct \ | ||
47 | +{ \ | ||
48 | + type v2; \ | ||
49 | + type v1; \ | ||
50 | +} neon_##name; | ||
51 | +#define NEON_TYPE4(name, type) \ | ||
52 | +typedef struct \ | ||
53 | +{ \ | ||
54 | + type v4; \ | ||
55 | + type v3; \ | ||
56 | + type v2; \ | ||
57 | + type v1; \ | ||
58 | +} neon_##name; | ||
59 | +#else /* declare lanes lowest-numbered first */ | ||
60 | +#define NEON_TYPE2(name, type) \ | ||
61 | +typedef struct \ | ||
62 | +{ \ | ||
63 | + type v1; \ | ||
64 | + type v2; \ | ||
65 | +} neon_##name; | ||
66 | +#define NEON_TYPE4(name, type) \ | ||
67 | +typedef struct \ | ||
68 | +{ \ | ||
69 | + type v1; \ | ||
70 | + type v2; \ | ||
71 | + type v3; \ | ||
72 | + type v4; \ | ||
73 | +} neon_##name; | ||
74 | +#endif | ||
75 | +/* Instantiate neon_s8/neon_u8 (4 lanes), neon_s16/neon_u16 (2 lanes) and neon_s32/neon_u32 (1 lane); the generator macros are undefined afterwards. */ | ||
76 | +NEON_TYPE4(s8, int8_t) | ||
77 | +NEON_TYPE4(u8, uint8_t) | ||
78 | +NEON_TYPE2(s16, int16_t) | ||
79 | +NEON_TYPE2(u16, uint16_t) | ||
80 | +NEON_TYPE1(s32, int32_t) | ||
81 | +NEON_TYPE1(u32, uint32_t) | ||
82 | +#undef NEON_TYPE4 | ||
83 | +#undef NEON_TYPE2 | ||
84 | +#undef NEON_TYPE1 | ||
85 | + | ||
86 | +/* Copy from a uint32_t to a vector structure type: val is stored in the uint32_t member of a local union and read back through the vtype member into dest. */ | ||
87 | +#define NEON_UNPACK(vtype, dest, val) do { \ | ||
88 | + union { \ | ||
89 | + vtype v; \ | ||
90 | + uint32_t i; \ | ||
91 | + } conv_u; \ | ||
92 | + conv_u.i = (val); \ | ||
93 | + dest = conv_u.v; \ | ||
94 | + } while(0) | ||
95 | + | ||
96 | +/* Copy from a vector structure type to a uint32_t: the inverse of NEON_UNPACK, val is stored through the vtype member and dest receives the uint32_t member. */ | ||
97 | +#define NEON_PACK(vtype, dest, val) do { \ | ||
98 | + union { \ | ||
99 | + vtype v; \ | ||
100 | + uint32_t i; \ | ||
101 | + } conv_u; \ | ||
102 | + conv_u.v = (val); \ | ||
103 | + dest = conv_u.i; \ | ||
104 | + } while(0) | ||
105 | +/* NEON_DOn applies whatever NEON_FN(dest, src1, src2) is defined at the point of expansion to each of the n lanes of the local variables vdest, vsrc1 and vsrc2. */ | ||
106 | +#define NEON_DO1 \ | ||
107 | + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); | ||
108 | +#define NEON_DO2 \ | ||
109 | + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ | ||
110 | + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); | ||
111 | +#define NEON_DO4 \ | ||
112 | + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ | ||
113 | + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \ | ||
114 | + NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \ | ||
115 | + NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4); | ||
116 | +/* Shared function body for two-operand helpers: unpack arg1 and arg2 into lane structs of type vtype, run NEON_DOn, then pack vdest into the uint32_t return value. */ | ||
117 | +#define NEON_VOP_BODY(vtype, n) \ | ||
118 | +{ \ | ||
119 | + uint32_t res; \ | ||
120 | + vtype vsrc1; \ | ||
121 | + vtype vsrc2; \ | ||
122 | + vtype vdest; \ | ||
123 | + NEON_UNPACK(vtype, vsrc1, arg1); \ | ||
124 | + NEON_UNPACK(vtype, vsrc2, arg2); \ | ||
125 | + NEON_DO##n; \ | ||
126 | + NEON_PACK(vtype, res, vdest); \ | ||
127 | + return res; \ | ||
128 | +} | ||
129 | +/* Define the helper neon_<name>(arg1, arg2) from the NEON_FN currently in effect. */ | ||
130 | +#define NEON_VOP(name, vtype, n) \ | ||
131 | +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ | ||
132 | +NEON_VOP_BODY(vtype, n) | ||
133 | +/* Same as NEON_VOP, but the helper takes a leading CPUState *env parameter, which is the variable SET_QC() refers to. */ | ||
134 | +#define NEON_VOP_ENV(name, vtype, n) \ | ||
135 | +uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \ | ||
136 | +NEON_VOP_BODY(vtype, n) | ||
137 | + | ||
138 | +/* Pairwise operations: NEON_FN combines adjacent lanes of the same source. Pairs taken from vsrc1 fill the lower-numbered destination lanes and pairs taken from vsrc2 fill the higher-numbered ones. */ | ||
139 | +/* For 32-bit elements each segment only contains a single element, so | ||
140 | + the elementwise and pairwise operations are the same. */ | ||
141 | +#define NEON_PDO2 \ | ||
142 | + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ | ||
143 | + NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); | ||
144 | +#define NEON_PDO4 \ | ||
145 | + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ | ||
146 | + NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ | ||
147 | + NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ | ||
148 | + NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); | ||
149 | +/* Define the pairwise helper neon_<name>(arg1, arg2): unpack both arguments, run NEON_PDOn and pack vdest into the result. The last line of NEON_PDO4 deliberately has no line continuation, so the macro no longer depends on a blank line following it. */ | ||
150 | +#define NEON_POP(name, vtype, n) \ | ||
151 | +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ | ||
152 | +{ \ | ||
153 | + uint32_t res; \ | ||
154 | + vtype vsrc1; \ | ||
155 | + vtype vsrc2; \ | ||
156 | + vtype vdest; \ | ||
157 | + NEON_UNPACK(vtype, vsrc1, arg1); \ | ||
158 | + NEON_UNPACK(vtype, vsrc2, arg2); \ | ||
159 | + NEON_PDO##n; \ | ||
160 | + NEON_PACK(vtype, res, vdest); \ | ||
161 | + return res; \ | ||
162 | +} | ||
163 | + | ||
164 | +/* Unary operators: define the helper neon_<name>(arg). NEON_DOn is reused, so NEON_FN is still invoked with three arguments, but no vsrc2 variable is declared here; the NEON_FN in effect must therefore not expand its third parameter. The result is packed back into arg and returned. */ | ||
165 | +#define NEON_VOP1(name, vtype, n) \ | ||
166 | +uint32_t HELPER(glue(neon_,name))(uint32_t arg) \ | ||
167 | +{ \ | ||
168 | + vtype vsrc1; \ | ||
169 | + vtype vdest; \ | ||
170 | + NEON_UNPACK(vtype, vsrc1, arg); \ | ||
171 | + NEON_DO##n; \ | ||
172 | + NEON_PACK(vtype, arg, vdest); \ | ||
173 | + return arg; \ | ||
174 | +} | ||
175 | + | ||
176 | +/* Unsigned saturating add (qadd_u8, qadd_u16): the sum is formed in 32 bits; when it does not survive a round trip through type, SET_QC() is invoked and the lane is set to all ones. */ | ||
177 | +#define NEON_USAT(dest, src1, src2, type) do { \ | ||
178 | + uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ | ||
179 | + if (tmp != (type)tmp) { \ | ||
180 | + SET_QC(); \ | ||
181 | + dest = ~0; \ | ||
182 | + } else { \ | ||
183 | + dest = tmp; \ | ||
184 | + }} while(0) | ||
185 | +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) | ||
186 | +NEON_VOP_ENV(qadd_u8, neon_u8, 4) | ||
187 | +#undef NEON_FN | ||
188 | +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) | ||
189 | +NEON_VOP_ENV(qadd_u16, neon_u16, 2) | ||
190 | +#undef NEON_FN | ||
191 | +#undef NEON_USAT | ||
192 | +/* Signed saturating add (qadd_s8, qadd_s16): when the 32-bit sum does not fit in type, SET_QC() is invoked and the lane is clamped to the largest value of type if src2 > 0, otherwise to the bit pattern with only the sign bit of type set. */ | ||
193 | +#define NEON_SSAT(dest, src1, src2, type) do { \ | ||
194 | + int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ | ||
195 | + if (tmp != (type)tmp) { \ | ||
196 | + SET_QC(); \ | ||
197 | + if (src2 > 0) { \ | ||
198 | + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ | ||
199 | + } else { \ | ||
200 | + tmp = 1 << (sizeof(type) * 8 - 1); \ | ||
201 | + } \ | ||
202 | + } \ | ||
203 | + dest = tmp; \ | ||
204 | + } while(0) | ||
205 | +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) | ||
206 | +NEON_VOP_ENV(qadd_s8, neon_s8, 4) | ||
207 | +#undef NEON_FN | ||
208 | +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) | ||
209 | +NEON_VOP_ENV(qadd_s16, neon_s16, 2) | ||
210 | +#undef NEON_FN | ||
211 | +#undef NEON_SSAT | ||
212 | +/* Unsigned saturating subtract (qsub_u8, qsub_u16): the difference is formed in 32 bits; a borrow makes it fail the round trip through type, in which case SET_QC() is invoked and the lane is set to zero. */ | ||
213 | +#define NEON_USAT(dest, src1, src2, type) do { \ | ||
214 | + uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ | ||
215 | + if (tmp != (type)tmp) { \ | ||
216 | + SET_QC(); \ | ||
217 | + dest = 0; \ | ||
218 | + } else { \ | ||
219 | + dest = tmp; \ | ||
220 | + }} while(0) | ||
221 | +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) | ||
222 | +NEON_VOP_ENV(qsub_u8, neon_u8, 4) | ||
223 | +#undef NEON_FN | ||
224 | +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) | ||
225 | +NEON_VOP_ENV(qsub_u16, neon_u16, 2) | ||
226 | +#undef NEON_FN | ||
227 | +#undef NEON_USAT | ||
228 | +/* Signed saturating subtract (qsub_s8, qsub_s16): when the 32-bit difference does not fit in type, SET_QC() is invoked and the lane is clamped to the largest value of type if src2 < 0, otherwise to the bit pattern with only the sign bit of type set. */ | ||
229 | +#define NEON_SSAT(dest, src1, src2, type) do { \ | ||
230 | + int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ | ||
231 | + if (tmp != (type)tmp) { \ | ||
232 | + SET_QC(); \ | ||
233 | + if (src2 < 0) { \ | ||
234 | + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ | ||
235 | + } else { \ | ||
236 | + tmp = 1 << (sizeof(type) * 8 - 1); \ | ||
237 | + } \ | ||
238 | + } \ | ||
239 | + dest = tmp; \ | ||
240 | + } while(0) | ||
241 | +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) | ||
242 | +NEON_VOP_ENV(qsub_s8, neon_s8, 4) | ||
243 | +#undef NEON_FN | ||
244 | +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) | ||
245 | +NEON_VOP_ENV(qsub_s16, neon_s16, 2) | ||
246 | +#undef NEON_FN | ||
247 | +#undef NEON_SSAT | ||
248 | +/* Halving add on 8- and 16-bit lanes: the lanes are added after C integer promotion, so the carry out of the lane is still present when the sum is shifted right by one. */ | ||
249 | +#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1 | ||
250 | +NEON_VOP(hadd_s8, neon_s8, 4) | ||
251 | +NEON_VOP(hadd_u8, neon_u8, 4) | ||
252 | +NEON_VOP(hadd_s16, neon_s16, 2) | ||
253 | +NEON_VOP(hadd_u16, neon_u16, 2) | ||
254 | +#undef NEON_FN | ||
255 | +/* 32-bit halving add without a wider intermediate: each operand is halved first, then one is added back when both low bits were set (the carry the two shifts discarded). */ | ||
256 | +int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2) | ||
257 | +{ | ||
258 | + int32_t dest; | ||
259 | + | ||
260 | + dest = (src1 >> 1) + (src2 >> 1); | ||
261 | + if (src1 & src2 & 1) | ||
262 | + dest++; | ||
263 | + return dest; | ||
264 | +} | ||
265 | +/* Unsigned counterpart of neon_hadd_s32, same halve-then-correct scheme. */ | ||
266 | +uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2) | ||
267 | +{ | ||
268 | + uint32_t dest; | ||
269 | + | ||
270 | + dest = (src1 >> 1) + (src2 >> 1); | ||
271 | + if (src1 & src2 & 1) | ||
272 | + dest++; | ||
273 | + return dest; | ||
274 | +} | ||
275 | +/* Rounding halving add on 8- and 16-bit lanes: one is added to the promoted sum before it is shifted right by one. */ | ||
276 | +#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 | ||
277 | +NEON_VOP(rhadd_s8, neon_s8, 4) | ||
278 | +NEON_VOP(rhadd_u8, neon_u8, 4) | ||
279 | +NEON_VOP(rhadd_s16, neon_s16, 2) | ||
280 | +NEON_VOP(rhadd_u16, neon_u16, 2) | ||
281 | +#undef NEON_FN | ||
282 | +/* 32-bit rounding halving add without a wider intermediate: each operand is halved first, then one is added when either low bit was set. */ | ||
283 | +int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2) | ||
284 | +{ | ||
285 | + int32_t dest; | ||
286 | + | ||
287 | + dest = (src1 >> 1) + (src2 >> 1); | ||
288 | + if ((src1 | src2) & 1) | ||
289 | + dest++; | ||
290 | + return dest; | ||
291 | +} | ||
292 | +/* Unsigned counterpart of neon_rhadd_s32, same halve-then-correct scheme. */ | ||
293 | +uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2) | ||
294 | +{ | ||
295 | + uint32_t dest; | ||
296 | + | ||
297 | + dest = (src1 >> 1) + (src2 >> 1); | ||
298 | + if ((src1 | src2) & 1) | ||
299 | + dest++; | ||
300 | + return dest; | ||
301 | +} | ||
302 | +/* Halving subtract on 8- and 16-bit lanes: the promoted difference is shifted right by one. */ | ||
303 | +#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1 | ||
304 | +NEON_VOP(hsub_s8, neon_s8, 4) | ||
305 | +NEON_VOP(hsub_u8, neon_u8, 4) | ||
306 | +NEON_VOP(hsub_s16, neon_s16, 2) | ||
307 | +NEON_VOP(hsub_u16, neon_u16, 2) | ||
308 | +#undef NEON_FN | ||
309 | +/* 32-bit halving subtract without a wider intermediate: each operand is halved first, then one is subtracted when src1 has a clear low bit and src2 a set low bit (the borrow the two shifts discarded). */ | ||
310 | +int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2) | ||
311 | +{ | ||
312 | + int32_t dest; | ||
313 | + | ||
314 | + dest = (src1 >> 1) - (src2 >> 1); | ||
315 | + if ((~src1) & src2 & 1) | ||
316 | + dest--; | ||
317 | + return dest; | ||
318 | +} | ||
319 | +/* Unsigned counterpart of neon_hsub_s32, same halve-then-correct scheme. */ | ||
320 | +uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2) | ||
321 | +{ | ||
322 | + uint32_t dest; | ||
323 | + | ||
324 | + dest = (src1 >> 1) - (src2 >> 1); | ||
325 | + if ((~src1) & src2 & 1) | ||
326 | + dest--; | ||
327 | + return dest; | ||
328 | +} | ||
329 | +/* Lane-wise compare greater-than: each destination lane is assigned ~0 when src1 > src2 and 0 otherwise. */ | ||
330 | +#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0 | ||
331 | +NEON_VOP(cgt_s8, neon_s8, 4) | ||
332 | +NEON_VOP(cgt_u8, neon_u8, 4) | ||
333 | +NEON_VOP(cgt_s16, neon_s16, 2) | ||
334 | +NEON_VOP(cgt_u16, neon_u16, 2) | ||
335 | +NEON_VOP(cgt_s32, neon_s32, 1) | ||
336 | +NEON_VOP(cgt_u32, neon_u32, 1) | ||
337 | +#undef NEON_FN | ||
338 | +/* Lane-wise compare greater-than-or-equal: each destination lane is assigned ~0 when src1 >= src2 and 0 otherwise. */ | ||
339 | +#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0 | ||
340 | +NEON_VOP(cge_s8, neon_s8, 4) | ||
341 | +NEON_VOP(cge_u8, neon_u8, 4) | ||
342 | +NEON_VOP(cge_s16, neon_s16, 2) | ||
343 | +NEON_VOP(cge_u16, neon_u16, 2) | ||
344 | +NEON_VOP(cge_s32, neon_s32, 1) | ||
345 | +NEON_VOP(cge_u32, neon_u32, 1) | ||
346 | +#undef NEON_FN | ||
347 | +/* Minimum: the same NEON_FN builds the lane-wise helpers (NEON_VOP) and, for 8- and 16-bit lanes, the pairwise helpers (NEON_POP). */ | ||
348 | +#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 | ||
349 | +NEON_VOP(min_s8, neon_s8, 4) | ||
350 | +NEON_VOP(min_u8, neon_u8, 4) | ||
351 | +NEON_VOP(min_s16, neon_s16, 2) | ||
352 | +NEON_VOP(min_u16, neon_u16, 2) | ||
353 | +NEON_VOP(min_s32, neon_s32, 1) | ||
354 | +NEON_VOP(min_u32, neon_u32, 1) | ||
355 | +NEON_POP(pmin_s8, neon_s8, 4) | ||
356 | +NEON_POP(pmin_u8, neon_u8, 4) | ||
357 | +NEON_POP(pmin_s16, neon_s16, 2) | ||
358 | +NEON_POP(pmin_u16, neon_u16, 2) | ||
359 | +#undef NEON_FN | ||
360 | +/* Maximum: same structure as the minimum helpers, with the comparison reversed. */ | ||
361 | +#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2 | ||
362 | +NEON_VOP(max_s8, neon_s8, 4) | ||
363 | +NEON_VOP(max_u8, neon_u8, 4) | ||
364 | +NEON_VOP(max_s16, neon_s16, 2) | ||
365 | +NEON_VOP(max_u16, neon_u16, 2) | ||
366 | +NEON_VOP(max_s32, neon_s32, 1) | ||
367 | +NEON_VOP(max_u32, neon_u32, 1) | ||
368 | +NEON_POP(pmax_s8, neon_s8, 4) | ||
369 | +NEON_POP(pmax_u8, neon_u8, 4) | ||
370 | +NEON_POP(pmax_s16, neon_s16, 2) | ||
371 | +NEON_POP(pmax_u16, neon_u16, 2) | ||
372 | +#undef NEON_FN | ||
373 | + | ||
/* Absolute difference: the larger operand minus the smaller one.
   NOTE(review): if the neon_s32 lane is a 32-bit signed int, the
   subtraction can overflow for operands of opposite sign -- confirm
   whether the wrapped result is what callers expect.  */
#define NEON_FN(dest, src1, src2) \
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
NEON_VOP(abd_s8, neon_s8, 4)
NEON_VOP(abd_u8, neon_u8, 4)
NEON_VOP(abd_s16, neon_s16, 2)
NEON_VOP(abd_u16, neon_u16, 2)
NEON_VOP(abd_s32, neon_s32, 1)
NEON_VOP(abd_u32, neon_u32, 1)
#undef NEON_FN
383 | + | ||
384 | +#define NEON_FN(dest, src1, src2) do { \ | ||
385 | + int8_t tmp; \ | ||
386 | + tmp = (int8_t)src2; \ | ||
387 | + if (tmp >= sizeof(src1) * 8 || tmp <= -sizeof(src1) * 8) { \ | ||
388 | + dest = 0; \ | ||
389 | + } else if (tmp < 0) { \ | ||
390 | + dest = src1 >> -tmp; \ | ||
391 | + } else { \ | ||
392 | + dest = src1 << tmp; \ | ||
393 | + }} while (0) | ||
394 | +NEON_VOP(shl_u8, neon_u8, 4) | ||
395 | +NEON_VOP(shl_u16, neon_u16, 2) | ||
396 | +NEON_VOP(shl_u32, neon_u32, 1) | ||
397 | +#undef NEON_FN | ||
398 | + | ||
399 | +uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop) | ||
400 | +{ | ||
401 | + int8_t shift = (int8_t)shiftop; | ||
402 | + if (shift >= 64 || shift <= -64) { | ||
403 | + val = 0; | ||
404 | + } else if (shift < 0) { | ||
405 | + val >>= -shift; | ||
406 | + } else { | ||
407 | + val <<= shift; | ||
408 | + } | ||
409 | + return val; | ||
410 | +} | ||
411 | + | ||
412 | +#define NEON_FN(dest, src1, src2) do { \ | ||
413 | + int8_t tmp; \ | ||
414 | + tmp = (int8_t)src2; \ | ||
415 | + if (tmp >= sizeof(src1) * 8) { \ | ||
416 | + dest = 0; \ | ||
417 | + } else if (tmp <= -sizeof(src1) * 8) { \ | ||
418 | + dest = src1 >> (sizeof(src1) * 8 - 1); \ | ||
419 | + } else if (tmp < 0) { \ | ||
420 | + dest = src1 >> -tmp; \ | ||
421 | + } else { \ | ||
422 | + dest = src1 << tmp; \ | ||
423 | + }} while (0) | ||
424 | +NEON_VOP(shl_s8, neon_s8, 4) | ||
425 | +NEON_VOP(shl_s16, neon_s16, 2) | ||
426 | +NEON_VOP(shl_s32, neon_s32, 1) | ||
427 | +#undef NEON_FN | ||
428 | + | ||
429 | +uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop) | ||
430 | +{ | ||
431 | + int8_t shift = (int8_t)shiftop; | ||
432 | + int64_t val = valop; | ||
433 | + if (shift >= 64) { | ||
434 | + val = 0; | ||
435 | + } else if (shift <= -64) { | ||
436 | + val >>= 63; | ||
437 | + } else if (shift < 0) { | ||
438 | + val >>= -shift; | ||
439 | + } else { | ||
440 | + val <<= shift; | ||
441 | + } | ||
442 | + return val; | ||
443 | +} | ||
444 | + | ||
445 | +#define NEON_FN(dest, src1, src2) do { \ | ||
446 | + int8_t tmp; \ | ||
447 | + tmp = (int8_t)src2; \ | ||
448 | + if (tmp >= sizeof(src1) * 8) { \ | ||
449 | + dest = 0; \ | ||
450 | + } else if (tmp < -sizeof(src1) * 8) { \ | ||
451 | + dest >>= sizeof(src1) * 8 - 1; \ | ||
452 | + } else if (tmp == -sizeof(src1) * 8) { \ | ||
453 | + dest = src1 >> (tmp - 1); \ | ||
454 | + dest++; \ | ||
455 | + src2 >>= 1; \ | ||
456 | + } else if (tmp < 0) { \ | ||
457 | + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ | ||
458 | + } else { \ | ||
459 | + dest = src1 << tmp; \ | ||
460 | + }} while (0) | ||
461 | +NEON_VOP(rshl_s8, neon_s8, 4) | ||
462 | +NEON_VOP(rshl_s16, neon_s16, 2) | ||
463 | +NEON_VOP(rshl_s32, neon_s32, 1) | ||
464 | +#undef NEON_FN | ||
465 | + | ||
466 | +uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) | ||
467 | +{ | ||
468 | + int8_t shift = (int8_t)shiftop; | ||
469 | + int64_t val = valop; | ||
470 | + if (shift >= 64) { | ||
471 | + val = 0; | ||
472 | + } else if (shift < -64) { | ||
473 | + val >>= 63; | ||
474 | + } else if (shift == -63) { | ||
475 | + val >>= 63; | ||
476 | + val++; | ||
477 | + val >>= 1; | ||
478 | + } else if (shift < 0) { | ||
479 | + val = (val + ((int64_t)1 << (-1 - shift))) >> -shift; | ||
480 | + } else { | ||
481 | + val <<= shift; | ||
482 | + } | ||
483 | + return val; | ||
484 | +} | ||
485 | + | ||
486 | +#define NEON_FN(dest, src1, src2) do { \ | ||
487 | + int8_t tmp; \ | ||
488 | + tmp = (int8_t)src2; \ | ||
489 | + if (tmp >= sizeof(src1) * 8 || tmp < -sizeof(src1) * 8) { \ | ||
490 | + dest = 0; \ | ||
491 | + } else if (tmp == -sizeof(src1) * 8) { \ | ||
492 | + dest = src1 >> (tmp - 1); \ | ||
493 | + } else if (tmp < 0) { \ | ||
494 | + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ | ||
495 | + } else { \ | ||
496 | + dest = src1 << tmp; \ | ||
497 | + }} while (0) | ||
498 | +NEON_VOP(rshl_u8, neon_u8, 4) | ||
499 | +NEON_VOP(rshl_u16, neon_u16, 2) | ||
500 | +NEON_VOP(rshl_u32, neon_u32, 1) | ||
501 | +#undef NEON_FN | ||
502 | + | ||
503 | +uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop) | ||
504 | +{ | ||
505 | + int8_t shift = (uint8_t)shiftop; | ||
506 | + if (shift >= 64 || shift < 64) { | ||
507 | + val = 0; | ||
508 | + } else if (shift == -64) { | ||
509 | + /* Rounding a 1-bit result just preserves that bit. */ | ||
510 | + val >>= 63; | ||
511 | + } if (shift < 0) { | ||
512 | + val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift; | ||
513 | + val >>= -shift; | ||
514 | + } else { | ||
515 | + val <<= shift; | ||
516 | + } | ||
517 | + return val; | ||
518 | +} | ||
519 | + | ||
520 | +#define NEON_FN(dest, src1, src2) do { \ | ||
521 | + int8_t tmp; \ | ||
522 | + tmp = (int8_t)src2; \ | ||
523 | + if (tmp >= sizeof(src1) * 8) { \ | ||
524 | + if (src1) { \ | ||
525 | + SET_QC(); \ | ||
526 | + dest = ~0; \ | ||
527 | + } else { \ | ||
528 | + dest = 0; \ | ||
529 | + } \ | ||
530 | + } else if (tmp <= -sizeof(src1) * 8) { \ | ||
531 | + dest = 0; \ | ||
532 | + } else if (tmp < 0) { \ | ||
533 | + dest = src1 >> -tmp; \ | ||
534 | + } else { \ | ||
535 | + dest = src1 << tmp; \ | ||
536 | + if ((dest >> tmp) != src1) { \ | ||
537 | + SET_QC(); \ | ||
538 | + dest = ~0; \ | ||
539 | + } \ | ||
540 | + }} while (0) | ||
541 | +NEON_VOP_ENV(qshl_u8, neon_u8, 4) | ||
542 | +NEON_VOP_ENV(qshl_u16, neon_u16, 2) | ||
543 | +NEON_VOP_ENV(qshl_u32, neon_u32, 1) | ||
544 | +#undef NEON_FN | ||
545 | + | ||
546 | +uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) | ||
547 | +{ | ||
548 | + int8_t shift = (int8_t)shiftop; | ||
549 | + if (shift >= 64) { | ||
550 | + if (val) { | ||
551 | + val = ~(uint64_t)0; | ||
552 | + SET_QC(); | ||
553 | + } else { | ||
554 | + val = 0; | ||
555 | + } | ||
556 | + } else if (shift <= -64) { | ||
557 | + val = 0; | ||
558 | + } else if (shift < 0) { | ||
559 | + val >>= -shift; | ||
560 | + } else { | ||
561 | + uint64_t tmp = val; | ||
562 | + val <<= shift; | ||
563 | + if ((val >> shift) != tmp) { | ||
564 | + SET_QC(); | ||
565 | + val = ~(uint64_t)0; | ||
566 | + } | ||
567 | + } | ||
568 | + return val; | ||
569 | +} | ||
570 | + | ||
571 | +#define NEON_FN(dest, src1, src2) do { \ | ||
572 | + int8_t tmp; \ | ||
573 | + tmp = (int8_t)src2; \ | ||
574 | + if (tmp >= sizeof(src1) * 8) { \ | ||
575 | + if (src1) \ | ||
576 | + SET_QC(); \ | ||
577 | + dest = src1 >> 31; \ | ||
578 | + } else if (tmp <= -sizeof(src1) * 8) { \ | ||
579 | + dest = src1 >> 31; \ | ||
580 | + } else if (tmp < 0) { \ | ||
581 | + dest = src1 >> -tmp; \ | ||
582 | + } else { \ | ||
583 | + dest = src1 << tmp; \ | ||
584 | + if ((dest >> tmp) != src1) { \ | ||
585 | + SET_QC(); \ | ||
586 | + dest = src2 >> 31; \ | ||
587 | + } \ | ||
588 | + }} while (0) | ||
589 | +NEON_VOP_ENV(qshl_s8, neon_s8, 4) | ||
590 | +NEON_VOP_ENV(qshl_s16, neon_s16, 2) | ||
591 | +NEON_VOP_ENV(qshl_s32, neon_s32, 1) | ||
592 | +#undef NEON_FN | ||
593 | + | ||
594 | +uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop) | ||
595 | +{ | ||
596 | + int8_t shift = (uint8_t)shiftop; | ||
597 | + int64_t val = valop; | ||
598 | + if (shift >= 64) { | ||
599 | + if (val) { | ||
600 | + SET_QC(); | ||
601 | + val = (val >> 63) & ~SIGNBIT64; | ||
602 | + } | ||
603 | + } else if (shift <= 64) { | ||
604 | + val >>= 63; | ||
605 | + } else if (shift < 0) { | ||
606 | + val >>= -shift; | ||
607 | + } else { | ||
608 | + int64_t tmp = val; | ||
609 | + val <<= shift; | ||
610 | + if ((val >> shift) != tmp) { | ||
611 | + SET_QC(); | ||
612 | + val = (tmp >> 63) ^ ~SIGNBIT64; | ||
613 | + } | ||
614 | + } | ||
615 | + return val; | ||
616 | +} | ||
617 | + | ||
618 | + | ||
619 | +/* FIXME: This is wrong. */ | ||
620 | +#define NEON_FN(dest, src1, src2) do { \ | ||
621 | + int8_t tmp; \ | ||
622 | + tmp = (int8_t)src2; \ | ||
623 | + if (tmp < 0) { \ | ||
624 | + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ | ||
625 | + } else { \ | ||
626 | + dest = src1 << tmp; \ | ||
627 | + if ((dest >> tmp) != src1) { \ | ||
628 | + SET_QC(); \ | ||
629 | + dest = ~0; \ | ||
630 | + } \ | ||
631 | + }} while (0) | ||
632 | +NEON_VOP_ENV(qrshl_u8, neon_u8, 4) | ||
633 | +NEON_VOP_ENV(qrshl_u16, neon_u16, 2) | ||
634 | +NEON_VOP_ENV(qrshl_u32, neon_u32, 1) | ||
635 | +#undef NEON_FN | ||
636 | + | ||
637 | +uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) | ||
638 | +{ | ||
639 | + int8_t shift = (int8_t)shiftop; | ||
640 | + if (shift < 0) { | ||
641 | + val = (val + (1 << (-1 - shift))) >> -shift; | ||
642 | + } else { \ | ||
643 | + uint64_t tmp = val; | ||
644 | + val <<= shift; | ||
645 | + if ((val >> shift) != tmp) { | ||
646 | + SET_QC(); | ||
647 | + val = ~0; | ||
648 | + } | ||
649 | + } | ||
650 | + return val; | ||
651 | +} | ||
652 | + | ||
653 | +#define NEON_FN(dest, src1, src2) do { \ | ||
654 | + int8_t tmp; \ | ||
655 | + tmp = (int8_t)src2; \ | ||
656 | + if (tmp < 0) { \ | ||
657 | + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ | ||
658 | + } else { \ | ||
659 | + dest = src1 << tmp; \ | ||
660 | + if ((dest >> tmp) != src1) { \ | ||
661 | + SET_QC(); \ | ||
662 | + dest = src1 >> 31; \ | ||
663 | + } \ | ||
664 | + }} while (0) | ||
665 | +NEON_VOP_ENV(qrshl_s8, neon_s8, 4) | ||
666 | +NEON_VOP_ENV(qrshl_s16, neon_s16, 2) | ||
667 | +NEON_VOP_ENV(qrshl_s32, neon_s32, 1) | ||
668 | +#undef NEON_FN | ||
669 | + | ||
670 | +uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop) | ||
671 | +{ | ||
672 | + int8_t shift = (uint8_t)shiftop; | ||
673 | + int64_t val = valop; | ||
674 | + | ||
675 | + if (shift < 0) { | ||
676 | + val = (val + (1 << (-1 - shift))) >> -shift; | ||
677 | + } else { | ||
678 | + int64_t tmp = val;; | ||
679 | + val <<= shift; | ||
680 | + if ((val >> shift) != tmp) { | ||
681 | + SET_QC(); | ||
682 | + val = tmp >> 31; | ||
683 | + } | ||
684 | + } | ||
685 | + return val; | ||
686 | +} | ||
687 | + | ||
688 | +uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b) | ||
689 | +{ | ||
690 | + uint32_t mask; | ||
691 | + mask = (a ^ b) & 0x80808080u; | ||
692 | + a &= ~0x80808080u; | ||
693 | + b &= ~0x80808080u; | ||
694 | + return (a + b) ^ mask; | ||
695 | +} | ||
696 | + | ||
697 | +uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b) | ||
698 | +{ | ||
699 | + uint32_t mask; | ||
700 | + mask = (a ^ b) & 0x80008000u; | ||
701 | + a &= ~0x80008000u; | ||
702 | + b &= ~0x80008000u; | ||
703 | + return (a + b) ^ mask; | ||
704 | +} | ||
705 | + | ||
/* Addition, instantiated only through NEON_POP for 8- and 16-bit lanes.
   NOTE(review): NEON_POP is defined outside this excerpt; presumably it
   is the pairwise form -- confirm.  */
#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

/* Subtraction for 8- and 16-bit lanes.  */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Multiplication for 8- and 16-bit lanes.  */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN
720 | + | ||
721 | +/* Polynomial multiplication is like integer multiplcation except the | ||
722 | + partial products are XORed, not added. */ | ||
723 | +uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2) | ||
724 | +{ | ||
725 | + uint32_t mask; | ||
726 | + uint32_t result; | ||
727 | + result = 0; | ||
728 | + while (op1) { | ||
729 | + mask = 0; | ||
730 | + if (op1 & 1) | ||
731 | + mask |= 0xff; | ||
732 | + if (op1 & (1 << 8)) | ||
733 | + mask |= (0xff << 8); | ||
734 | + if (op1 & (1 << 16)) | ||
735 | + mask |= (0xff << 16); | ||
736 | + if (op1 & (1 << 24)) | ||
737 | + mask |= (0xff << 24); | ||
738 | + result ^= op2 & mask; | ||
739 | + op1 = (op1 >> 1) & 0x7f7f7f7f; | ||
740 | + op2 = (op2 << 1) & 0xfefefefe; | ||
741 | + } | ||
742 | + return result; | ||
743 | +} | ||
744 | + | ||
/* Bit test: dest becomes all ones when src1 and src2 share any set bit,
   else 0.  */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
750 | + | ||
/* Compare equal: dest becomes all ones when src1 == src2, else 0.  */
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
NEON_VOP(ceq_u8, neon_u8, 4)
NEON_VOP(ceq_u16, neon_u16, 2)
NEON_VOP(ceq_u32, neon_u32, 1)
#undef NEON_FN
756 | + | ||
/* Absolute value (non-saturating); the second source is unused.
   NOTE(review): NEON_VOP1 is defined outside this excerpt; presumably it
   is the one-operand counterpart of NEON_VOP -- confirm.  */
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
NEON_VOP1(abs_s8, neon_s8, 4)
NEON_VOP1(abs_s16, neon_s16, 2)
#undef NEON_FN
761 | + | ||
762 | +/* Count Leading Sign/Zero Bits. */ | ||
/* Number of leading zero bits in an 8-bit value (8 for zero).  */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    /* Every significant bit shifted out is one fewer leading zero. */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
770 | + | ||
/* Number of leading zero bits in a 16-bit value (16 for zero).  */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    /* Every significant bit shifted out is one fewer leading zero. */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
778 | + | ||
/* Count leading zeros of 8-bit lanes via do_clz8.  */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

/* Count leading zeros of 16-bit lanes via do_clz16.  */
#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* Count leading sign bits of 8-bit lanes: negative inputs are
   complemented so the sign run becomes a zero run, and one is subtracted
   so the sign bit itself is not counted.  */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

/* Count leading sign bits of 16-bit lanes, same scheme as above.  */
#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
794 | + | ||
795 | +uint32_t HELPER(neon_cls_s32)(uint32_t x) | ||
796 | +{ | ||
797 | + int count; | ||
798 | + if ((int32_t)x < 0) | ||
799 | + x = ~x; | ||
800 | + for (count = 32; x; count--) | ||
801 | + x = x >> 1; | ||
802 | + return count - 1; | ||
803 | +} | ||
804 | + | ||
805 | +/* Bit count. */ | ||
806 | +uint32_t HELPER(neon_cnt_u8)(uint32_t x) | ||
807 | +{ | ||
808 | + x = (x & 0x55555555) + ((x >> 1) & 0x55555555); | ||
809 | + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); | ||
810 | + x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f); | ||
811 | + return x; | ||
812 | +} | ||
813 | + | ||
814 | +#define NEON_QDMULH16(dest, src1, src2, round) do { \ | ||
815 | + uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ | ||
816 | + if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ | ||
817 | + SET_QC(); \ | ||
818 | + tmp = (tmp >> 31) ^ ~SIGNBIT; \ | ||
819 | + } \ | ||
820 | + tmp <<= 1; \ | ||
821 | + if (round) { \ | ||
822 | + int32_t old = tmp; \ | ||
823 | + tmp += 1 << 15; \ | ||
824 | + if ((int32_t)tmp < old) { \ | ||
825 | + SET_QC(); \ | ||
826 | + tmp = SIGNBIT - 1; \ | ||
827 | + } \ | ||
828 | + } \ | ||
829 | + dest = tmp >> 16; \ | ||
830 | + } while(0) | ||
831 | +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) | ||
832 | +NEON_VOP_ENV(qdmulh_s16, neon_s16, 2) | ||
833 | +#undef NEON_FN | ||
834 | +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) | ||
835 | +NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2) | ||
836 | +#undef NEON_FN | ||
837 | +#undef NEON_QDMULH16 | ||
838 | + | ||
/* Saturating doubling multiply returning the high half, for 32-bit lanes.
   The 64-bit product is doubled unless that would change its sign, in
   which case the result saturates and SET_QC() is called.  When "round"
   is set, 1 << 31 is added before the high half is taken, saturating
   again if the addition overflows.  */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
/* Truncating variant.  */
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
/* Rounding variant.  */
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
864 | + | ||
865 | +uint32_t HELPER(neon_narrow_u8)(uint64_t x) | ||
866 | +{ | ||
867 | + return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u) | ||
868 | + | ((x >> 24) & 0xff000000u); | ||
869 | +} | ||
870 | + | ||
871 | +uint32_t HELPER(neon_narrow_u16)(uint64_t x) | ||
872 | +{ | ||
873 | + return (x & 0xffffu) | ((x >> 16) & 0xffff0000u); | ||
874 | +} | ||
875 | + | ||
876 | +uint32_t HELPER(neon_narrow_high_u8)(uint64_t x) | ||
877 | +{ | ||
878 | + return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) | ||
879 | + | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); | ||
880 | +} | ||
881 | + | ||
882 | +uint32_t HELPER(neon_narrow_high_u16)(uint64_t x) | ||
883 | +{ | ||
884 | + return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); | ||
885 | +} | ||
886 | + | ||
887 | +uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x) | ||
888 | +{ | ||
889 | + x &= 0xff80ff80ff80ff80ull; | ||
890 | + x += 0x0080008000800080ull; | ||
891 | + return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) | ||
892 | + | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); | ||
893 | +} | ||
894 | + | ||
895 | +uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) | ||
896 | +{ | ||
897 | + x &= 0xffff8000ffff8000ull; | ||
898 | + x += 0x0000800000008000ull; | ||
899 | + return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); | ||
900 | +} | ||
901 | + | ||
902 | +uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x) | ||
903 | +{ | ||
904 | + uint16_t s; | ||
905 | + uint8_t d; | ||
906 | + uint32_t res = 0; | ||
907 | +#define SAT8(n) \ | ||
908 | + s = x >> n; \ | ||
909 | + if (s > 0xff) { \ | ||
910 | + d = 0xff; \ | ||
911 | + SET_QC(); \ | ||
912 | + } else { \ | ||
913 | + d = s; \ | ||
914 | + } \ | ||
915 | + res |= (uint32_t)d << (n / 2); | ||
916 | + | ||
917 | + SAT8(0); | ||
918 | + SAT8(16); | ||
919 | + SAT8(32); | ||
920 | + SAT8(48); | ||
921 | +#undef SAT8 | ||
922 | + return res; | ||
923 | +} | ||
924 | + | ||
925 | +uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x) | ||
926 | +{ | ||
927 | + int16_t s; | ||
928 | + uint8_t d; | ||
929 | + uint32_t res = 0; | ||
930 | +#define SAT8(n) \ | ||
931 | + s = x >> n; \ | ||
932 | + if (s != (int8_t)s) { \ | ||
933 | + d = (s >> 15) ^ 0x7f; \ | ||
934 | + SET_QC(); \ | ||
935 | + } else { \ | ||
936 | + d = s; \ | ||
937 | + } \ | ||
938 | + res |= (uint32_t)d << (n / 2); | ||
939 | + | ||
940 | + SAT8(0); | ||
941 | + SAT8(16); | ||
942 | + SAT8(32); | ||
943 | + SAT8(48); | ||
944 | +#undef SAT8 | ||
945 | + return res; | ||
946 | +} | ||
947 | + | ||
948 | +uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x) | ||
949 | +{ | ||
950 | + uint32_t high; | ||
951 | + uint32_t low; | ||
952 | + low = x; | ||
953 | + if (low > 0xffff) { | ||
954 | + low = 0xffff; | ||
955 | + SET_QC(); | ||
956 | + } | ||
957 | + high = x >> 32; | ||
958 | + if (high > 0xffff) { | ||
959 | + high = 0xffff; | ||
960 | + SET_QC(); | ||
961 | + } | ||
962 | + return low | (high << 16); | ||
963 | +} | ||
964 | + | ||
965 | +uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x) | ||
966 | +{ | ||
967 | + int32_t low; | ||
968 | + int32_t high; | ||
969 | + low = x; | ||
970 | + if (low != (int16_t)low) { | ||
971 | + low = (low >> 31) ^ 0x7fff; | ||
972 | + SET_QC(); | ||
973 | + } | ||
974 | + high = x >> 32; | ||
975 | + if (high != (int16_t)high) { | ||
976 | + high = (high >> 31) ^ 0x7fff; | ||
977 | + SET_QC(); | ||
978 | + } | ||
979 | + return (uint16_t)low | (high << 16); | ||
980 | +} | ||
981 | + | ||
982 | +uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x) | ||
983 | +{ | ||
984 | + if (x > 0xffffffffu) { | ||
985 | + SET_QC(); | ||
986 | + return 0xffffffffu; | ||
987 | + } | ||
988 | + return x; | ||
989 | +} | ||
990 | + | ||
991 | +uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x) | ||
992 | +{ | ||
993 | + if ((int64_t)x != (int32_t)x) { | ||
994 | + SET_QC(); | ||
995 | + return (x >> 63) ^ 0x7fffffff; | ||
996 | + } | ||
997 | + return x; | ||
998 | +} | ||
999 | + | ||
1000 | +uint64_t HELPER(neon_widen_u8)(uint32_t x) | ||
1001 | +{ | ||
1002 | + uint64_t tmp; | ||
1003 | + uint64_t ret; | ||
1004 | + ret = (uint8_t)x; | ||
1005 | + tmp = (uint8_t)(x >> 8); | ||
1006 | + ret |= tmp << 16; | ||
1007 | + tmp = (uint8_t)(x >> 16); | ||
1008 | + ret |= tmp << 32; | ||
1009 | + tmp = (uint8_t)(x >> 24); | ||
1010 | + ret |= tmp << 48; | ||
1011 | + return ret; | ||
1012 | +} | ||
1013 | + | ||
1014 | +uint64_t HELPER(neon_widen_s8)(uint32_t x) | ||
1015 | +{ | ||
1016 | + uint64_t tmp; | ||
1017 | + uint64_t ret; | ||
1018 | + ret = (uint16_t)(int8_t)x; | ||
1019 | + tmp = (uint16_t)(int8_t)(x >> 8); | ||
1020 | + ret |= tmp << 16; | ||
1021 | + tmp = (uint16_t)(int8_t)(x >> 16); | ||
1022 | + ret |= tmp << 32; | ||
1023 | + tmp = (uint16_t)(int8_t)(x >> 24); | ||
1024 | + ret |= tmp << 48; | ||
1025 | + return ret; | ||
1026 | +} | ||
1027 | + | ||
1028 | +uint64_t HELPER(neon_widen_u16)(uint32_t x) | ||
1029 | +{ | ||
1030 | + uint64_t high = (uint16_t)(x >> 16); | ||
1031 | + return ((uint16_t)x) | (high << 32); | ||
1032 | +} | ||
1033 | + | ||
1034 | +uint64_t HELPER(neon_widen_s16)(uint32_t x) | ||
1035 | +{ | ||
1036 | + uint64_t high = (int16_t)(x >> 16); | ||
1037 | + return ((uint32_t)(int16_t)x) | (high << 32); | ||
1038 | +} | ||
1039 | + | ||
1040 | +uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b) | ||
1041 | +{ | ||
1042 | + uint64_t mask; | ||
1043 | + mask = (a ^ b) & 0x8000800080008000ull; | ||
1044 | + a &= ~0x8000800080008000ull; | ||
1045 | + b &= ~0x8000800080008000ull; | ||
1046 | + return (a + b) ^ mask; | ||
1047 | +} | ||
1048 | + | ||
1049 | +uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b) | ||
1050 | +{ | ||
1051 | + uint64_t mask; | ||
1052 | + mask = (a ^ b) & 0x8000000080000000ull; | ||
1053 | + a &= ~0x8000000080000000ull; | ||
1054 | + b &= ~0x8000000080000000ull; | ||
1055 | + return (a + b) ^ mask; | ||
1056 | +} | ||
1057 | + | ||
/* Pairwise add of 16-bit lanes: the result holds, from the low lane up,
   a0+a1, a2+a3, b0+b1 and b2+b3, each truncated to 16 bits (aN/bN being
   the 16-bit lanes of a and b).  */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    /* Sums of a are formed in the low half of each 32-bit word:
       a0+a1 at bits 0-31 and a2+a3 at bits 32-63.  */
    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    /* Sums of b are formed in the high half of each 32-bit word:
       b0+b1 at bits 16-31 and b2+b3 at bits 48-63.  */
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    /* Move each 16-bit sum to its final lane; the masks drop any carry
       bits.  */
    return         ( tmp         & 0xffff)
        | ((tmp  >> 16) & 0xffff0000ull)
        | ((tmp2 << 16) & 0xffff00000000ull)
        | ( tmp2        & 0xffff000000000000ull);
}
1072 | + | ||
1073 | +uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b) | ||
1074 | +{ | ||
1075 | + uint32_t low = a + (a >> 32); | ||
1076 | + uint32_t high = b + (b >> 32); | ||
1077 | + return low + ((uint64_t)high << 32); | ||
1078 | +} | ||
1079 | + | ||
1080 | +uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b) | ||
1081 | +{ | ||
1082 | + uint64_t mask; | ||
1083 | + mask = (a ^ ~b) & 0x8000800080008000ull; | ||
1084 | + a |= 0x8000800080008000ull; | ||
1085 | + b &= ~0x8000800080008000ull; | ||
1086 | + return (a - b) ^ mask; | ||
1087 | +} | ||
1088 | + | ||
1089 | +uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b) | ||
1090 | +{ | ||
1091 | + uint64_t mask; | ||
1092 | + mask = (a ^ ~b) & 0x8000000080000000ull; | ||
1093 | + a |= 0x8000000080000000ull; | ||
1094 | + b &= ~0x8000000080000000ull; | ||
1095 | + return (a - b) ^ mask; | ||
1096 | +} | ||
1097 | + | ||
1098 | +uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b) | ||
1099 | +{ | ||
1100 | + uint32_t x, y; | ||
1101 | + uint32_t low, high; | ||
1102 | + | ||
1103 | + x = a; | ||
1104 | + y = b; | ||
1105 | + low = x + y; | ||
1106 | + if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { | ||
1107 | + SET_QC(); | ||
1108 | + low = ((int32_t)x >> 31) ^ ~SIGNBIT; | ||
1109 | + } | ||
1110 | + x = a >> 32; | ||
1111 | + y = b >> 32; | ||
1112 | + high = x + y; | ||
1113 | + if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { | ||
1114 | + SET_QC(); | ||
1115 | + high = ((int32_t)x >> 31) ^ ~SIGNBIT; | ||
1116 | + } | ||
1117 | + return low | ((uint64_t)high << 32); | ||
1118 | +} | ||
1119 | + | ||
1120 | +uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b) | ||
1121 | +{ | ||
1122 | + uint64_t result; | ||
1123 | + | ||
1124 | + result = a + b; | ||
1125 | + if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) { | ||
1126 | + SET_QC(); | ||
1127 | + result = ((int64_t)a >> 63) ^ ~SIGNBIT64; | ||
1128 | + } | ||
1129 | + return result; | ||
1130 | +} | ||
1131 | + | ||
1132 | +#define DO_ABD(dest, x, y, type) do { \ | ||
1133 | + type tmp_x = x; \ | ||
1134 | + type tmp_y = y; \ | ||
1135 | + dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ | ||
1136 | + } while(0) | ||
1137 | + | ||
1138 | +uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) | ||
1139 | +{ | ||
1140 | + uint64_t tmp; | ||
1141 | + uint64_t result; | ||
1142 | + DO_ABD(result, a, b, uint8_t); | ||
1143 | + DO_ABD(tmp, a >> 8, b >> 8, uint8_t); | ||
1144 | + result |= tmp << 16; | ||
1145 | + DO_ABD(tmp, a >> 16, b >> 16, uint8_t); | ||
1146 | + result |= tmp << 32; | ||
1147 | + DO_ABD(tmp, a >> 24, b >> 24, uint8_t); | ||
1148 | + result |= tmp << 48; | ||
1149 | + return result; | ||
1150 | +} | ||
1151 | + | ||
1152 | +uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) | ||
1153 | +{ | ||
1154 | + uint64_t tmp; | ||
1155 | + uint64_t result; | ||
1156 | + DO_ABD(result, a, b, int8_t); | ||
1157 | + DO_ABD(tmp, a >> 8, b >> 8, int8_t); | ||
1158 | + result |= tmp << 16; | ||
1159 | + DO_ABD(tmp, a >> 16, b >> 16, int8_t); | ||
1160 | + result |= tmp << 32; | ||
1161 | + DO_ABD(tmp, a >> 24, b >> 24, int8_t); | ||
1162 | + result |= tmp << 48; | ||
1163 | + return result; | ||
1164 | +} | ||
1165 | + | ||
1166 | +uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) | ||
1167 | +{ | ||
1168 | + uint64_t tmp; | ||
1169 | + uint64_t result; | ||
1170 | + DO_ABD(result, a, b, uint16_t); | ||
1171 | + DO_ABD(tmp, a >> 16, b >> 16, uint16_t); | ||
1172 | + return result | (tmp << 32); | ||
1173 | +} | ||
1174 | + | ||
1175 | +uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) | ||
1176 | +{ | ||
1177 | + uint64_t tmp; | ||
1178 | + uint64_t result; | ||
1179 | + DO_ABD(result, a, b, int16_t); | ||
1180 | + DO_ABD(tmp, a >> 16, b >> 16, int16_t); | ||
1181 | + return result | (tmp << 32); | ||
1182 | +} | ||
1183 | + | ||
1184 | +uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) | ||
1185 | +{ | ||
1186 | + uint64_t result; | ||
1187 | + DO_ABD(result, a, b, uint32_t); | ||
1188 | + return result; | ||
1189 | +} | ||
1190 | + | ||
1191 | +uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) | ||
1192 | +{ | ||
1193 | + uint64_t result; | ||
1194 | + DO_ABD(result, a, b, int32_t); | ||
1195 | + return result; | ||
1196 | +} | ||
1197 | +#undef DO_ABD | ||
1198 | + | ||
/* Widening multiply.  Named type is the source type.
   x and y are truncated to type1 (the source element type), then both
   are cast to type2 and multiplied; the product is truncated to type2
   before being stored in dest.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)
1205 | + | ||
1206 | +uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b) | ||
1207 | +{ | ||
1208 | + uint64_t tmp; | ||
1209 | + uint64_t result; | ||
1210 | + | ||
1211 | + DO_MULL(result, a, b, uint8_t, uint16_t); | ||
1212 | + DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t); | ||
1213 | + result |= tmp << 16; | ||
1214 | + DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t); | ||
1215 | + result |= tmp << 32; | ||
1216 | + DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t); | ||
1217 | + result |= tmp << 48; | ||
1218 | + return result; | ||
1219 | +} | ||
1220 | + | ||
1221 | +uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b) | ||
1222 | +{ | ||
1223 | + uint64_t tmp; | ||
1224 | + uint64_t result; | ||
1225 | + | ||
1226 | + DO_MULL(result, a, b, int8_t, uint16_t); | ||
1227 | + DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t); | ||
1228 | + result |= tmp << 16; | ||
1229 | + DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t); | ||
1230 | + result |= tmp << 32; | ||
1231 | + DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t); | ||
1232 | + result |= tmp << 48; | ||
1233 | + return result; | ||
1234 | +} | ||
1235 | + | ||
1236 | +uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b) | ||
1237 | +{ | ||
1238 | + uint64_t tmp; | ||
1239 | + uint64_t result; | ||
1240 | + | ||
1241 | + DO_MULL(result, a, b, uint16_t, uint32_t); | ||
1242 | + DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t); | ||
1243 | + return result | (tmp << 32); | ||
1244 | +} | ||
1245 | + | ||
1246 | +uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b) | ||
1247 | +{ | ||
1248 | + uint64_t tmp; | ||
1249 | + uint64_t result; | ||
1250 | + | ||
1251 | + DO_MULL(result, a, b, int16_t, uint32_t); | ||
1252 | + DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t); | ||
1253 | + return result | (tmp << 32); | ||
1254 | +} | ||
1255 | + | ||
1256 | +uint64_t HELPER(neon_negl_u16)(uint64_t x) | ||
1257 | +{ | ||
1258 | + uint16_t tmp; | ||
1259 | + uint64_t result; | ||
1260 | + result = (uint16_t)-x; | ||
1261 | + tmp = -(x >> 16); | ||
1262 | + result |= (uint64_t)tmp << 16; | ||
1263 | + tmp = -(x >> 32); | ||
1264 | + result |= (uint64_t)tmp << 32; | ||
1265 | + tmp = -(x >> 48); | ||
1266 | + result |= (uint64_t)tmp << 48; | ||
1267 | + return result; | ||
1268 | +} | ||
1269 | + | ||
1270 | +#include <stdio.h> | ||
1271 | +uint64_t HELPER(neon_negl_u32)(uint64_t x) | ||
1272 | +{ | ||
1273 | + uint32_t low = -x; | ||
1274 | + uint32_t high = -(x >> 32); | ||
1275 | + return low | ((uint64_t)high << 32); | ||
1276 | +} | ||
1277 | + | ||
1278 | +/* FIXME: There should be a native op for this. */ | ||
1279 | +uint64_t HELPER(neon_negl_u64)(uint64_t x) | ||
1280 | +{ | ||
1281 | + return -x; | ||
1282 | +} | ||
1283 | + | ||
/* Saturating sign manipulation. */
1285 | +/* ??? Make these use NEON_VOP1 */ | ||
1286 | +#define DO_QABS8(x) do { \ | ||
1287 | + if (x == (int8_t)0x80) { \ | ||
1288 | + x = 0x7f; \ | ||
1289 | + SET_QC(); \ | ||
1290 | + } else if (x < 0) { \ | ||
1291 | + x = -x; \ | ||
1292 | + }} while (0) | ||
1293 | +uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x) | ||
1294 | +{ | ||
1295 | + neon_s8 vec; | ||
1296 | + NEON_UNPACK(neon_s8, vec, x); | ||
1297 | + DO_QABS8(vec.v1); | ||
1298 | + DO_QABS8(vec.v2); | ||
1299 | + DO_QABS8(vec.v3); | ||
1300 | + DO_QABS8(vec.v4); | ||
1301 | + NEON_PACK(neon_s8, x, vec); | ||
1302 | + return x; | ||
1303 | +} | ||
1304 | +#undef DO_QABS8 | ||
1305 | + | ||
1306 | +#define DO_QNEG8(x) do { \ | ||
1307 | + if (x == (int8_t)0x80) { \ | ||
1308 | + x = 0x7f; \ | ||
1309 | + SET_QC(); \ | ||
1310 | + } else { \ | ||
1311 | + x = -x; \ | ||
1312 | + }} while (0) | ||
1313 | +uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x) | ||
1314 | +{ | ||
1315 | + neon_s8 vec; | ||
1316 | + NEON_UNPACK(neon_s8, vec, x); | ||
1317 | + DO_QNEG8(vec.v1); | ||
1318 | + DO_QNEG8(vec.v2); | ||
1319 | + DO_QNEG8(vec.v3); | ||
1320 | + DO_QNEG8(vec.v4); | ||
1321 | + NEON_PACK(neon_s8, x, vec); | ||
1322 | + return x; | ||
1323 | +} | ||
1324 | +#undef DO_QNEG8 | ||
1325 | + | ||
1326 | +#define DO_QABS16(x) do { \ | ||
1327 | + if (x == (int16_t)0x8000) { \ | ||
1328 | + x = 0x7fff; \ | ||
1329 | + SET_QC(); \ | ||
1330 | + } else if (x < 0) { \ | ||
1331 | + x = -x; \ | ||
1332 | + }} while (0) | ||
1333 | +uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x) | ||
1334 | +{ | ||
1335 | + neon_s16 vec; | ||
1336 | + NEON_UNPACK(neon_s16, vec, x); | ||
1337 | + DO_QABS16(vec.v1); | ||
1338 | + DO_QABS16(vec.v2); | ||
1339 | + NEON_PACK(neon_s16, x, vec); | ||
1340 | + return x; | ||
1341 | +} | ||
1342 | +#undef DO_QABS16 | ||
1343 | + | ||
1344 | +#define DO_QNEG16(x) do { \ | ||
1345 | + if (x == (int16_t)0x8000) { \ | ||
1346 | + x = 0x7fff; \ | ||
1347 | + SET_QC(); \ | ||
1348 | + } else { \ | ||
1349 | + x = -x; \ | ||
1350 | + }} while (0) | ||
1351 | +uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x) | ||
1352 | +{ | ||
1353 | + neon_s16 vec; | ||
1354 | + NEON_UNPACK(neon_s16, vec, x); | ||
1355 | + DO_QNEG16(vec.v1); | ||
1356 | + DO_QNEG16(vec.v2); | ||
1357 | + NEON_PACK(neon_s16, x, vec); | ||
1358 | + return x; | ||
1359 | +} | ||
1360 | +#undef DO_QNEG16 | ||
1361 | + | ||
1362 | +uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x) | ||
1363 | +{ | ||
1364 | + if (x == SIGNBIT) { | ||
1365 | + SET_QC(); | ||
1366 | + x = ~SIGNBIT; | ||
1367 | + } else if ((int32_t)x < 0) { | ||
1368 | + x = -x; | ||
1369 | + } | ||
1370 | + return x; | ||
1371 | +} | ||
1372 | + | ||
1373 | +uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x) | ||
1374 | +{ | ||
1375 | + if (x == SIGNBIT) { | ||
1376 | + SET_QC(); | ||
1377 | + x = ~SIGNBIT; | ||
1378 | + } else { | ||
1379 | + x = -x; | ||
1380 | + } | ||
1381 | + return x; | ||
1382 | +} | ||
1383 | + | ||
1384 | +/* NEON Float helpers. */ | ||
1385 | +uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b) | ||
1386 | +{ | ||
1387 | + float32 f0 = vfp_itos(a); | ||
1388 | + float32 f1 = vfp_itos(b); | ||
1389 | + return (float32_compare_quiet(f0, f1, NFS) == -1) ? a : b; | ||
1390 | +} | ||
1391 | + | ||
1392 | +uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b) | ||
1393 | +{ | ||
1394 | + float32 f0 = vfp_itos(a); | ||
1395 | + float32 f1 = vfp_itos(b); | ||
1396 | + return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b; | ||
1397 | +} | ||
1398 | + | ||
1399 | +uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b) | ||
1400 | +{ | ||
1401 | + float32 f0 = vfp_itos(a); | ||
1402 | + float32 f1 = vfp_itos(b); | ||
1403 | + return vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1) | ||
1404 | + ? float32_sub(f0, f1, NFS) | ||
1405 | + : float32_sub(f1, f0, NFS)); | ||
1406 | +} | ||
1407 | + | ||
1408 | +uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b) | ||
1409 | +{ | ||
1410 | + return vfp_stoi(float32_add(vfp_itos(a), vfp_itos(b), NFS)); | ||
1411 | +} | ||
1412 | + | ||
1413 | +uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b) | ||
1414 | +{ | ||
1415 | + return vfp_stoi(float32_sub(vfp_itos(a), vfp_itos(b), NFS)); | ||
1416 | +} | ||
1417 | + | ||
1418 | +uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b) | ||
1419 | +{ | ||
1420 | + return vfp_stoi(float32_mul(vfp_itos(a), vfp_itos(b), NFS)); | ||
1421 | +} | ||
1422 | + | ||
1423 | +/* Floating point comparisons produce an integer result. */ | ||
1424 | +#define NEON_VOP_FCMP(name, cmp) \ | ||
1425 | +uint32_t HELPER(neon_##name)(uint32_t a, uint32_t b) \ | ||
1426 | +{ \ | ||
1427 | + if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) cmp 0) \ | ||
1428 | + return ~0; \ | ||
1429 | + else \ | ||
1430 | + return 0; \ | ||
1431 | +} | ||
1432 | + | ||
1433 | +NEON_VOP_FCMP(ceq_f32, ==) | ||
1434 | +NEON_VOP_FCMP(cge_f32, >=) | ||
1435 | +NEON_VOP_FCMP(cgt_f32, >) | ||
1436 | + | ||
1437 | +uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b) | ||
1438 | +{ | ||
1439 | + float32 f0 = float32_abs(vfp_itos(a)); | ||
1440 | + float32 f1 = float32_abs(vfp_itos(b)); | ||
1441 | + return (float32_compare_quiet(f0, f1,NFS) >= 0) ? ~0 : 0; | ||
1442 | +} | ||
1443 | + | ||
1444 | +uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b) | ||
1445 | +{ | ||
1446 | + float32 f0 = float32_abs(vfp_itos(a)); | ||
1447 | + float32 f1 = float32_abs(vfp_itos(b)); | ||
1448 | + return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0; | ||
1449 | +} |
target-arm/op.c
target-arm/op_helper.c
@@ -20,6 +20,9 @@ | @@ -20,6 +20,9 @@ | ||
20 | #include "exec.h" | 20 | #include "exec.h" |
21 | #include "helpers.h" | 21 | #include "helpers.h" |
22 | 22 | ||
23 | +#define SIGNBIT (uint32_t)0x80000000 | ||
24 | +#define SIGNBIT64 ((uint64_t)1 << 63) | ||
25 | + | ||
23 | void raise_exception(int tt) | 26 | void raise_exception(int tt) |
24 | { | 27 | { |
25 | env->exception_index = tt; | 28 | env->exception_index = tt; |
@@ -116,7 +119,8 @@ void tlb_fill (target_ulong addr, int is_write, int mmu_idx, void *retaddr) | @@ -116,7 +119,8 @@ void tlb_fill (target_ulong addr, int is_write, int mmu_idx, void *retaddr) | ||
116 | } | 119 | } |
117 | #endif | 120 | #endif |
118 | 121 | ||
119 | -#define SIGNBIT (uint32_t)0x80000000 | 122 | +/* FIXME: Pass an explicit pointer to QF to CPUState, and move saturating |
123 | + instructions into helper.c */ | ||
120 | uint32_t HELPER(add_setq)(uint32_t a, uint32_t b) | 124 | uint32_t HELPER(add_setq)(uint32_t a, uint32_t b) |
121 | { | 125 | { |
122 | uint32_t res = a + b; | 126 | uint32_t res = a + b; |
@@ -451,3 +455,114 @@ uint32_t HELPER(ror_cc)(uint32_t x, uint32_t i) | @@ -451,3 +455,114 @@ uint32_t HELPER(ror_cc)(uint32_t x, uint32_t i) | ||
451 | } | 455 | } |
452 | } | 456 | } |
453 | 457 | ||
458 | +uint64_t HELPER(neon_add_saturate_s64)(uint64_t src1, uint64_t src2) | ||
459 | +{ | ||
460 | + uint64_t res; | ||
461 | + | ||
462 | + res = src1 + src2; | ||
463 | + if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) { | ||
464 | + env->QF = 1; | ||
465 | + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; | ||
466 | + } | ||
467 | + return res; | ||
468 | +} | ||
469 | + | ||
470 | +uint64_t HELPER(neon_add_saturate_u64)(uint64_t src1, uint64_t src2) | ||
471 | +{ | ||
472 | + uint64_t res; | ||
473 | + | ||
474 | + res = src1 + src2; | ||
475 | + if (res < src1) { | ||
476 | + env->QF = 1; | ||
477 | + res = ~(uint64_t)0; | ||
478 | + } | ||
479 | + return res; | ||
480 | +} | ||
481 | + | ||
482 | +uint64_t HELPER(neon_sub_saturate_s64)(uint64_t src1, uint64_t src2) | ||
483 | +{ | ||
484 | + uint64_t res; | ||
485 | + | ||
486 | + res = src1 - src2; | ||
487 | + if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) { | ||
488 | + env->QF = 1; | ||
489 | + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; | ||
490 | + } | ||
491 | + return res; | ||
492 | +} | ||
493 | + | ||
494 | +uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2) | ||
495 | +{ | ||
496 | + uint64_t res; | ||
497 | + | ||
498 | + if (src1 < src2) { | ||
499 | + env->QF = 1; | ||
500 | + res = 0; | ||
501 | + } else { | ||
502 | + res = src1 - src2; | ||
503 | + } | ||
504 | + return res; | ||
505 | +} | ||
506 | + | ||
507 | +/* These need to return a pair of values, so still use T0/T1. */ | ||
508 | +/* Transpose. Argument order is rather strange to avoid special casing | ||
509 | + the translation code. | ||
510 | + On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */ | ||
511 | +void HELPER(neon_trn_u8)(void) | ||
512 | +{ | ||
513 | + uint32_t rd; | ||
514 | + uint32_t rm; | ||
515 | + rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff); | ||
516 | + rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00); | ||
517 | + T0 = rd; | ||
518 | + T1 = rm; | ||
519 | + FORCE_RET(); | ||
520 | +} | ||
521 | + | ||
522 | +void HELPER(neon_trn_u16)(void) | ||
523 | +{ | ||
524 | + uint32_t rd; | ||
525 | + uint32_t rm; | ||
526 | + rd = (T0 << 16) | (T1 & 0xffff); | ||
527 | + rm = (T1 >> 16) | (T0 & 0xffff0000); | ||
528 | + T0 = rd; | ||
529 | + T1 = rm; | ||
530 | + FORCE_RET(); | ||
531 | +} | ||
532 | + | ||
533 | +/* Worker routines for zip and unzip. */ | ||
534 | +void HELPER(neon_unzip_u8)(void) | ||
535 | +{ | ||
536 | + uint32_t rd; | ||
537 | + uint32_t rm; | ||
538 | + rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00) | ||
539 | + | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000); | ||
540 | + rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00) | ||
541 | + | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000); | ||
542 | + T0 = rd; | ||
543 | + T1 = rm; | ||
544 | + FORCE_RET(); | ||
545 | +} | ||
546 | + | ||
547 | +void HELPER(neon_zip_u8)(void) | ||
548 | +{ | ||
549 | + uint32_t rd; | ||
550 | + uint32_t rm; | ||
551 | + rd = (T0 & 0xff) | ((T1 << 8) & 0xff00) | ||
552 | + | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000); | ||
553 | + rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00) | ||
554 | + | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000); | ||
555 | + T0 = rd; | ||
556 | + T1 = rm; | ||
557 | + FORCE_RET(); | ||
558 | +} | ||
559 | + | ||
560 | +void HELPER(neon_zip_u16)(void) | ||
561 | +{ | ||
562 | + uint32_t tmp; | ||
563 | + | ||
564 | + tmp = (T0 & 0xffff) | (T1 << 16); | ||
565 | + T1 = (T1 & 0xffff0000) | (T0 >> 16); | ||
566 | + T0 = tmp; | ||
567 | + FORCE_RET(); | ||
568 | +} |
target-arm/op_neon.h deleted
100644 → 0
1 | -/* | ||
2 | - * ARM NEON vector operations. | ||
3 | - * | ||
4 | - * Copyright (c) 2007 CodeSourcery. | ||
5 | - * Written by Paul Brook | ||
6 | - * | ||
7 | - * This code is licenced under the GPL. | ||
8 | - */ | ||
9 | -/* Note that for NEON an "l" prefix means it is a wide operation, unlike | ||
10 | - scalar arm ops where it means a word size operation. */ | ||
11 | - | ||
12 | -#define SIGNBIT (uint32_t)0x80000000 | ||
13 | -/* ??? NEON ops should probably have their own float status. */ | ||
14 | -#define NFS &env->vfp.fp_status | ||
15 | -#define NEON_OP(name) void OPPROTO op_neon_##name (void) | ||
16 | - | ||
17 | -/* Helper routines to perform bitwise copies between float and int. */ | ||
18 | -static inline float32 vfp_itos(uint32_t i) | ||
19 | -{ | ||
20 | - union { | ||
21 | - uint32_t i; | ||
22 | - float32 s; | ||
23 | - } v; | ||
24 | - | ||
25 | - v.i = i; | ||
26 | - return v.s; | ||
27 | -} | ||
28 | - | ||
29 | -static inline uint32_t vfp_stoi(float32 s) | ||
30 | -{ | ||
31 | - union { | ||
32 | - uint32_t i; | ||
33 | - float32 s; | ||
34 | - } v; | ||
35 | - | ||
36 | - v.s = s; | ||
37 | - return v.i; | ||
38 | -} | ||
39 | - | ||
40 | -NEON_OP(getreg_T0) | ||
41 | -{ | ||
42 | - T0 = *(uint32_t *)((char *) env + PARAM1); | ||
43 | -} | ||
44 | - | ||
45 | -NEON_OP(getreg_T1) | ||
46 | -{ | ||
47 | - T1 = *(uint32_t *)((char *) env + PARAM1); | ||
48 | -} | ||
49 | - | ||
50 | -NEON_OP(setreg_T0) | ||
51 | -{ | ||
52 | - *(uint32_t *)((char *) env + PARAM1) = T0; | ||
53 | -} | ||
54 | - | ||
55 | -NEON_OP(setreg_T1) | ||
56 | -{ | ||
57 | - *(uint32_t *)((char *) env + PARAM1) = T1; | ||
58 | -} | ||
59 | - | ||
60 | -#define NEON_TYPE1(name, type) \ | ||
61 | -typedef struct \ | ||
62 | -{ \ | ||
63 | - type v1; \ | ||
64 | -} neon_##name; | ||
65 | -#ifdef WORDS_BIGENDIAN | ||
66 | -#define NEON_TYPE2(name, type) \ | ||
67 | -typedef struct \ | ||
68 | -{ \ | ||
69 | - type v2; \ | ||
70 | - type v1; \ | ||
71 | -} neon_##name; | ||
72 | -#define NEON_TYPE4(name, type) \ | ||
73 | -typedef struct \ | ||
74 | -{ \ | ||
75 | - type v4; \ | ||
76 | - type v3; \ | ||
77 | - type v2; \ | ||
78 | - type v1; \ | ||
79 | -} neon_##name; | ||
80 | -#else | ||
81 | -#define NEON_TYPE2(name, type) \ | ||
82 | -typedef struct \ | ||
83 | -{ \ | ||
84 | - type v1; \ | ||
85 | - type v2; \ | ||
86 | -} neon_##name; | ||
87 | -#define NEON_TYPE4(name, type) \ | ||
88 | -typedef struct \ | ||
89 | -{ \ | ||
90 | - type v1; \ | ||
91 | - type v2; \ | ||
92 | - type v3; \ | ||
93 | - type v4; \ | ||
94 | -} neon_##name; | ||
95 | -#endif | ||
96 | - | ||
97 | -NEON_TYPE4(s8, int8_t) | ||
98 | -NEON_TYPE4(u8, uint8_t) | ||
99 | -NEON_TYPE2(s16, int16_t) | ||
100 | -NEON_TYPE2(u16, uint16_t) | ||
101 | -NEON_TYPE1(s32, int32_t) | ||
102 | -NEON_TYPE1(u32, uint32_t) | ||
103 | -#undef NEON_TYPE4 | ||
104 | -#undef NEON_TYPE2 | ||
105 | -#undef NEON_TYPE1 | ||
106 | - | ||
107 | -/* Copy from a uint32_t to a vector structure type. */ | ||
108 | -#define NEON_UNPACK(vtype, dest, val) do { \ | ||
109 | - union { \ | ||
110 | - vtype v; \ | ||
111 | - uint32_t i; \ | ||
112 | - } conv_u; \ | ||
113 | - conv_u.i = (val); \ | ||
114 | - dest = conv_u.v; \ | ||
115 | - } while(0) | ||
116 | - | ||
117 | -/* Copy from a vector structure type to a uint32_t. */ | ||
118 | -#define NEON_PACK(vtype, dest, val) do { \ | ||
119 | - union { \ | ||
120 | - vtype v; \ | ||
121 | - uint32_t i; \ | ||
122 | - } conv_u; \ | ||
123 | - conv_u.v = (val); \ | ||
124 | - dest = conv_u.i; \ | ||
125 | - } while(0) | ||
126 | - | ||
127 | -#define NEON_DO1 \ | ||
128 | - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); | ||
129 | -#define NEON_DO2 \ | ||
130 | - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ | ||
131 | - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); | ||
132 | -#define NEON_DO4 \ | ||
133 | - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ | ||
134 | - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \ | ||
135 | - NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \ | ||
136 | - NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4); | ||
137 | - | ||
138 | -#define NEON_VOP(name, vtype, n) \ | ||
139 | -NEON_OP(name) \ | ||
140 | -{ \ | ||
141 | - vtype vsrc1; \ | ||
142 | - vtype vsrc2; \ | ||
143 | - vtype vdest; \ | ||
144 | - NEON_UNPACK(vtype, vsrc1, T0); \ | ||
145 | - NEON_UNPACK(vtype, vsrc2, T1); \ | ||
146 | - NEON_DO##n; \ | ||
147 | - NEON_PACK(vtype, T0, vdest); \ | ||
148 | - FORCE_RET(); \ | ||
149 | -} | ||
150 | - | ||
151 | -#define NEON_VOP1(name, vtype, n) \ | ||
152 | -NEON_OP(name) \ | ||
153 | -{ \ | ||
154 | - vtype vsrc1; \ | ||
155 | - vtype vdest; \ | ||
156 | - NEON_UNPACK(vtype, vsrc1, T0); \ | ||
157 | - NEON_DO##n; \ | ||
158 | - NEON_PACK(vtype, T0, vdest); \ | ||
159 | - FORCE_RET(); \ | ||
160 | -} | ||
161 | - | ||
162 | -/* Pairwise operations. */ | ||
163 | -/* For 32-bit elements each segment only contains a single element, so | ||
164 | - the elementwise and pairwise operations are the same. */ | ||
165 | -#define NEON_PDO2 \ | ||
166 | - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ | ||
167 | - NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); | ||
168 | -#define NEON_PDO4 \ | ||
169 | - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ | ||
170 | - NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ | ||
171 | - NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ | ||
172 | - NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ | ||
173 | - | ||
174 | -#define NEON_POP(name, vtype, n) \ | ||
175 | -NEON_OP(name) \ | ||
176 | -{ \ | ||
177 | - vtype vsrc1; \ | ||
178 | - vtype vsrc2; \ | ||
179 | - vtype vdest; \ | ||
180 | - NEON_UNPACK(vtype, vsrc1, T0); \ | ||
181 | - NEON_UNPACK(vtype, vsrc2, T1); \ | ||
182 | - NEON_PDO##n; \ | ||
183 | - NEON_PACK(vtype, T0, vdest); \ | ||
184 | - FORCE_RET(); \ | ||
185 | -} | ||
186 | - | ||
187 | -#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1 | ||
188 | -NEON_VOP(hadd_s8, neon_s8, 4) | ||
189 | -NEON_VOP(hadd_u8, neon_u8, 4) | ||
190 | -NEON_VOP(hadd_s16, neon_s16, 2) | ||
191 | -NEON_VOP(hadd_u16, neon_u16, 2) | ||
192 | -#undef NEON_FN | ||
193 | - | ||
194 | -NEON_OP(hadd_s32) | ||
195 | -{ | ||
196 | - int32_t src1 = T0; | ||
197 | - int32_t src2 = T1; | ||
198 | - int32_t dest; | ||
199 | - | ||
200 | - dest = (src1 >> 1) + (src2 >> 1); | ||
201 | - if (src1 & src2 & 1) | ||
202 | - dest++; | ||
203 | - T0 = dest; | ||
204 | - FORCE_RET(); | ||
205 | -} | ||
206 | - | ||
207 | -NEON_OP(hadd_u32) | ||
208 | -{ | ||
209 | - uint32_t src1 = T0; | ||
210 | - uint32_t src2 = T1; | ||
211 | - uint32_t dest; | ||
212 | - | ||
213 | - dest = (src1 >> 1) + (src2 >> 1); | ||
214 | - if (src1 & src2 & 1) | ||
215 | - dest++; | ||
216 | - T0 = dest; | ||
217 | - FORCE_RET(); | ||
218 | -} | ||
219 | - | ||
220 | -#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 | ||
221 | -NEON_VOP(rhadd_s8, neon_s8, 4) | ||
222 | -NEON_VOP(rhadd_u8, neon_u8, 4) | ||
223 | -NEON_VOP(rhadd_s16, neon_s16, 2) | ||
224 | -NEON_VOP(rhadd_u16, neon_u16, 2) | ||
225 | -#undef NEON_FN | ||
226 | - | ||
227 | -NEON_OP(rhadd_s32) | ||
228 | -{ | ||
229 | - int32_t src1 = T0; | ||
230 | - int32_t src2 = T1; | ||
231 | - int32_t dest; | ||
232 | - | ||
233 | - dest = (src1 >> 1) + (src2 >> 1); | ||
234 | - if ((src1 | src2) & 1) | ||
235 | - dest++; | ||
236 | - T0 = dest; | ||
237 | - FORCE_RET(); | ||
238 | -} | ||
239 | - | ||
240 | -NEON_OP(rhadd_u32) | ||
241 | -{ | ||
242 | - uint32_t src1 = T0; | ||
243 | - uint32_t src2 = T1; | ||
244 | - uint32_t dest; | ||
245 | - | ||
246 | - dest = (src1 >> 1) + (src2 >> 1); | ||
247 | - if ((src1 | src2) & 1) | ||
248 | - dest++; | ||
249 | - T0 = dest; | ||
250 | - FORCE_RET(); | ||
251 | -} | ||
252 | - | ||
253 | -#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1 | ||
254 | -NEON_VOP(hsub_s8, neon_s8, 4) | ||
255 | -NEON_VOP(hsub_u8, neon_u8, 4) | ||
256 | -NEON_VOP(hsub_s16, neon_s16, 2) | ||
257 | -NEON_VOP(hsub_u16, neon_u16, 2) | ||
258 | -#undef NEON_FN | ||
259 | - | ||
260 | -NEON_OP(hsub_s32) | ||
261 | -{ | ||
262 | - int32_t src1 = T0; | ||
263 | - int32_t src2 = T1; | ||
264 | - int32_t dest; | ||
265 | - | ||
266 | - dest = (src1 >> 1) - (src2 >> 1); | ||
267 | - if ((~src1) & src2 & 1) | ||
268 | - dest--; | ||
269 | - T0 = dest; | ||
270 | - FORCE_RET(); | ||
271 | -} | ||
272 | - | ||
273 | -NEON_OP(hsub_u32) | ||
274 | -{ | ||
275 | - uint32_t src1 = T0; | ||
276 | - uint32_t src2 = T1; | ||
277 | - uint32_t dest; | ||
278 | - | ||
279 | - dest = (src1 >> 1) - (src2 >> 1); | ||
280 | - if ((~src1) & src2 & 1) | ||
281 | - dest--; | ||
282 | - T0 = dest; | ||
283 | - FORCE_RET(); | ||
284 | -} | ||
285 | - | ||
286 | -#define NEON_USAT(dest, src1, src2, type) do { \ | ||
287 | - uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ | ||
288 | - if (tmp != (type)tmp) { \ | ||
289 | - env->QF = 1; \ | ||
290 | - dest = ~0; \ | ||
291 | - } else { \ | ||
292 | - dest = tmp; \ | ||
293 | - }} while(0) | ||
294 | -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) | ||
295 | -NEON_VOP(qadd_u8, neon_u8, 4) | ||
296 | -#undef NEON_FN | ||
297 | -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) | ||
298 | -NEON_VOP(qadd_u16, neon_u16, 2) | ||
299 | -#undef NEON_FN | ||
300 | -#undef NEON_USAT | ||
301 | - | ||
302 | -#define NEON_SSAT(dest, src1, src2, type) do { \ | ||
303 | - int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ | ||
304 | - if (tmp != (type)tmp) { \ | ||
305 | - env->QF = 1; \ | ||
306 | - if (src2 > 0) { \ | ||
307 | - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ | ||
308 | - } else { \ | ||
309 | - tmp = 1 << (sizeof(type) * 8 - 1); \ | ||
310 | - } \ | ||
311 | - } \ | ||
312 | - dest = tmp; \ | ||
313 | - } while(0) | ||
314 | -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) | ||
315 | -NEON_VOP(qadd_s8, neon_s8, 4) | ||
316 | -#undef NEON_FN | ||
317 | -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) | ||
318 | -NEON_VOP(qadd_s16, neon_s16, 2) | ||
319 | -#undef NEON_FN | ||
320 | -#undef NEON_SSAT | ||
321 | - | ||
322 | -#define NEON_USAT(dest, src1, src2, type) do { \ | ||
323 | - uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ | ||
324 | - if (tmp != (type)tmp) { \ | ||
325 | - env->QF = 1; \ | ||
326 | - dest = 0; \ | ||
327 | - } else { \ | ||
328 | - dest = tmp; \ | ||
329 | - }} while(0) | ||
330 | -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) | ||
331 | -NEON_VOP(qsub_u8, neon_u8, 4) | ||
332 | -#undef NEON_FN | ||
333 | -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) | ||
334 | -NEON_VOP(qsub_u16, neon_u16, 2) | ||
335 | -#undef NEON_FN | ||
336 | -#undef NEON_USAT | ||
337 | - | ||
338 | -#define NEON_SSAT(dest, src1, src2, type) do { \ | ||
339 | - int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ | ||
340 | - if (tmp != (type)tmp) { \ | ||
341 | - env->QF = 1; \ | ||
342 | - if (src2 < 0) { \ | ||
343 | - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ | ||
344 | - } else { \ | ||
345 | - tmp = 1 << (sizeof(type) * 8 - 1); \ | ||
346 | - } \ | ||
347 | - } \ | ||
348 | - dest = tmp; \ | ||
349 | - } while(0) | ||
350 | -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) | ||
351 | -NEON_VOP(qsub_s8, neon_s8, 4) | ||
352 | -#undef NEON_FN | ||
353 | -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) | ||
354 | -NEON_VOP(qsub_s16, neon_s16, 2) | ||
355 | -#undef NEON_FN | ||
356 | -#undef NEON_SSAT | ||
357 | - | ||
358 | -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0 | ||
359 | -NEON_VOP(cgt_s8, neon_s8, 4) | ||
360 | -NEON_VOP(cgt_u8, neon_u8, 4) | ||
361 | -NEON_VOP(cgt_s16, neon_s16, 2) | ||
362 | -NEON_VOP(cgt_u16, neon_u16, 2) | ||
363 | -NEON_VOP(cgt_s32, neon_s32, 1) | ||
364 | -NEON_VOP(cgt_u32, neon_u32, 1) | ||
365 | -#undef NEON_FN | ||
366 | - | ||
367 | -#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0 | ||
368 | -NEON_VOP(cge_s8, neon_s8, 4) | ||
369 | -NEON_VOP(cge_u8, neon_u8, 4) | ||
370 | -NEON_VOP(cge_s16, neon_s16, 2) | ||
371 | -NEON_VOP(cge_u16, neon_u16, 2) | ||
372 | -NEON_VOP(cge_s32, neon_s32, 1) | ||
373 | -NEON_VOP(cge_u32, neon_u32, 1) | ||
374 | -#undef NEON_FN | ||
375 | - | ||
376 | -#define NEON_FN(dest, src1, src2) do { \ | ||
377 | - int8_t tmp; \ | ||
378 | - tmp = (int8_t)src2; \ | ||
379 | - if (tmp < 0) { \ | ||
380 | - dest = src1 >> -tmp; \ | ||
381 | - } else { \ | ||
382 | - dest = src1 << tmp; \ | ||
383 | - }} while (0) | ||
384 | -NEON_VOP(shl_s8, neon_s8, 4) | ||
385 | -NEON_VOP(shl_u8, neon_u8, 4) | ||
386 | -NEON_VOP(shl_s16, neon_s16, 2) | ||
387 | -NEON_VOP(shl_u16, neon_u16, 2) | ||
388 | -NEON_VOP(shl_s32, neon_s32, 1) | ||
389 | -NEON_VOP(shl_u32, neon_u32, 1) | ||
390 | -#undef NEON_FN | ||
391 | - | ||
392 | -NEON_OP(shl_u64) | ||
393 | -{ | ||
394 | - int8_t shift = env->vfp.scratch[0]; | ||
395 | - uint64_t val = T0 | ((uint64_t)T1 << 32); | ||
396 | - if (shift < 0) { | ||
397 | - val >>= -shift; | ||
398 | - } else { | ||
399 | - val <<= shift; | ||
400 | - } | ||
401 | - T0 = val; | ||
402 | - T1 = val >> 32; | ||
403 | - FORCE_RET(); | ||
404 | -} | ||
405 | - | ||
406 | -NEON_OP(shl_s64) | ||
407 | -{ | ||
408 | - int8_t shift = env->vfp.scratch[0]; | ||
409 | - int64_t val = T0 | ((uint64_t)T1 << 32); | ||
410 | - if (shift < 0) { | ||
411 | - val >>= -shift; | ||
412 | - } else { | ||
413 | - val <<= shift; | ||
414 | - } | ||
415 | - T0 = val; | ||
416 | - T1 = val >> 32; | ||
417 | - FORCE_RET(); | ||
418 | -} | ||
419 | - | ||
420 | -#define NEON_FN(dest, src1, src2) do { \ | ||
421 | - int8_t tmp; \ | ||
422 | - tmp = (int8_t)src1; \ | ||
423 | - if (tmp < 0) { \ | ||
424 | - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \ | ||
425 | - } else { \ | ||
426 | - dest = src2 << tmp; \ | ||
427 | - }} while (0) | ||
428 | - | ||
429 | -NEON_VOP(rshl_s8, neon_s8, 4) | ||
430 | -NEON_VOP(rshl_u8, neon_u8, 4) | ||
431 | -NEON_VOP(rshl_s16, neon_s16, 2) | ||
432 | -NEON_VOP(rshl_u16, neon_u16, 2) | ||
433 | -NEON_VOP(rshl_s32, neon_s32, 1) | ||
434 | -NEON_VOP(rshl_u32, neon_u32, 1) | ||
435 | -#undef NEON_FN | ||
436 | - | ||
437 | -NEON_OP(rshl_u64) | ||
438 | -{ | ||
439 | - int8_t shift = env->vfp.scratch[0]; | ||
440 | - uint64_t val = T0 | ((uint64_t)T1 << 32); | ||
441 | - if (shift < 0) { | ||
442 | - val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift; | ||
443 | - val >>= -shift; | ||
444 | - } else { | ||
445 | - val <<= shift; | ||
446 | - } | ||
447 | - T0 = val; | ||
448 | - T1 = val >> 32; | ||
449 | - FORCE_RET(); | ||
450 | -} | ||
451 | - | ||
452 | -NEON_OP(rshl_s64) | ||
453 | -{ | ||
454 | - int8_t shift = env->vfp.scratch[0]; | ||
455 | - int64_t val = T0 | ((uint64_t)T1 << 32); | ||
456 | - if (shift < 0) { | ||
457 | - val = (val + ((int64_t)1 << (-1 - shift))) >> -shift; | ||
458 | - } else { | ||
459 | - val <<= shift; | ||
460 | - } | ||
461 | - T0 = val; | ||
462 | - T1 = val >> 32; | ||
463 | - FORCE_RET(); | ||
464 | -} | ||
465 | - | ||
466 | -#define NEON_FN(dest, src1, src2) do { \ | ||
467 | - int8_t tmp; \ | ||
468 | - tmp = (int8_t)src1; \ | ||
469 | - if (tmp < 0) { \ | ||
470 | - dest = src2 >> -tmp; \ | ||
471 | - } else { \ | ||
472 | - dest = src2 << tmp; \ | ||
473 | - if ((dest >> tmp) != src2) { \ | ||
474 | - env->QF = 1; \ | ||
475 | - dest = ~0; \ | ||
476 | - } \ | ||
477 | - }} while (0) | ||
478 | -NEON_VOP(qshl_s8, neon_s8, 4) | ||
479 | -NEON_VOP(qshl_s16, neon_s16, 2) | ||
480 | -NEON_VOP(qshl_s32, neon_s32, 1) | ||
481 | -#undef NEON_FN | ||
482 | - | ||
483 | -NEON_OP(qshl_s64) | ||
484 | -{ | ||
485 | - int8_t shift = env->vfp.scratch[0]; | ||
486 | - int64_t val = T0 | ((uint64_t)T1 << 32); | ||
487 | - if (shift < 0) { | ||
488 | - val >>= -shift; | ||
489 | - } else { | ||
490 | - int64_t tmp = val; | ||
491 | - val <<= shift; | ||
492 | - if ((val >> shift) != tmp) { | ||
493 | - env->QF = 1; | ||
494 | - val = (tmp >> 63) ^ 0x7fffffffffffffffULL; | ||
495 | - } | ||
496 | - } | ||
497 | - T0 = val; | ||
498 | - T1 = val >> 32; | ||
499 | - FORCE_RET(); | ||
500 | -} | ||
501 | - | ||
502 | -#define NEON_FN(dest, src1, src2) do { \ | ||
503 | - int8_t tmp; \ | ||
504 | - tmp = (int8_t)src1; \ | ||
505 | - if (tmp < 0) { \ | ||
506 | - dest = src2 >> -tmp; \ | ||
507 | - } else { \ | ||
508 | - dest = src2 << tmp; \ | ||
509 | - if ((dest >> tmp) != src2) { \ | ||
510 | - env->QF = 1; \ | ||
511 | - dest = src2 >> 31; \ | ||
512 | - } \ | ||
513 | - }} while (0) | ||
514 | -NEON_VOP(qshl_u8, neon_u8, 4) | ||
515 | -NEON_VOP(qshl_u16, neon_u16, 2) | ||
516 | -NEON_VOP(qshl_u32, neon_u32, 1) | ||
517 | -#undef NEON_FN | ||
518 | - | ||
519 | -NEON_OP(qshl_u64) | ||
520 | -{ | ||
521 | - int8_t shift = env->vfp.scratch[0]; | ||
522 | - uint64_t val = T0 | ((uint64_t)T1 << 32); | ||
523 | - if (shift < 0) { | ||
524 | - val >>= -shift; | ||
525 | - } else { | ||
526 | - uint64_t tmp = val; | ||
527 | - val <<= shift; | ||
528 | - if ((val >> shift) != tmp) { | ||
529 | - env->QF = 1; | ||
530 | - val = ~(uint64_t)0; | ||
531 | - } | ||
532 | - } | ||
533 | - T0 = val; | ||
534 | - T1 = val >> 32; | ||
535 | - FORCE_RET(); | ||
536 | -} | ||
537 | - | ||
538 | -#define NEON_FN(dest, src1, src2) do { \ | ||
539 | - int8_t tmp; \ | ||
540 | - tmp = (int8_t)src1; \ | ||
541 | - if (tmp < 0) { \ | ||
542 | - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \ | ||
543 | - } else { \ | ||
544 | - dest = src2 << tmp; \ | ||
545 | - if ((dest >> tmp) != src2) { \ | ||
546 | - dest = ~0; \ | ||
547 | - } \ | ||
548 | - }} while (0) | ||
549 | -NEON_VOP(qrshl_s8, neon_s8, 4) | ||
550 | -NEON_VOP(qrshl_s16, neon_s16, 2) | ||
551 | -NEON_VOP(qrshl_s32, neon_s32, 1) | ||
552 | -#undef NEON_FN | ||
553 | - | ||
554 | -#define NEON_FN(dest, src1, src2) do { \ | ||
555 | - int8_t tmp; \ | ||
556 | - tmp = (int8_t)src1; \ | ||
557 | - if (tmp < 0) { \ | ||
558 | - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \ | ||
559 | - } else { \ | ||
560 | - dest = src2 << tmp; \ | ||
561 | - if ((dest >> tmp) != src2) { \ | ||
562 | - env->QF = 1; \ | ||
563 | - dest = src2 >> 31; \ | ||
564 | - } \ | ||
565 | - }} while (0) | ||
566 | -NEON_VOP(qrshl_u8, neon_u8, 4) | ||
567 | -NEON_VOP(qrshl_u16, neon_u16, 2) | ||
568 | -NEON_VOP(qrshl_u32, neon_u32, 1) | ||
569 | -#undef NEON_FN | ||
570 | - | ||
571 | -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2 | ||
572 | -NEON_VOP(max_s8, neon_s8, 4) | ||
573 | -NEON_VOP(max_u8, neon_u8, 4) | ||
574 | -NEON_VOP(max_s16, neon_s16, 2) | ||
575 | -NEON_VOP(max_u16, neon_u16, 2) | ||
576 | -NEON_VOP(max_s32, neon_s32, 1) | ||
577 | -NEON_VOP(max_u32, neon_u32, 1) | ||
578 | -NEON_POP(pmax_s8, neon_s8, 4) | ||
579 | -NEON_POP(pmax_u8, neon_u8, 4) | ||
580 | -NEON_POP(pmax_s16, neon_s16, 2) | ||
581 | -NEON_POP(pmax_u16, neon_u16, 2) | ||
582 | -#undef NEON_FN | ||
583 | - | ||
584 | -NEON_OP(max_f32) | ||
585 | -{ | ||
586 | - float32 f0 = vfp_itos(T0); | ||
587 | - float32 f1 = vfp_itos(T1); | ||
588 | - T0 = (float32_compare_quiet(f0, f1, NFS) == 1) ? T0 : T1; | ||
589 | - FORCE_RET(); | ||
590 | -} | ||
591 | - | ||
592 | -#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 | ||
593 | -NEON_VOP(min_s8, neon_s8, 4) | ||
594 | -NEON_VOP(min_u8, neon_u8, 4) | ||
595 | -NEON_VOP(min_s16, neon_s16, 2) | ||
596 | -NEON_VOP(min_u16, neon_u16, 2) | ||
597 | -NEON_VOP(min_s32, neon_s32, 1) | ||
598 | -NEON_VOP(min_u32, neon_u32, 1) | ||
599 | -NEON_POP(pmin_s8, neon_s8, 4) | ||
600 | -NEON_POP(pmin_u8, neon_u8, 4) | ||
601 | -NEON_POP(pmin_s16, neon_s16, 2) | ||
602 | -NEON_POP(pmin_u16, neon_u16, 2) | ||
603 | -#undef NEON_FN | ||
604 | - | ||
605 | -NEON_OP(min_f32) | ||
606 | -{ | ||
607 | - float32 f0 = vfp_itos(T0); | ||
608 | - float32 f1 = vfp_itos(T1); | ||
609 | - T0 = (float32_compare_quiet(f0, f1, NFS) == -1) ? T0 : T1; | ||
610 | - FORCE_RET(); | ||
611 | -} | ||
612 | - | ||
613 | -#define NEON_FN(dest, src1, src2) \ | ||
614 | - dest = (src1 > src2) ? (src1 - src2) : (src2 - src1) | ||
615 | -NEON_VOP(abd_s8, neon_s8, 4) | ||
616 | -NEON_VOP(abd_u8, neon_u8, 4) | ||
617 | -NEON_VOP(abd_s16, neon_s16, 2) | ||
618 | -NEON_VOP(abd_u16, neon_u16, 2) | ||
619 | -NEON_VOP(abd_s32, neon_s32, 1) | ||
620 | -NEON_VOP(abd_u32, neon_u32, 1) | ||
621 | -#undef NEON_FN | ||
622 | - | ||
623 | -NEON_OP(abd_f32) | ||
624 | -{ | ||
625 | - float32 f0 = vfp_itos(T0); | ||
626 | - float32 f1 = vfp_itos(T1); | ||
627 | - T0 = vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1) | ||
628 | - ? float32_sub(f0, f1, NFS) | ||
629 | - : float32_sub(f1, f0, NFS)); | ||
630 | - FORCE_RET(); | ||
631 | -} | ||
632 | - | ||
633 | -#define NEON_FN(dest, src1, src2) dest = src1 + src2 | ||
634 | -NEON_VOP(add_u8, neon_u8, 4) | ||
635 | -NEON_VOP(add_u16, neon_u16, 2) | ||
636 | -NEON_POP(padd_u8, neon_u8, 4) | ||
637 | -NEON_POP(padd_u16, neon_u16, 2) | ||
638 | -#undef NEON_FN | ||
639 | - | ||
640 | -NEON_OP(add_f32) | ||
641 | -{ | ||
642 | - T0 = vfp_stoi(float32_add(vfp_itos(T0), vfp_itos(T1), NFS)); | ||
643 | - FORCE_RET(); | ||
644 | -} | ||
645 | - | ||
646 | -#define NEON_FN(dest, src1, src2) dest = src1 - src2 | ||
647 | -NEON_VOP(sub_u8, neon_u8, 4) | ||
648 | -NEON_VOP(sub_u16, neon_u16, 2) | ||
649 | -#undef NEON_FN | ||
650 | - | ||
651 | -NEON_OP(sub_f32) | ||
652 | -{ | ||
653 | - T0 = vfp_stoi(float32_sub(vfp_itos(T0), vfp_itos(T1), NFS)); | ||
654 | - FORCE_RET(); | ||
655 | -} | ||
656 | - | ||
657 | -#define NEON_FN(dest, src1, src2) dest = src2 - src1 | ||
658 | -NEON_VOP(rsb_u8, neon_u8, 4) | ||
659 | -NEON_VOP(rsb_u16, neon_u16, 2) | ||
660 | -#undef NEON_FN | ||
661 | - | ||
662 | -NEON_OP(rsb_f32) | ||
663 | -{ | ||
664 | - T0 = vfp_stoi(float32_sub(vfp_itos(T1), vfp_itos(T0), NFS)); | ||
665 | - FORCE_RET(); | ||
666 | -} | ||
667 | - | ||
668 | -#define NEON_FN(dest, src1, src2) dest = src1 * src2 | ||
669 | -NEON_VOP(mul_u8, neon_u8, 4) | ||
670 | -NEON_VOP(mul_u16, neon_u16, 2) | ||
671 | -#undef NEON_FN | ||
672 | - | ||
673 | -NEON_OP(mul_f32) | ||
674 | -{ | ||
675 | - T0 = vfp_stoi(float32_mul(vfp_itos(T0), vfp_itos(T1), NFS)); | ||
676 | - FORCE_RET(); | ||
677 | -} | ||
678 | - | ||
679 | -NEON_OP(mul_p8) | ||
680 | -{ | ||
681 | - T0 = helper_neon_mul_p8(T0, T1); | ||
682 | -} | ||
683 | - | ||
684 | -#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 | ||
685 | -NEON_VOP(tst_u8, neon_u8, 4) | ||
686 | -NEON_VOP(tst_u16, neon_u16, 2) | ||
687 | -NEON_VOP(tst_u32, neon_u32, 1) | ||
688 | -#undef NEON_FN | ||
689 | - | ||
690 | -#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0 | ||
691 | -NEON_VOP(ceq_u8, neon_u8, 4) | ||
692 | -NEON_VOP(ceq_u16, neon_u16, 2) | ||
693 | -NEON_VOP(ceq_u32, neon_u32, 1) | ||
694 | -#undef NEON_FN | ||
695 | - | ||
696 | -#define NEON_QDMULH16(dest, src1, src2, round) do { \ | ||
697 | - uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ | ||
698 | - if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ | ||
699 | - env->QF = 1; \ | ||
700 | - tmp = (tmp >> 31) ^ ~SIGNBIT; \ | ||
701 | - } \ | ||
702 | - tmp <<= 1; \ | ||
703 | - if (round) { \ | ||
704 | - int32_t old = tmp; \ | ||
705 | - tmp += 1 << 15; \ | ||
706 | - if ((int32_t)tmp < old) { \ | ||
707 | - env->QF = 1; \ | ||
708 | - tmp = SIGNBIT - 1; \ | ||
709 | - } \ | ||
710 | - } \ | ||
711 | - dest = tmp >> 16; \ | ||
712 | - } while(0) | ||
713 | -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) | ||
714 | -NEON_VOP(qdmulh_s16, neon_s16, 2) | ||
715 | -#undef NEON_FN | ||
716 | -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) | ||
717 | -NEON_VOP(qrdmulh_s16, neon_s16, 2) | ||
718 | -#undef NEON_FN | ||
719 | -#undef NEON_QDMULH16 | ||
720 | - | ||
721 | -#define SIGNBIT64 ((uint64_t)1 << 63) | ||
722 | -#define NEON_QDMULH32(dest, src1, src2, round) do { \ | ||
723 | - uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ | ||
724 | - if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ | ||
725 | - env->QF = 1; \ | ||
726 | - tmp = (tmp >> 63) ^ ~SIGNBIT64; \ | ||
727 | - } else { \ | ||
728 | - tmp <<= 1; \ | ||
729 | - } \ | ||
730 | - if (round) { \ | ||
731 | - int64_t old = tmp; \ | ||
732 | - tmp += (int64_t)1 << 31; \ | ||
733 | - if ((int64_t)tmp < old) { \ | ||
734 | - env->QF = 1; \ | ||
735 | - tmp = SIGNBIT64 - 1; \ | ||
736 | - } \ | ||
737 | - } \ | ||
738 | - dest = tmp >> 32; \ | ||
739 | - } while(0) | ||
740 | -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0) | ||
741 | -NEON_VOP(qdmulh_s32, neon_s32, 1) | ||
742 | -#undef NEON_FN | ||
743 | -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1) | ||
744 | -NEON_VOP(qrdmulh_s32, neon_s32, 1) | ||
745 | -#undef NEON_FN | ||
746 | -#undef NEON_QDMULH32 | ||
747 | - | ||
748 | -/* Floating point comparisons produce an integer result. */ | ||
749 | -#define NEON_VOP_FCMP(name, cmp) \ | ||
750 | -NEON_OP(name) \ | ||
751 | -{ \ | ||
752 | - if (float32_compare_quiet(vfp_itos(T0), vfp_itos(T1), NFS) cmp 0) \ | ||
753 | - T0 = -1; \ | ||
754 | - else \ | ||
755 | - T0 = 0; \ | ||
756 | - FORCE_RET(); \ | ||
757 | -} | ||
758 | - | ||
759 | -NEON_VOP_FCMP(ceq_f32, ==) | ||
760 | -NEON_VOP_FCMP(cge_f32, >=) | ||
761 | -NEON_VOP_FCMP(cgt_f32, >) | ||
762 | - | ||
763 | -NEON_OP(acge_f32) | ||
764 | -{ | ||
765 | - float32 f0 = float32_abs(vfp_itos(T0)); | ||
766 | - float32 f1 = float32_abs(vfp_itos(T1)); | ||
767 | - T0 = (float32_compare_quiet(f0, f1,NFS) >= 0) ? -1 : 0; | ||
768 | - FORCE_RET(); | ||
769 | -} | ||
770 | - | ||
771 | -NEON_OP(acgt_f32) | ||
772 | -{ | ||
773 | - float32 f0 = float32_abs(vfp_itos(T0)); | ||
774 | - float32 f1 = float32_abs(vfp_itos(T1)); | ||
775 | - T0 = (float32_compare_quiet(f0, f1, NFS) > 0) ? -1 : 0; | ||
776 | - FORCE_RET(); | ||
777 | -} | ||
778 | - | ||
779 | -/* Narrowing instructions. The named type is the destination type. */ | ||
780 | -NEON_OP(narrow_u8) | ||
781 | -{ | ||
782 | - T0 = (T0 & 0xff) | ((T0 >> 8) & 0xff00) | ||
783 | - | ((T1 << 16) & 0xff0000) | (T1 << 24); | ||
784 | - FORCE_RET(); | ||
785 | -} | ||
786 | - | ||
787 | -NEON_OP(narrow_sat_u8) | ||
788 | -{ | ||
789 | - neon_u16 src; | ||
790 | - neon_u8 dest; | ||
791 | -#define SAT8(d, s) \ | ||
792 | - if (s > 0xff) { \ | ||
793 | - d = 0xff; \ | ||
794 | - env->QF = 1; \ | ||
795 | - } else { \ | ||
796 | - d = s; \ | ||
797 | - } | ||
798 | - | ||
799 | - NEON_UNPACK(neon_u16, src, T0); | ||
800 | - SAT8(dest.v1, src.v1); | ||
801 | - SAT8(dest.v2, src.v2); | ||
802 | - NEON_UNPACK(neon_u16, src, T1); | ||
803 | - SAT8(dest.v3, src.v1); | ||
804 | - SAT8(dest.v4, src.v2); | ||
805 | - NEON_PACK(neon_u8, T0, dest); | ||
806 | - FORCE_RET(); | ||
807 | -#undef SAT8 | ||
808 | -} | ||
809 | - | ||
810 | -NEON_OP(narrow_sat_s8) | ||
811 | -{ | ||
812 | - neon_s16 src; | ||
813 | - neon_s8 dest; | ||
814 | -#define SAT8(d, s) \ | ||
815 | - if (s != (uint8_t)s) { \ | ||
816 | - d = (s >> 15) ^ 0x7f; \ | ||
817 | - env->QF = 1; \ | ||
818 | - } else { \ | ||
819 | - d = s; \ | ||
820 | - } | ||
821 | - | ||
822 | - NEON_UNPACK(neon_s16, src, T0); | ||
823 | - SAT8(dest.v1, src.v1); | ||
824 | - SAT8(dest.v2, src.v2); | ||
825 | - NEON_UNPACK(neon_s16, src, T1); | ||
826 | - SAT8(dest.v3, src.v1); | ||
827 | - SAT8(dest.v4, src.v2); | ||
828 | - NEON_PACK(neon_s8, T0, dest); | ||
829 | - FORCE_RET(); | ||
830 | -#undef SAT8 | ||
831 | -} | ||
832 | - | ||
833 | -NEON_OP(narrow_u16) | ||
834 | -{ | ||
835 | - T0 = (T0 & 0xffff) | (T1 << 16); | ||
836 | -} | ||
837 | - | ||
838 | -NEON_OP(narrow_sat_u16) | ||
839 | -{ | ||
840 | - if (T0 > 0xffff) { | ||
841 | - T0 = 0xffff; | ||
842 | - env->QF = 1; | ||
843 | - } | ||
844 | - if (T1 > 0xffff) { | ||
845 | - T1 = 0xffff; | ||
846 | - env->QF = 1; | ||
847 | - } | ||
848 | - T0 |= T1 << 16; | ||
849 | - FORCE_RET(); | ||
850 | -} | ||
851 | - | ||
852 | -NEON_OP(narrow_sat_s16) | ||
853 | -{ | ||
854 | - if ((int32_t)T0 != (int16_t)T0) { | ||
855 | - T0 = ((int32_t)T0 >> 31) ^ 0x7fff; | ||
856 | - env->QF = 1; | ||
857 | - } | ||
858 | - if ((int32_t)T1 != (int16_t) T1) { | ||
859 | - T1 = ((int32_t)T1 >> 31) ^ 0x7fff; | ||
860 | - env->QF = 1; | ||
861 | - } | ||
862 | - T0 = (uint16_t)T0 | (T1 << 16); | ||
863 | - FORCE_RET(); | ||
864 | -} | ||
865 | - | ||
866 | -NEON_OP(narrow_sat_u32) | ||
867 | -{ | ||
868 | - if (T1) { | ||
869 | - T0 = 0xffffffffu; | ||
870 | - env->QF = 1; | ||
871 | - } | ||
872 | - FORCE_RET(); | ||
873 | -} | ||
874 | - | ||
875 | -NEON_OP(narrow_sat_s32) | ||
876 | -{ | ||
877 | - int32_t sign = (int32_t)T1 >> 31; | ||
878 | - | ||
879 | - if ((int32_t)T1 != sign) { | ||
880 | - T0 = sign ^ 0x7fffffff; | ||
881 | - env->QF = 1; | ||
882 | - } | ||
883 | - FORCE_RET(); | ||
884 | -} | ||
885 | - | ||
886 | -/* Narrowing instructions. Named type is the narrow type. */ | ||
887 | -NEON_OP(narrow_high_u8) | ||
888 | -{ | ||
889 | - T0 = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00) | ||
890 | - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000); | ||
891 | - FORCE_RET(); | ||
892 | -} | ||
893 | - | ||
894 | -NEON_OP(narrow_high_u16) | ||
895 | -{ | ||
896 | - T0 = (T0 >> 16) | (T1 & 0xffff0000); | ||
897 | - FORCE_RET(); | ||
898 | -} | ||
899 | - | ||
900 | -NEON_OP(narrow_high_round_u8) | ||
901 | -{ | ||
902 | - T0 = (((T0 + 0x80) >> 8) & 0xff) | (((T0 + 0x800000) >> 16) & 0xff00) | ||
903 | - | (((T1 + 0x80) << 8) & 0xff0000) | ((T1 + 0x800000) & 0xff000000); | ||
904 | - FORCE_RET(); | ||
905 | -} | ||
906 | - | ||
907 | -NEON_OP(narrow_high_round_u16) | ||
908 | -{ | ||
909 | - T0 = ((T0 + 0x8000) >> 16) | ((T1 + 0x8000) & 0xffff0000); | ||
910 | - FORCE_RET(); | ||
911 | -} | ||
912 | - | ||
913 | -NEON_OP(narrow_high_round_u32) | ||
914 | -{ | ||
915 | - if (T0 >= 0x80000000u) | ||
916 | - T0 = T1 + 1; | ||
917 | - else | ||
918 | - T0 = T1; | ||
919 | - FORCE_RET(); | ||
920 | -} | ||
921 | - | ||
922 | -/* Widening instructions. Named type is source type. */ | ||
923 | -NEON_OP(widen_s8) | ||
924 | -{ | ||
925 | - uint32_t src; | ||
926 | - | ||
927 | - src = T0; | ||
928 | - T0 = (uint16_t)(int8_t)src | ((int8_t)(src >> 8) << 16); | ||
929 | - T1 = (uint16_t)(int8_t)(src >> 16) | ((int8_t)(src >> 24) << 16); | ||
930 | -} | ||
931 | - | ||
932 | -NEON_OP(widen_u8) | ||
933 | -{ | ||
934 | - T1 = ((T0 >> 8) & 0xff0000) | ((T0 >> 16) & 0xff); | ||
935 | - T0 = ((T0 << 8) & 0xff0000) | (T0 & 0xff); | ||
936 | -} | ||
937 | - | ||
938 | -NEON_OP(widen_s16) | ||
939 | -{ | ||
940 | - int32_t src; | ||
941 | - | ||
942 | - src = T0; | ||
943 | - T0 = (int16_t)src; | ||
944 | - T1 = src >> 16; | ||
945 | -} | ||
946 | - | ||
947 | -NEON_OP(widen_u16) | ||
948 | -{ | ||
949 | - T1 = T0 >> 16; | ||
950 | - T0 &= 0xffff; | ||
951 | -} | ||
952 | - | ||
953 | -NEON_OP(widen_s32) | ||
954 | -{ | ||
955 | - T1 = (int32_t)T0 >> 31; | ||
956 | - FORCE_RET(); | ||
957 | -} | ||
958 | - | ||
959 | -NEON_OP(widen_high_u8) | ||
960 | -{ | ||
961 | - T1 = (T0 & 0xff000000) | ((T0 >> 8) & 0xff00); | ||
962 | - T0 = ((T0 << 16) & 0xff000000) | ((T0 << 8) & 0xff00); | ||
963 | -} | ||
964 | - | ||
965 | -NEON_OP(widen_high_u16) | ||
966 | -{ | ||
967 | - T1 = T0 & 0xffff0000; | ||
968 | - T0 <<= 16; | ||
969 | -} | ||
970 | - | ||
971 | -/* Long operations. The type is the wide type. */ | ||
972 | -NEON_OP(shll_u16) | ||
973 | -{ | ||
974 | - int shift = PARAM1; | ||
975 | - uint32_t mask; | ||
976 | - | ||
977 | - mask = 0xffff >> (16 - shift); | ||
978 | - mask |= mask << 16; | ||
979 | - mask = ~mask; | ||
980 | - | ||
981 | - T0 = (T0 << shift) & mask; | ||
982 | - T1 = (T1 << shift) & mask; | ||
983 | - FORCE_RET(); | ||
984 | -} | ||
985 | - | ||
986 | -NEON_OP(shll_u64) | ||
987 | -{ | ||
988 | - int shift = PARAM1; | ||
989 | - | ||
990 | - T1 <<= shift; | ||
991 | - T1 |= T0 >> (32 - shift); | ||
992 | - T0 <<= shift; | ||
993 | - FORCE_RET(); | ||
994 | -} | ||
995 | - | ||
996 | -NEON_OP(addl_u16) | ||
997 | -{ | ||
998 | - uint32_t tmp; | ||
999 | - uint32_t high; | ||
1000 | - | ||
1001 | - tmp = env->vfp.scratch[0]; | ||
1002 | - high = (T0 >> 16) + (tmp >> 16); | ||
1003 | - T0 = (uint16_t)(T0 + tmp); | ||
1004 | - T0 |= (high << 16); | ||
1005 | - tmp = env->vfp.scratch[1]; | ||
1006 | - high = (T1 >> 16) + (tmp >> 16); | ||
1007 | - T1 = (uint16_t)(T1 + tmp); | ||
1008 | - T1 |= (high << 16); | ||
1009 | - FORCE_RET(); | ||
1010 | -} | ||
1011 | - | ||
1012 | -NEON_OP(addl_u32) | ||
1013 | -{ | ||
1014 | - T0 += env->vfp.scratch[0]; | ||
1015 | - T1 += env->vfp.scratch[1]; | ||
1016 | - FORCE_RET(); | ||
1017 | -} | ||
1018 | - | ||
1019 | -NEON_OP(addl_u64) | ||
1020 | -{ | ||
1021 | - uint64_t tmp; | ||
1022 | - tmp = T0 | ((uint64_t)T1 << 32); | ||
1023 | - tmp += env->vfp.scratch[0]; | ||
1024 | - tmp += (uint64_t)env->vfp.scratch[1] << 32; | ||
1025 | - T0 = tmp; | ||
1026 | - T1 = tmp >> 32; | ||
1027 | - FORCE_RET(); | ||
1028 | -} | ||
1029 | - | ||
1030 | -NEON_OP(subl_u16) | ||
1031 | -{ | ||
1032 | - uint32_t tmp; | ||
1033 | - uint32_t high; | ||
1034 | - | ||
1035 | - tmp = env->vfp.scratch[0]; | ||
1036 | - high = (T0 >> 16) - (tmp >> 16); | ||
1037 | - T0 = (uint16_t)(T0 - tmp); | ||
1038 | - T0 |= (high << 16); | ||
1039 | - tmp = env->vfp.scratch[1]; | ||
1040 | - high = (T1 >> 16) - (tmp >> 16); | ||
1041 | - T1 = (uint16_t)(T1 - tmp); | ||
1042 | - T1 |= (high << 16); | ||
1043 | - FORCE_RET(); | ||
1044 | -} | ||
1045 | - | ||
1046 | -NEON_OP(subl_u32) | ||
1047 | -{ | ||
1048 | - T0 -= env->vfp.scratch[0]; | ||
1049 | - T1 -= env->vfp.scratch[1]; | ||
1050 | - FORCE_RET(); | ||
1051 | -} | ||
1052 | - | ||
1053 | -NEON_OP(subl_u64) | ||
1054 | -{ | ||
1055 | - uint64_t tmp; | ||
1056 | - tmp = T0 | ((uint64_t)T1 << 32); | ||
1057 | - tmp -= env->vfp.scratch[0]; | ||
1058 | - tmp -= (uint64_t)env->vfp.scratch[1] << 32; | ||
1059 | - T0 = tmp; | ||
1060 | - T1 = tmp >> 32; | ||
1061 | - FORCE_RET(); | ||
1062 | -} | ||
1063 | - | ||
1064 | -#define DO_ABD(dest, x, y, type) do { \ | ||
1065 | - type tmp_x = x; \ | ||
1066 | - type tmp_y = y; \ | ||
1067 | - dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ | ||
1068 | - } while(0) | ||
1069 | - | ||
1070 | -NEON_OP(abdl_u16) | ||
1071 | -{ | ||
1072 | - uint32_t tmp; | ||
1073 | - uint32_t low; | ||
1074 | - uint32_t high; | ||
1075 | - | ||
1076 | - DO_ABD(low, T0, T1, uint8_t); | ||
1077 | - DO_ABD(tmp, T0 >> 8, T1 >> 8, uint8_t); | ||
1078 | - low |= tmp << 16; | ||
1079 | - DO_ABD(high, T0 >> 16, T1 >> 16, uint8_t); | ||
1080 | - DO_ABD(tmp, T0 >> 24, T1 >> 24, uint8_t); | ||
1081 | - high |= tmp << 16; | ||
1082 | - T0 = low; | ||
1083 | - T1 = high; | ||
1084 | - FORCE_RET(); | ||
1085 | -} | ||
1086 | - | ||
1087 | -NEON_OP(abdl_s16) | ||
1088 | -{ | ||
1089 | - uint32_t tmp; | ||
1090 | - uint32_t low; | ||
1091 | - uint32_t high; | ||
1092 | - | ||
1093 | - DO_ABD(low, T0, T1, int8_t); | ||
1094 | - DO_ABD(tmp, T0 >> 8, T1 >> 8, int8_t); | ||
1095 | - low |= tmp << 16; | ||
1096 | - DO_ABD(high, T0 >> 16, T1 >> 16, int8_t); | ||
1097 | - DO_ABD(tmp, T0 >> 24, T1 >> 24, int8_t); | ||
1098 | - high |= tmp << 16; | ||
1099 | - T0 = low; | ||
1100 | - T1 = high; | ||
1101 | - FORCE_RET(); | ||
1102 | -} | ||
1103 | - | ||
1104 | -NEON_OP(abdl_u32) | ||
1105 | -{ | ||
1106 | - uint32_t low; | ||
1107 | - uint32_t high; | ||
1108 | - | ||
1109 | - DO_ABD(low, T0, T1, uint16_t); | ||
1110 | - DO_ABD(high, T0 >> 16, T1 >> 16, uint16_t); | ||
1111 | - T0 = low; | ||
1112 | - T1 = high; | ||
1113 | - FORCE_RET(); | ||
1114 | -} | ||
1115 | - | ||
1116 | -NEON_OP(abdl_s32) | ||
1117 | -{ | ||
1118 | - uint32_t low; | ||
1119 | - uint32_t high; | ||
1120 | - | ||
1121 | - DO_ABD(low, T0, T1, int16_t); | ||
1122 | - DO_ABD(high, T0 >> 16, T1 >> 16, int16_t); | ||
1123 | - T0 = low; | ||
1124 | - T1 = high; | ||
1125 | - FORCE_RET(); | ||
1126 | -} | ||
1127 | - | ||
1128 | -NEON_OP(abdl_u64) | ||
1129 | -{ | ||
1130 | - DO_ABD(T0, T0, T1, uint32_t); | ||
1131 | - T1 = 0; | ||
1132 | -} | ||
1133 | - | ||
1134 | -NEON_OP(abdl_s64) | ||
1135 | -{ | ||
1136 | - DO_ABD(T0, T0, T1, int32_t); | ||
1137 | - T1 = 0; | ||
1138 | -} | ||
1139 | -#undef DO_ABD | ||
1140 | - | ||
1141 | -/* Widening multiple. Named type is the source type. */ | ||
1142 | -#define DO_MULL(dest, x, y, type1, type2) do { \ | ||
1143 | - type1 tmp_x = x; \ | ||
1144 | - type1 tmp_y = y; \ | ||
1145 | - dest = (type2)((type2)tmp_x * (type2)tmp_y); \ | ||
1146 | - } while(0) | ||
1147 | - | ||
1148 | -NEON_OP(mull_u8) | ||
1149 | -{ | ||
1150 | - uint32_t tmp; | ||
1151 | - uint32_t low; | ||
1152 | - uint32_t high; | ||
1153 | - | ||
1154 | - DO_MULL(low, T0, T1, uint8_t, uint16_t); | ||
1155 | - DO_MULL(tmp, T0 >> 8, T1 >> 8, uint8_t, uint16_t); | ||
1156 | - low |= tmp << 16; | ||
1157 | - DO_MULL(high, T0 >> 16, T1 >> 16, uint8_t, uint16_t); | ||
1158 | - DO_MULL(tmp, T0 >> 24, T1 >> 24, uint8_t, uint16_t); | ||
1159 | - high |= tmp << 16; | ||
1160 | - T0 = low; | ||
1161 | - T1 = high; | ||
1162 | - FORCE_RET(); | ||
1163 | -} | ||
1164 | - | ||
1165 | -NEON_OP(mull_s8) | ||
1166 | -{ | ||
1167 | - uint32_t tmp; | ||
1168 | - uint32_t low; | ||
1169 | - uint32_t high; | ||
1170 | - | ||
1171 | - DO_MULL(low, T0, T1, int8_t, uint16_t); | ||
1172 | - DO_MULL(tmp, T0 >> 8, T1 >> 8, int8_t, uint16_t); | ||
1173 | - low |= tmp << 16; | ||
1174 | - DO_MULL(high, T0 >> 16, T1 >> 16, int8_t, uint16_t); | ||
1175 | - DO_MULL(tmp, T0 >> 24, T1 >> 24, int8_t, uint16_t); | ||
1176 | - high |= tmp << 16; | ||
1177 | - T0 = low; | ||
1178 | - T1 = high; | ||
1179 | - FORCE_RET(); | ||
1180 | -} | ||
1181 | - | ||
1182 | -NEON_OP(mull_u16) | ||
1183 | -{ | ||
1184 | - uint32_t low; | ||
1185 | - uint32_t high; | ||
1186 | - | ||
1187 | - DO_MULL(low, T0, T1, uint16_t, uint32_t); | ||
1188 | - DO_MULL(high, T0 >> 16, T1 >> 16, uint16_t, uint32_t); | ||
1189 | - T0 = low; | ||
1190 | - T1 = high; | ||
1191 | - FORCE_RET(); | ||
1192 | -} | ||
1193 | - | ||
1194 | -NEON_OP(mull_s16) | ||
1195 | -{ | ||
1196 | - uint32_t low; | ||
1197 | - uint32_t high; | ||
1198 | - | ||
1199 | - DO_MULL(low, T0, T1, int16_t, uint32_t); | ||
1200 | - DO_MULL(high, T0 >> 16, T1 >> 16, int16_t, uint32_t); | ||
1201 | - T0 = low; | ||
1202 | - T1 = high; | ||
1203 | - FORCE_RET(); | ||
1204 | -} | ||
1205 | - | ||
1206 | -NEON_OP(addl_saturate_s32) | ||
1207 | -{ | ||
1208 | - uint32_t tmp; | ||
1209 | - uint32_t res; | ||
1210 | - | ||
1211 | - tmp = env->vfp.scratch[0]; | ||
1212 | - res = T0 + tmp; | ||
1213 | - if (((res ^ T0) & SIGNBIT) && !((T0 ^ tmp) & SIGNBIT)) { | ||
1214 | - env->QF = 1; | ||
1215 | - T0 = (T0 >> 31) ^ 0x7fffffff; | ||
1216 | - } else { | ||
1217 | - T0 = res; | ||
1218 | - } | ||
1219 | - tmp = env->vfp.scratch[1]; | ||
1220 | - res = T1 + tmp; | ||
1221 | - if (((res ^ T1) & SIGNBIT) && !((T1 ^ tmp) & SIGNBIT)) { | ||
1222 | - env->QF = 1; | ||
1223 | - T1 = (T1 >> 31) ^ 0x7fffffff; | ||
1224 | - } else { | ||
1225 | - T1 = res; | ||
1226 | - } | ||
1227 | - FORCE_RET(); | ||
1228 | -} | ||
1229 | - | ||
1230 | -NEON_OP(addl_saturate_s64) | ||
1231 | -{ | ||
1232 | - uint64_t src1; | ||
1233 | - uint64_t src2; | ||
1234 | - uint64_t res; | ||
1235 | - | ||
1236 | - src1 = T0 + ((uint64_t)T1 << 32); | ||
1237 | - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32); | ||
1238 | - res = src1 + src2; | ||
1239 | - if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) { | ||
1240 | - env->QF = 1; | ||
1241 | - T0 = ~(int64_t)src1 >> 63; | ||
1242 | - T1 = T0 ^ 0x80000000; | ||
1243 | - } else { | ||
1244 | - T0 = res; | ||
1245 | - T1 = res >> 32; | ||
1246 | - } | ||
1247 | - FORCE_RET(); | ||
1248 | -} | ||
1249 | - | ||
1250 | -NEON_OP(addl_saturate_u64) | ||
1251 | -{ | ||
1252 | - uint64_t src1; | ||
1253 | - uint64_t src2; | ||
1254 | - uint64_t res; | ||
1255 | - | ||
1256 | - src1 = T0 + ((uint64_t)T1 << 32); | ||
1257 | - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32); | ||
1258 | - res = src1 + src2; | ||
1259 | - if (res < src1) { | ||
1260 | - env->QF = 1; | ||
1261 | - T0 = 0xffffffff; | ||
1262 | - T1 = 0xffffffff; | ||
1263 | - } else { | ||
1264 | - T0 = res; | ||
1265 | - T1 = res >> 32; | ||
1266 | - } | ||
1267 | - FORCE_RET(); | ||
1268 | -} | ||
1269 | - | ||
1270 | -NEON_OP(subl_saturate_s64) | ||
1271 | -{ | ||
1272 | - uint64_t src1; | ||
1273 | - uint64_t src2; | ||
1274 | - uint64_t res; | ||
1275 | - | ||
1276 | - src1 = T0 + ((uint64_t)T1 << 32); | ||
1277 | - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32); | ||
1278 | - res = src1 - src2; | ||
1279 | - if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) { | ||
1280 | - env->QF = 1; | ||
1281 | - T0 = ~(int64_t)src1 >> 63; | ||
1282 | - T1 = T0 ^ 0x80000000; | ||
1283 | - } else { | ||
1284 | - T0 = res; | ||
1285 | - T1 = res >> 32; | ||
1286 | - } | ||
1287 | - FORCE_RET(); | ||
1288 | -} | ||
1289 | - | ||
1290 | -NEON_OP(subl_saturate_u64) | ||
1291 | -{ | ||
1292 | - uint64_t src1; | ||
1293 | - uint64_t src2; | ||
1294 | - uint64_t res; | ||
1295 | - | ||
1296 | - src1 = T0 + ((uint64_t)T1 << 32); | ||
1297 | - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32); | ||
1298 | - if (src1 < src2) { | ||
1299 | - env->QF = 1; | ||
1300 | - T0 = 0; | ||
1301 | - T1 = 0; | ||
1302 | - } else { | ||
1303 | - res = src1 - src2; | ||
1304 | - T0 = res; | ||
1305 | - T1 = res >> 32; | ||
1306 | - } | ||
1307 | - FORCE_RET(); | ||
1308 | -} | ||
1309 | - | ||
1310 | -NEON_OP(negl_u16) | ||
1311 | -{ | ||
1312 | - uint32_t tmp; | ||
1313 | - tmp = T0 >> 16; | ||
1314 | - tmp = -tmp; | ||
1315 | - T0 = (-T0 & 0xffff) | (tmp << 16); | ||
1316 | - tmp = T1 >> 16; | ||
1317 | - tmp = -tmp; | ||
1318 | - T1 = (-T1 & 0xffff) | (tmp << 16); | ||
1319 | - FORCE_RET(); | ||
1320 | -} | ||
1321 | - | ||
1322 | -NEON_OP(negl_u32) | ||
1323 | -{ | ||
1324 | - T0 = -T0; | ||
1325 | - T1 = -T1; | ||
1326 | - FORCE_RET(); | ||
1327 | -} | ||
1328 | - | ||
1329 | -NEON_OP(negl_u64) | ||
1330 | -{ | ||
1331 | - uint64_t val; | ||
1332 | - | ||
1333 | - val = T0 | ((uint64_t)T1 << 32); | ||
1334 | - val = -val; | ||
1335 | - T0 = val; | ||
1336 | - T1 = val >> 32; | ||
1337 | - FORCE_RET(); | ||
1338 | -} | ||
1339 | - | ||
1340 | -/* Scalar operations. */ | ||
1341 | -NEON_OP(dup_low16) | ||
1342 | -{ | ||
1343 | - T0 = (T0 & 0xffff) | (T0 << 16); | ||
1344 | - FORCE_RET(); | ||
1345 | -} | ||
1346 | - | ||
1347 | -NEON_OP(dup_high16) | ||
1348 | -{ | ||
1349 | - T0 = (T0 >> 16) | (T0 & 0xffff0000); | ||
1350 | - FORCE_RET(); | ||
1351 | -} | ||
1352 | - | ||
1353 | -/* Helper for VEXT */ | ||
1354 | -NEON_OP(extract) | ||
1355 | -{ | ||
1356 | - int shift = PARAM1; | ||
1357 | - T0 = (T0 >> shift) | (T1 << (32 - shift)); | ||
1358 | - FORCE_RET(); | ||
1359 | -} | ||
1360 | - | ||
1361 | -/* Pairwise add long. Named type is source type. */ | ||
1362 | -NEON_OP(paddl_s8) | ||
1363 | -{ | ||
1364 | - int8_t src1; | ||
1365 | - int8_t src2; | ||
1366 | - uint16_t result; | ||
1367 | - src1 = T0 >> 24; | ||
1368 | - src2 = T0 >> 16; | ||
1369 | - result = (uint16_t)src1 + src2; | ||
1370 | - src1 = T0 >> 8; | ||
1371 | - src2 = T0; | ||
1372 | - T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16); | ||
1373 | - FORCE_RET(); | ||
1374 | -} | ||
1375 | - | ||
1376 | -NEON_OP(paddl_u8) | ||
1377 | -{ | ||
1378 | - uint8_t src1; | ||
1379 | - uint8_t src2; | ||
1380 | - uint16_t result; | ||
1381 | - src1 = T0 >> 24; | ||
1382 | - src2 = T0 >> 16; | ||
1383 | - result = (uint16_t)src1 + src2; | ||
1384 | - src1 = T0 >> 8; | ||
1385 | - src2 = T0; | ||
1386 | - T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16); | ||
1387 | - FORCE_RET(); | ||
1388 | -} | ||
1389 | - | ||
1390 | -NEON_OP(paddl_s16) | ||
1391 | -{ | ||
1392 | - T0 = (uint32_t)(int16_t)T0 + (uint32_t)(int16_t)(T0 >> 16); | ||
1393 | - FORCE_RET(); | ||
1394 | -} | ||
1395 | - | ||
1396 | -NEON_OP(paddl_u16) | ||
1397 | -{ | ||
1398 | - T0 = (uint32_t)(uint16_t)T0 + (uint32_t)(uint16_t)(T0 >> 16); | ||
1399 | - FORCE_RET(); | ||
1400 | -} | ||
1401 | - | ||
1402 | -NEON_OP(paddl_s32) | ||
1403 | -{ | ||
1404 | - int64_t tmp; | ||
1405 | - tmp = (int64_t)(int32_t)T0 + (int64_t)(int32_t)T1; | ||
1406 | - T0 = tmp; | ||
1407 | - T1 = tmp >> 32; | ||
1408 | - FORCE_RET(); | ||
1409 | -} | ||
1410 | - | ||
1411 | -NEON_OP(paddl_u32) | ||
1412 | -{ | ||
1413 | - uint64_t tmp; | ||
1414 | - tmp = (uint64_t)T0 + (uint64_t)T1; | ||
1415 | - T0 = tmp; | ||
1416 | - T1 = tmp >> 32; | ||
1417 | - FORCE_RET(); | ||
1418 | -} | ||
1419 | - | ||
1420 | -/* Count Leading Sign/Zero Bits. */ | ||
1421 | -static inline int do_clz8(uint8_t x) | ||
1422 | -{ | ||
1423 | - int n; | ||
1424 | - for (n = 8; x; n--) | ||
1425 | - x >>= 1; | ||
1426 | - return n; | ||
1427 | -} | ||
1428 | - | ||
1429 | -static inline int do_clz16(uint16_t x) | ||
1430 | -{ | ||
1431 | - int n; | ||
1432 | - for (n = 16; x; n--) | ||
1433 | - x >>= 1; | ||
1434 | - return n; | ||
1435 | -} | ||
1436 | - | ||
1437 | -NEON_OP(clz_u8) | ||
1438 | -{ | ||
1439 | - uint32_t result; | ||
1440 | - uint32_t tmp; | ||
1441 | - | ||
1442 | - tmp = T0; | ||
1443 | - result = do_clz8(tmp); | ||
1444 | - result |= do_clz8(tmp >> 8) << 8; | ||
1445 | - result |= do_clz8(tmp >> 16) << 16; | ||
1446 | - result |= do_clz8(tmp >> 24) << 24; | ||
1447 | - T0 = result; | ||
1448 | - FORCE_RET(); | ||
1449 | -} | ||
1450 | - | ||
1451 | -NEON_OP(clz_u16) | ||
1452 | -{ | ||
1453 | - uint32_t result; | ||
1454 | - uint32_t tmp; | ||
1455 | - tmp = T0; | ||
1456 | - result = do_clz16(tmp); | ||
1457 | - result |= do_clz16(tmp >> 16) << 16; | ||
1458 | - T0 = result; | ||
1459 | - FORCE_RET(); | ||
1460 | -} | ||
1461 | - | ||
1462 | -NEON_OP(cls_s8) | ||
1463 | -{ | ||
1464 | - uint32_t result; | ||
1465 | - int8_t tmp; | ||
1466 | - tmp = T0; | ||
1467 | - result = do_clz8((tmp < 0) ? ~tmp : tmp) - 1; | ||
1468 | - tmp = T0 >> 8; | ||
1469 | - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 8; | ||
1470 | - tmp = T0 >> 16; | ||
1471 | - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 16; | ||
1472 | - tmp = T0 >> 24; | ||
1473 | - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 24; | ||
1474 | - T0 = result; | ||
1475 | - FORCE_RET(); | ||
1476 | -} | ||
1477 | - | ||
1478 | -NEON_OP(cls_s16) | ||
1479 | -{ | ||
1480 | - uint32_t result; | ||
1481 | - int16_t tmp; | ||
1482 | - tmp = T0; | ||
1483 | - result = do_clz16((tmp < 0) ? ~tmp : tmp) - 1; | ||
1484 | - tmp = T0 >> 16; | ||
1485 | - result |= (do_clz16((tmp < 0) ? ~tmp : tmp) - 1) << 16; | ||
1486 | - T0 = result; | ||
1487 | - FORCE_RET(); | ||
1488 | -} | ||
1489 | - | ||
1490 | -NEON_OP(cls_s32) | ||
1491 | -{ | ||
1492 | - int count; | ||
1493 | - if ((int32_t)T0 < 0) | ||
1494 | - T0 = ~T0; | ||
1495 | - for (count = 32; T0 > 0; count--) | ||
1496 | - T0 = T0 >> 1; | ||
1497 | - T0 = count - 1; | ||
1498 | - FORCE_RET(); | ||
1499 | -} | ||
1500 | - | ||
1501 | -/* Bit count. */ | ||
1502 | -NEON_OP(cnt_u8) | ||
1503 | -{ | ||
1504 | - T0 = (T0 & 0x55555555) + ((T0 >> 1) & 0x55555555); | ||
1505 | - T0 = (T0 & 0x33333333) + ((T0 >> 2) & 0x33333333); | ||
1506 | - T0 = (T0 & 0x0f0f0f0f) + ((T0 >> 4) & 0x0f0f0f0f); | ||
1507 | - FORCE_RET(); | ||
1508 | -} | ||
1509 | - | ||
1510 | -/* Saturnating negation. */ | ||
1511 | -/* ??? Make these use NEON_VOP1 */ | ||
1512 | -#define DO_QABS8(x) do { \ | ||
1513 | - if (x == (int8_t)0x80) { \ | ||
1514 | - x = 0x7f; \ | ||
1515 | - env->QF = 1; \ | ||
1516 | - } else if (x < 0) { \ | ||
1517 | - x = -x; \ | ||
1518 | - }} while (0) | ||
1519 | -NEON_OP(qabs_s8) | ||
1520 | -{ | ||
1521 | - neon_s8 vec; | ||
1522 | - NEON_UNPACK(neon_s8, vec, T0); | ||
1523 | - DO_QABS8(vec.v1); | ||
1524 | - DO_QABS8(vec.v2); | ||
1525 | - DO_QABS8(vec.v3); | ||
1526 | - DO_QABS8(vec.v4); | ||
1527 | - NEON_PACK(neon_s8, T0, vec); | ||
1528 | - FORCE_RET(); | ||
1529 | -} | ||
1530 | -#undef DO_QABS8 | ||
1531 | - | ||
1532 | -#define DO_QNEG8(x) do { \ | ||
1533 | - if (x == (int8_t)0x80) { \ | ||
1534 | - x = 0x7f; \ | ||
1535 | - env->QF = 1; \ | ||
1536 | - } else { \ | ||
1537 | - x = -x; \ | ||
1538 | - }} while (0) | ||
1539 | -NEON_OP(qneg_s8) | ||
1540 | -{ | ||
1541 | - neon_s8 vec; | ||
1542 | - NEON_UNPACK(neon_s8, vec, T0); | ||
1543 | - DO_QNEG8(vec.v1); | ||
1544 | - DO_QNEG8(vec.v2); | ||
1545 | - DO_QNEG8(vec.v3); | ||
1546 | - DO_QNEG8(vec.v4); | ||
1547 | - NEON_PACK(neon_s8, T0, vec); | ||
1548 | - FORCE_RET(); | ||
1549 | -} | ||
1550 | -#undef DO_QNEG8 | ||
1551 | - | ||
1552 | -#define DO_QABS16(x) do { \ | ||
1553 | - if (x == (int16_t)0x8000) { \ | ||
1554 | - x = 0x7fff; \ | ||
1555 | - env->QF = 1; \ | ||
1556 | - } else if (x < 0) { \ | ||
1557 | - x = -x; \ | ||
1558 | - }} while (0) | ||
1559 | -NEON_OP(qabs_s16) | ||
1560 | -{ | ||
1561 | - neon_s16 vec; | ||
1562 | - NEON_UNPACK(neon_s16, vec, T0); | ||
1563 | - DO_QABS16(vec.v1); | ||
1564 | - DO_QABS16(vec.v2); | ||
1565 | - NEON_PACK(neon_s16, T0, vec); | ||
1566 | - FORCE_RET(); | ||
1567 | -} | ||
1568 | -#undef DO_QABS16 | ||
1569 | - | ||
1570 | -#define DO_QNEG16(x) do { \ | ||
1571 | - if (x == (int16_t)0x8000) { \ | ||
1572 | - x = 0x7fff; \ | ||
1573 | - env->QF = 1; \ | ||
1574 | - } else { \ | ||
1575 | - x = -x; \ | ||
1576 | - }} while (0) | ||
1577 | -NEON_OP(qneg_s16) | ||
1578 | -{ | ||
1579 | - neon_s16 vec; | ||
1580 | - NEON_UNPACK(neon_s16, vec, T0); | ||
1581 | - DO_QNEG16(vec.v1); | ||
1582 | - DO_QNEG16(vec.v2); | ||
1583 | - NEON_PACK(neon_s16, T0, vec); | ||
1584 | - FORCE_RET(); | ||
1585 | -} | ||
1586 | -#undef DO_QNEG16 | ||
1587 | - | ||
1588 | -NEON_OP(qabs_s32) | ||
1589 | -{ | ||
1590 | - if (T0 == 0x80000000) { | ||
1591 | - T0 = 0x7fffffff; | ||
1592 | - env->QF = 1; | ||
1593 | - } else if ((int32_t)T0 < 0) { | ||
1594 | - T0 = -T0; | ||
1595 | - } | ||
1596 | - FORCE_RET(); | ||
1597 | -} | ||
1598 | - | ||
1599 | -NEON_OP(qneg_s32) | ||
1600 | -{ | ||
1601 | - if (T0 == 0x80000000) { | ||
1602 | - T0 = 0x7fffffff; | ||
1603 | - env->QF = 1; | ||
1604 | - } else { | ||
1605 | - T0 = -T0; | ||
1606 | - } | ||
1607 | - FORCE_RET(); | ||
1608 | -} | ||
1609 | - | ||
1610 | -/* Unary opperations */ | ||
1611 | -#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src | ||
1612 | -NEON_VOP1(abs_s8, neon_s8, 4) | ||
1613 | -NEON_VOP1(abs_s16, neon_s16, 2) | ||
1614 | -NEON_OP(abs_s32) | ||
1615 | -{ | ||
1616 | - if ((int32_t)T0 < 0) | ||
1617 | - T0 = -T0; | ||
1618 | - FORCE_RET(); | ||
1619 | -} | ||
1620 | -#undef NEON_FN | ||
1621 | - | ||
1622 | -/* Transpose. Argument order is rather strange to avoid special casing | ||
1623 | - the tranlation code. | ||
1624 | - On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */ | ||
1625 | -NEON_OP(trn_u8) | ||
1626 | -{ | ||
1627 | - uint32_t rd; | ||
1628 | - uint32_t rm; | ||
1629 | - rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff); | ||
1630 | - rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00); | ||
1631 | - T0 = rd; | ||
1632 | - T1 = rm; | ||
1633 | - FORCE_RET(); | ||
1634 | -} | ||
1635 | - | ||
1636 | -NEON_OP(trn_u16) | ||
1637 | -{ | ||
1638 | - uint32_t rd; | ||
1639 | - uint32_t rm; | ||
1640 | - rd = (T0 << 16) | (T1 & 0xffff); | ||
1641 | - rm = (T1 >> 16) | (T0 & 0xffff0000); | ||
1642 | - T0 = rd; | ||
1643 | - T1 = rm; | ||
1644 | - FORCE_RET(); | ||
1645 | -} | ||
1646 | - | ||
1647 | -/* Worker routines for zip and unzip. */ | ||
1648 | -NEON_OP(unzip_u8) | ||
1649 | -{ | ||
1650 | - uint32_t rd; | ||
1651 | - uint32_t rm; | ||
1652 | - rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00) | ||
1653 | - | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000); | ||
1654 | - rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00) | ||
1655 | - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000); | ||
1656 | - T0 = rd; | ||
1657 | - T1 = rm; | ||
1658 | - FORCE_RET(); | ||
1659 | -} | ||
1660 | - | ||
1661 | -NEON_OP(zip_u8) | ||
1662 | -{ | ||
1663 | - uint32_t rd; | ||
1664 | - uint32_t rm; | ||
1665 | - rd = (T0 & 0xff) | ((T1 << 8) & 0xff00) | ||
1666 | - | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000); | ||
1667 | - rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00) | ||
1668 | - | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000); | ||
1669 | - T0 = rd; | ||
1670 | - T1 = rm; | ||
1671 | - FORCE_RET(); | ||
1672 | -} | ||
1673 | - | ||
1674 | -NEON_OP(zip_u16) | ||
1675 | -{ | ||
1676 | - uint32_t tmp; | ||
1677 | - | ||
1678 | - tmp = (T0 & 0xffff) | (T1 << 16); | ||
1679 | - T1 = (T1 & 0xffff0000) | (T0 >> 16); | ||
1680 | - T0 = tmp; | ||
1681 | - FORCE_RET(); | ||
1682 | -} | ||
1683 | - | ||
1684 | -NEON_OP(dup_u8) | ||
1685 | -{ | ||
1686 | - T0 = (T0 >> PARAM1) & 0xff; | ||
1687 | - T0 |= T0 << 8; | ||
1688 | - T0 |= T0 << 16; | ||
1689 | - FORCE_RET(); | ||
1690 | -} |