Commit ad69471ce5e1284e1cacd053bb0fe8d6175a2f9e

Authored by pbrook
1 parent 8f8e3aa4

ARM TCG conversion 14/16.

git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@4151 c046a42c-6fe2-441c-8c8c-71466251a162

Only 7 of the 8 changed files are displayed; the full diff is too large to show.

Makefile.target
... ... @@ -211,7 +211,7 @@ LIBOBJS+= op_helper.o helper.o
211 211 endif
212 212  
213 213 ifeq ($(TARGET_BASE_ARCH), arm)
214   -LIBOBJS+= op_helper.o helper.o
  214 +LIBOBJS+= op_helper.o helper.o neon_helper.o
215 215 endif
216 216  
217 217 ifeq ($(TARGET_BASE_ARCH), sh4)
... ...
target-arm/helper.c
... ... @@ -256,30 +256,6 @@ void cpu_arm_close(CPUARMState *env)
256 256 free(env);
257 257 }
258 258  
259   -/* Polynomial multiplication is like integer multiplcation except the
260   - partial products are XORed, not added. */
261   -uint32_t helper_neon_mul_p8(uint32_t op1, uint32_t op2)
262   -{
263   - uint32_t mask;
264   - uint32_t result;
265   - result = 0;
266   - while (op1) {
267   - mask = 0;
268   - if (op1 & 1)
269   - mask |= 0xff;
270   - if (op1 & (1 << 8))
271   - mask |= (0xff << 8);
272   - if (op1 & (1 << 16))
273   - mask |= (0xff << 16);
274   - if (op1 & (1 << 24))
275   - mask |= (0xff << 24);
276   - result ^= op2 & mask;
277   - op1 = (op1 >> 1) & 0x7f7f7f7f;
278   - op2 = (op2 << 1) & 0xfefefefe;
279   - }
280   - return result;
281   -}
282   -
283 259 uint32_t cpsr_read(CPUARMState *env)
284 260 {
285 261 int ZF;
... ... @@ -376,6 +352,11 @@ uint32_t HELPER(rbit)(uint32_t x)
376 352 return x;
377 353 }
378 354  
  355 +uint32_t HELPER(abs)(uint32_t x)
  356 +{
  357 + return ((int32_t)x < 0) ? -x : x;
  358 +}
  359 +
379 360 #if defined(CONFIG_USER_ONLY)
380 361  
381 362 void do_interrupt (CPUState *env)
... ...
target-arm/helpers.h
... ... @@ -84,6 +84,7 @@ DEF_HELPER_1_1(double_saturate, uint32_t, (int32_t))
84 84 DEF_HELPER_1_2(sdiv, int32_t, (int32_t, int32_t))
85 85 DEF_HELPER_1_2(udiv, uint32_t, (uint32_t, uint32_t))
86 86 DEF_HELPER_1_1(rbit, uint32_t, (uint32_t))
  87 +DEF_HELPER_1_1(abs, uint32_t, (uint32_t))
87 88  
88 89 #define PAS_OP(pfx) \
89 90 DEF_HELPER_1_3(pfx ## add8, uint32_t, (uint32_t, uint32_t, uint32_t *)) \
... ... @@ -208,6 +209,10 @@ DEF_HELPER_1_2(rsqrte_f32, float32, (float32, CPUState *))
208 209 DEF_HELPER_1_2(recpe_u32, uint32_t, (uint32_t, CPUState *))
209 210 DEF_HELPER_1_2(rsqrte_u32, uint32_t, (uint32_t, CPUState *))
210 211 DEF_HELPER_1_4(neon_tbl, uint32_t, (uint32_t, uint32_t, uint32_t, uint32_t))
  212 +DEF_HELPER_1_2(neon_add_saturate_u64, uint64_t, (uint64_t, uint64_t))
  213 +DEF_HELPER_1_2(neon_add_saturate_s64, uint64_t, (uint64_t, uint64_t))
  214 +DEF_HELPER_1_2(neon_sub_saturate_u64, uint64_t, (uint64_t, uint64_t))
  215 +DEF_HELPER_1_2(neon_sub_saturate_s64, uint64_t, (uint64_t, uint64_t))
211 216  
212 217 DEF_HELPER_1_2(add_cc, uint32_t, (uint32_t, uint32_t))
213 218 DEF_HELPER_1_2(adc_cc, uint32_t, (uint32_t, uint32_t))
... ... @@ -223,6 +228,209 @@ DEF_HELPER_1_2(shr_cc, uint32_t, (uint32_t, uint32_t))
223 228 DEF_HELPER_1_2(sar_cc, uint32_t, (uint32_t, uint32_t))
224 229 DEF_HELPER_1_2(ror_cc, uint32_t, (uint32_t, uint32_t))
225 230  
  231 +/* neon_helper.c */
  232 +DEF_HELPER_1_3(neon_qadd_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  233 +DEF_HELPER_1_3(neon_qadd_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  234 +DEF_HELPER_1_3(neon_qadd_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  235 +DEF_HELPER_1_3(neon_qadd_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  236 +DEF_HELPER_1_3(neon_qsub_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  237 +DEF_HELPER_1_3(neon_qsub_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  238 +DEF_HELPER_1_3(neon_qsub_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  239 +DEF_HELPER_1_3(neon_qsub_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  240 +
  241 +DEF_HELPER_1_2(neon_hadd_s8, uint32_t, (uint32_t, uint32_t))
  242 +DEF_HELPER_1_2(neon_hadd_u8, uint32_t, (uint32_t, uint32_t))
  243 +DEF_HELPER_1_2(neon_hadd_s16, uint32_t, (uint32_t, uint32_t))
  244 +DEF_HELPER_1_2(neon_hadd_u16, uint32_t, (uint32_t, uint32_t))
  245 +DEF_HELPER_1_2(neon_hadd_s32, int32_t, (int32_t, int32_t))
  246 +DEF_HELPER_1_2(neon_hadd_u32, uint32_t, (uint32_t, uint32_t))
  247 +DEF_HELPER_1_2(neon_rhadd_s8, uint32_t, (uint32_t, uint32_t))
  248 +DEF_HELPER_1_2(neon_rhadd_u8, uint32_t, (uint32_t, uint32_t))
  249 +DEF_HELPER_1_2(neon_rhadd_s16, uint32_t, (uint32_t, uint32_t))
  250 +DEF_HELPER_1_2(neon_rhadd_u16, uint32_t, (uint32_t, uint32_t))
  251 +DEF_HELPER_1_2(neon_rhadd_s32, int32_t, (int32_t, int32_t))
  252 +DEF_HELPER_1_2(neon_rhadd_u32, uint32_t, (uint32_t, uint32_t))
  253 +DEF_HELPER_1_2(neon_hsub_s8, uint32_t, (uint32_t, uint32_t))
  254 +DEF_HELPER_1_2(neon_hsub_u8, uint32_t, (uint32_t, uint32_t))
  255 +DEF_HELPER_1_2(neon_hsub_s16, uint32_t, (uint32_t, uint32_t))
  256 +DEF_HELPER_1_2(neon_hsub_u16, uint32_t, (uint32_t, uint32_t))
  257 +DEF_HELPER_1_2(neon_hsub_s32, int32_t, (int32_t, int32_t))
  258 +DEF_HELPER_1_2(neon_hsub_u32, uint32_t, (uint32_t, uint32_t))
  259 +
  260 +DEF_HELPER_1_2(neon_cgt_u8, uint32_t, (uint32_t, uint32_t))
  261 +DEF_HELPER_1_2(neon_cgt_s8, uint32_t, (uint32_t, uint32_t))
  262 +DEF_HELPER_1_2(neon_cgt_u16, uint32_t, (uint32_t, uint32_t))
  263 +DEF_HELPER_1_2(neon_cgt_s16, uint32_t, (uint32_t, uint32_t))
  264 +DEF_HELPER_1_2(neon_cgt_u32, uint32_t, (uint32_t, uint32_t))
  265 +DEF_HELPER_1_2(neon_cgt_s32, uint32_t, (uint32_t, uint32_t))
  266 +DEF_HELPER_1_2(neon_cge_u8, uint32_t, (uint32_t, uint32_t))
  267 +DEF_HELPER_1_2(neon_cge_s8, uint32_t, (uint32_t, uint32_t))
  268 +DEF_HELPER_1_2(neon_cge_u16, uint32_t, (uint32_t, uint32_t))
  269 +DEF_HELPER_1_2(neon_cge_s16, uint32_t, (uint32_t, uint32_t))
  270 +DEF_HELPER_1_2(neon_cge_u32, uint32_t, (uint32_t, uint32_t))
  271 +DEF_HELPER_1_2(neon_cge_s32, uint32_t, (uint32_t, uint32_t))
  272 +
  273 +DEF_HELPER_1_2(neon_min_u8, uint32_t, (uint32_t, uint32_t))
  274 +DEF_HELPER_1_2(neon_min_s8, uint32_t, (uint32_t, uint32_t))
  275 +DEF_HELPER_1_2(neon_min_u16, uint32_t, (uint32_t, uint32_t))
  276 +DEF_HELPER_1_2(neon_min_s16, uint32_t, (uint32_t, uint32_t))
  277 +DEF_HELPER_1_2(neon_min_u32, uint32_t, (uint32_t, uint32_t))
  278 +DEF_HELPER_1_2(neon_min_s32, uint32_t, (uint32_t, uint32_t))
  279 +DEF_HELPER_1_2(neon_max_u8, uint32_t, (uint32_t, uint32_t))
  280 +DEF_HELPER_1_2(neon_max_s8, uint32_t, (uint32_t, uint32_t))
  281 +DEF_HELPER_1_2(neon_max_u16, uint32_t, (uint32_t, uint32_t))
  282 +DEF_HELPER_1_2(neon_max_s16, uint32_t, (uint32_t, uint32_t))
  283 +DEF_HELPER_1_2(neon_max_u32, uint32_t, (uint32_t, uint32_t))
  284 +DEF_HELPER_1_2(neon_max_s32, uint32_t, (uint32_t, uint32_t))
  285 +DEF_HELPER_1_2(neon_pmin_u8, uint32_t, (uint32_t, uint32_t))
  286 +DEF_HELPER_1_2(neon_pmin_s8, uint32_t, (uint32_t, uint32_t))
  287 +DEF_HELPER_1_2(neon_pmin_u16, uint32_t, (uint32_t, uint32_t))
  288 +DEF_HELPER_1_2(neon_pmin_s16, uint32_t, (uint32_t, uint32_t))
  289 +DEF_HELPER_1_2(neon_pmin_u32, uint32_t, (uint32_t, uint32_t))
  290 +DEF_HELPER_1_2(neon_pmin_s32, uint32_t, (uint32_t, uint32_t))
  291 +DEF_HELPER_1_2(neon_pmax_u8, uint32_t, (uint32_t, uint32_t))
  292 +DEF_HELPER_1_2(neon_pmax_s8, uint32_t, (uint32_t, uint32_t))
  293 +DEF_HELPER_1_2(neon_pmax_u16, uint32_t, (uint32_t, uint32_t))
  294 +DEF_HELPER_1_2(neon_pmax_s16, uint32_t, (uint32_t, uint32_t))
  295 +DEF_HELPER_1_2(neon_pmax_u32, uint32_t, (uint32_t, uint32_t))
  296 +DEF_HELPER_1_2(neon_pmax_s32, uint32_t, (uint32_t, uint32_t))
  297 +
  298 +DEF_HELPER_1_2(neon_abd_u8, uint32_t, (uint32_t, uint32_t))
  299 +DEF_HELPER_1_2(neon_abd_s8, uint32_t, (uint32_t, uint32_t))
  300 +DEF_HELPER_1_2(neon_abd_u16, uint32_t, (uint32_t, uint32_t))
  301 +DEF_HELPER_1_2(neon_abd_s16, uint32_t, (uint32_t, uint32_t))
  302 +DEF_HELPER_1_2(neon_abd_u32, uint32_t, (uint32_t, uint32_t))
  303 +DEF_HELPER_1_2(neon_abd_s32, uint32_t, (uint32_t, uint32_t))
  304 +
  305 +DEF_HELPER_1_2(neon_shl_u8, uint32_t, (uint32_t, uint32_t))
  306 +DEF_HELPER_1_2(neon_shl_s8, uint32_t, (uint32_t, uint32_t))
  307 +DEF_HELPER_1_2(neon_shl_u16, uint32_t, (uint32_t, uint32_t))
  308 +DEF_HELPER_1_2(neon_shl_s16, uint32_t, (uint32_t, uint32_t))
  309 +DEF_HELPER_1_2(neon_shl_u32, uint32_t, (uint32_t, uint32_t))
  310 +DEF_HELPER_1_2(neon_shl_s32, uint32_t, (uint32_t, uint32_t))
  311 +DEF_HELPER_1_2(neon_shl_u64, uint64_t, (uint64_t, uint64_t))
  312 +DEF_HELPER_1_2(neon_shl_s64, uint64_t, (uint64_t, uint64_t))
  313 +DEF_HELPER_1_2(neon_rshl_u8, uint32_t, (uint32_t, uint32_t))
  314 +DEF_HELPER_1_2(neon_rshl_s8, uint32_t, (uint32_t, uint32_t))
  315 +DEF_HELPER_1_2(neon_rshl_u16, uint32_t, (uint32_t, uint32_t))
  316 +DEF_HELPER_1_2(neon_rshl_s16, uint32_t, (uint32_t, uint32_t))
  317 +DEF_HELPER_1_2(neon_rshl_u32, uint32_t, (uint32_t, uint32_t))
  318 +DEF_HELPER_1_2(neon_rshl_s32, uint32_t, (uint32_t, uint32_t))
  319 +DEF_HELPER_1_2(neon_rshl_u64, uint64_t, (uint64_t, uint64_t))
  320 +DEF_HELPER_1_2(neon_rshl_s64, uint64_t, (uint64_t, uint64_t))
  321 +DEF_HELPER_1_3(neon_qshl_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  322 +DEF_HELPER_1_3(neon_qshl_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  323 +DEF_HELPER_1_3(neon_qshl_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  324 +DEF_HELPER_1_3(neon_qshl_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  325 +DEF_HELPER_1_3(neon_qshl_u32, uint32_t, (CPUState *, uint32_t, uint32_t))
  326 +DEF_HELPER_1_3(neon_qshl_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  327 +DEF_HELPER_1_3(neon_qshl_u64, uint64_t, (CPUState *, uint64_t, uint64_t))
  328 +DEF_HELPER_1_3(neon_qshl_s64, uint64_t, (CPUState *, uint64_t, uint64_t))
  329 +DEF_HELPER_1_3(neon_qrshl_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  330 +DEF_HELPER_1_3(neon_qrshl_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  331 +DEF_HELPER_1_3(neon_qrshl_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  332 +DEF_HELPER_1_3(neon_qrshl_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  333 +DEF_HELPER_1_3(neon_qrshl_u32, uint32_t, (CPUState *, uint32_t, uint32_t))
  334 +DEF_HELPER_1_3(neon_qrshl_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  335 +DEF_HELPER_1_3(neon_qrshl_u64, uint64_t, (CPUState *, uint64_t, uint64_t))
  336 +DEF_HELPER_1_3(neon_qrshl_s64, uint64_t, (CPUState *, uint64_t, uint64_t))
  337 +
  338 +DEF_HELPER_1_2(neon_add_u8, uint32_t, (uint32_t, uint32_t))
  339 +DEF_HELPER_1_2(neon_add_u16, uint32_t, (uint32_t, uint32_t))
  340 +DEF_HELPER_1_2(neon_padd_u8, uint32_t, (uint32_t, uint32_t))
  341 +DEF_HELPER_1_2(neon_padd_u16, uint32_t, (uint32_t, uint32_t))
  342 +DEF_HELPER_1_2(neon_sub_u8, uint32_t, (uint32_t, uint32_t))
  343 +DEF_HELPER_1_2(neon_sub_u16, uint32_t, (uint32_t, uint32_t))
  344 +DEF_HELPER_1_2(neon_mul_u8, uint32_t, (uint32_t, uint32_t))
  345 +DEF_HELPER_1_2(neon_mul_u16, uint32_t, (uint32_t, uint32_t))
  346 +DEF_HELPER_1_2(neon_mul_p8, uint32_t, (uint32_t, uint32_t))
  347 +
  348 +DEF_HELPER_1_2(neon_tst_u8, uint32_t, (uint32_t, uint32_t))
  349 +DEF_HELPER_1_2(neon_tst_u16, uint32_t, (uint32_t, uint32_t))
  350 +DEF_HELPER_1_2(neon_tst_u32, uint32_t, (uint32_t, uint32_t))
  351 +DEF_HELPER_1_2(neon_ceq_u8, uint32_t, (uint32_t, uint32_t))
  352 +DEF_HELPER_1_2(neon_ceq_u16, uint32_t, (uint32_t, uint32_t))
  353 +DEF_HELPER_1_2(neon_ceq_u32, uint32_t, (uint32_t, uint32_t))
  354 +
  355 +DEF_HELPER_1_1(neon_abs_s8, uint32_t, (uint32_t))
  356 +DEF_HELPER_1_1(neon_abs_s16, uint32_t, (uint32_t))
  357 +DEF_HELPER_1_1(neon_clz_u8, uint32_t, (uint32_t))
  358 +DEF_HELPER_1_1(neon_clz_u16, uint32_t, (uint32_t))
  359 +DEF_HELPER_1_1(neon_cls_s8, uint32_t, (uint32_t))
  360 +DEF_HELPER_1_1(neon_cls_s16, uint32_t, (uint32_t))
  361 +DEF_HELPER_1_1(neon_cls_s32, uint32_t, (uint32_t))
  362 +DEF_HELPER_1_1(neon_cnt_u8, uint32_t, (uint32_t))
  363 +
  364 +DEF_HELPER_1_3(neon_qdmulh_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  365 +DEF_HELPER_1_3(neon_qrdmulh_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  366 +DEF_HELPER_1_3(neon_qdmulh_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  367 +DEF_HELPER_1_3(neon_qrdmulh_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  368 +
  369 +DEF_HELPER_1_1(neon_narrow_u8, uint32_t, (uint64_t))
  370 +DEF_HELPER_1_1(neon_narrow_u16, uint32_t, (uint64_t))
  371 +DEF_HELPER_1_2(neon_narrow_sat_u8, uint32_t, (CPUState *, uint64_t))
  372 +DEF_HELPER_1_2(neon_narrow_sat_s8, uint32_t, (CPUState *, uint64_t))
  373 +DEF_HELPER_1_2(neon_narrow_sat_u16, uint32_t, (CPUState *, uint64_t))
  374 +DEF_HELPER_1_2(neon_narrow_sat_s16, uint32_t, (CPUState *, uint64_t))
  375 +DEF_HELPER_1_2(neon_narrow_sat_u32, uint32_t, (CPUState *, uint64_t))
  376 +DEF_HELPER_1_2(neon_narrow_sat_s32, uint32_t, (CPUState *, uint64_t))
  377 +DEF_HELPER_1_1(neon_narrow_high_u8, uint32_t, (uint64_t))
  378 +DEF_HELPER_1_1(neon_narrow_high_u16, uint32_t, (uint64_t))
  379 +DEF_HELPER_1_1(neon_narrow_round_high_u8, uint32_t, (uint64_t))
  380 +DEF_HELPER_1_1(neon_narrow_round_high_u16, uint32_t, (uint64_t))
  381 +DEF_HELPER_1_1(neon_widen_u8, uint64_t, (uint32_t))
  382 +DEF_HELPER_1_1(neon_widen_s8, uint64_t, (uint32_t))
  383 +DEF_HELPER_1_1(neon_widen_u16, uint64_t, (uint32_t))
  384 +DEF_HELPER_1_1(neon_widen_s16, uint64_t, (uint32_t))
  385 +
  386 +DEF_HELPER_1_2(neon_addl_u16, uint64_t, (uint64_t, uint64_t))
  387 +DEF_HELPER_1_2(neon_addl_u32, uint64_t, (uint64_t, uint64_t))
  388 +DEF_HELPER_1_2(neon_paddl_u16, uint64_t, (uint64_t, uint64_t))
  389 +DEF_HELPER_1_2(neon_paddl_u32, uint64_t, (uint64_t, uint64_t))
  390 +DEF_HELPER_1_2(neon_subl_u16, uint64_t, (uint64_t, uint64_t))
  391 +DEF_HELPER_1_2(neon_subl_u32, uint64_t, (uint64_t, uint64_t))
  392 +DEF_HELPER_1_3(neon_addl_saturate_s32, uint64_t, (CPUState *, uint64_t, uint64_t))
  393 +DEF_HELPER_1_3(neon_addl_saturate_s64, uint64_t, (CPUState *, uint64_t, uint64_t))
  394 +DEF_HELPER_1_2(neon_abdl_u16, uint64_t, (uint32_t, uint32_t))
  395 +DEF_HELPER_1_2(neon_abdl_s16, uint64_t, (uint32_t, uint32_t))
  396 +DEF_HELPER_1_2(neon_abdl_u32, uint64_t, (uint32_t, uint32_t))
  397 +DEF_HELPER_1_2(neon_abdl_s32, uint64_t, (uint32_t, uint32_t))
  398 +DEF_HELPER_1_2(neon_abdl_u64, uint64_t, (uint32_t, uint32_t))
  399 +DEF_HELPER_1_2(neon_abdl_s64, uint64_t, (uint32_t, uint32_t))
  400 +DEF_HELPER_1_2(neon_mull_u8, uint64_t, (uint32_t, uint32_t))
  401 +DEF_HELPER_1_2(neon_mull_s8, uint64_t, (uint32_t, uint32_t))
  402 +DEF_HELPER_1_2(neon_mull_u16, uint64_t, (uint32_t, uint32_t))
  403 +DEF_HELPER_1_2(neon_mull_s16, uint64_t, (uint32_t, uint32_t))
  404 +
  405 +DEF_HELPER_1_1(neon_negl_u16, uint64_t, (uint64_t))
  406 +DEF_HELPER_1_1(neon_negl_u32, uint64_t, (uint64_t))
  407 +DEF_HELPER_1_1(neon_negl_u64, uint64_t, (uint64_t))
  408 +
  409 +DEF_HELPER_1_2(neon_qabs_s8, uint32_t, (CPUState *, uint32_t))
  410 +DEF_HELPER_1_2(neon_qabs_s16, uint32_t, (CPUState *, uint32_t))
  411 +DEF_HELPER_1_2(neon_qabs_s32, uint32_t, (CPUState *, uint32_t))
  412 +DEF_HELPER_1_2(neon_qneg_s8, uint32_t, (CPUState *, uint32_t))
  413 +DEF_HELPER_1_2(neon_qneg_s16, uint32_t, (CPUState *, uint32_t))
  414 +DEF_HELPER_1_2(neon_qneg_s32, uint32_t, (CPUState *, uint32_t))
  415 +
  416 +DEF_HELPER_0_0(neon_trn_u8, void, (void))
  417 +DEF_HELPER_0_0(neon_trn_u16, void, (void))
  418 +DEF_HELPER_0_0(neon_unzip_u8, void, (void))
  419 +DEF_HELPER_0_0(neon_zip_u8, void, (void))
  420 +DEF_HELPER_0_0(neon_zip_u16, void, (void))
  421 +
  422 +DEF_HELPER_1_2(neon_min_f32, uint32_t, (uint32_t, uint32_t))
  423 +DEF_HELPER_1_2(neon_max_f32, uint32_t, (uint32_t, uint32_t))
  424 +DEF_HELPER_1_2(neon_abd_f32, uint32_t, (uint32_t, uint32_t))
  425 +DEF_HELPER_1_2(neon_add_f32, uint32_t, (uint32_t, uint32_t))
  426 +DEF_HELPER_1_2(neon_sub_f32, uint32_t, (uint32_t, uint32_t))
  427 +DEF_HELPER_1_2(neon_mul_f32, uint32_t, (uint32_t, uint32_t))
  428 +DEF_HELPER_1_2(neon_ceq_f32, uint32_t, (uint32_t, uint32_t))
  429 +DEF_HELPER_1_2(neon_cge_f32, uint32_t, (uint32_t, uint32_t))
  430 +DEF_HELPER_1_2(neon_cgt_f32, uint32_t, (uint32_t, uint32_t))
  431 +DEF_HELPER_1_2(neon_acge_f32, uint32_t, (uint32_t, uint32_t))
  432 +DEF_HELPER_1_2(neon_acgt_f32, uint32_t, (uint32_t, uint32_t))
  433 +
226 434 #undef DEF_HELPER
227 435 #undef DEF_HELPER_0_0
228 436 #undef DEF_HELPER_0_1
... ...
target-arm/neon_helper.c 0 → 100644
  1 +#include <stdlib.h>
  2 +#include <stdio.h>
  3 +
  4 +#include "cpu.h"
  5 +#include "exec-all.h"
  6 +#include "helpers.h"
  7 +
  8 +#define SIGNBIT (uint32_t)0x80000000
  9 +#define SIGNBIT64 ((uint64_t)1 << 63)
  10 +
  11 +#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] = CPSR_Q
  12 +
  13 +static float_status neon_float_status;
  14 +#define NFS &neon_float_status
  15 +
  16 +/* Helper routines to perform bitwise copies between float and int. */
  17 +static inline float32 vfp_itos(uint32_t i)
  18 +{
  19 + union {
  20 + uint32_t i;
  21 + float32 s;
  22 + } v;
  23 +
  24 + v.i = i;
  25 + return v.s;
  26 +}
  27 +
  28 +static inline uint32_t vfp_stoi(float32 s)
  29 +{
  30 + union {
  31 + uint32_t i;
  32 + float32 s;
  33 + } v;
  34 +
  35 + v.s = s;
  36 + return v.i;
  37 +}
  38 +
  39 +#define NEON_TYPE1(name, type) \
  40 +typedef struct \
  41 +{ \
  42 + type v1; \
  43 +} neon_##name;
  44 +#ifdef WORDS_BIGENDIAN
  45 +#define NEON_TYPE2(name, type) \
  46 +typedef struct \
  47 +{ \
  48 + type v2; \
  49 + type v1; \
  50 +} neon_##name;
  51 +#define NEON_TYPE4(name, type) \
  52 +typedef struct \
  53 +{ \
  54 + type v4; \
  55 + type v3; \
  56 + type v2; \
  57 + type v1; \
  58 +} neon_##name;
  59 +#else
  60 +#define NEON_TYPE2(name, type) \
  61 +typedef struct \
  62 +{ \
  63 + type v1; \
  64 + type v2; \
  65 +} neon_##name;
  66 +#define NEON_TYPE4(name, type) \
  67 +typedef struct \
  68 +{ \
  69 + type v1; \
  70 + type v2; \
  71 + type v3; \
  72 + type v4; \
  73 +} neon_##name;
  74 +#endif
  75 +
  76 +NEON_TYPE4(s8, int8_t)
  77 +NEON_TYPE4(u8, uint8_t)
  78 +NEON_TYPE2(s16, int16_t)
  79 +NEON_TYPE2(u16, uint16_t)
  80 +NEON_TYPE1(s32, int32_t)
  81 +NEON_TYPE1(u32, uint32_t)
  82 +#undef NEON_TYPE4
  83 +#undef NEON_TYPE2
  84 +#undef NEON_TYPE1
  85 +
  86 +/* Copy from a uint32_t to a vector structure type. */
  87 +#define NEON_UNPACK(vtype, dest, val) do { \
  88 + union { \
  89 + vtype v; \
  90 + uint32_t i; \
  91 + } conv_u; \
  92 + conv_u.i = (val); \
  93 + dest = conv_u.v; \
  94 + } while(0)
  95 +
  96 +/* Copy from a vector structure type to a uint32_t. */
  97 +#define NEON_PACK(vtype, dest, val) do { \
  98 + union { \
  99 + vtype v; \
  100 + uint32_t i; \
  101 + } conv_u; \
  102 + conv_u.v = (val); \
  103 + dest = conv_u.i; \
  104 + } while(0)
  105 +
  106 +#define NEON_DO1 \
  107 + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
  108 +#define NEON_DO2 \
  109 + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
  110 + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
  111 +#define NEON_DO4 \
  112 + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
  113 + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
  114 + NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
  115 + NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
  116 +
  117 +#define NEON_VOP_BODY(vtype, n) \
  118 +{ \
  119 + uint32_t res; \
  120 + vtype vsrc1; \
  121 + vtype vsrc2; \
  122 + vtype vdest; \
  123 + NEON_UNPACK(vtype, vsrc1, arg1); \
  124 + NEON_UNPACK(vtype, vsrc2, arg2); \
  125 + NEON_DO##n; \
  126 + NEON_PACK(vtype, res, vdest); \
  127 + return res; \
  128 +}
  129 +
  130 +#define NEON_VOP(name, vtype, n) \
  131 +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
  132 +NEON_VOP_BODY(vtype, n)
  133 +
  134 +#define NEON_VOP_ENV(name, vtype, n) \
  135 +uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \
  136 +NEON_VOP_BODY(vtype, n)
  137 +
  138 +/* Pairwise operations. */
  139 +/* For 32-bit elements each segment only contains a single element, so
  140 + the elementwise and pairwise operations are the same. */
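      +/* For example, NEON_PDO2 pairs lanes within each operand:
      + dest.v1 = op(src1.v1, src1.v2) and dest.v2 = op(src2.v1, src2.v2). */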
  141 +#define NEON_PDO2 \
  142 + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
  143 + NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
  144 +#define NEON_PDO4 \
  145 + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
  146 + NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
  147 + NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
  148 + NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
  149 +
  150 +#define NEON_POP(name, vtype, n) \
  151 +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
  152 +{ \
  153 + uint32_t res; \
  154 + vtype vsrc1; \
  155 + vtype vsrc2; \
  156 + vtype vdest; \
  157 + NEON_UNPACK(vtype, vsrc1, arg1); \
  158 + NEON_UNPACK(vtype, vsrc2, arg2); \
  159 + NEON_PDO##n; \
  160 + NEON_PACK(vtype, res, vdest); \
  161 + return res; \
  162 +}
  163 +
  164 +/* Unary operators. */
  165 +#define NEON_VOP1(name, vtype, n) \
  166 +uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
  167 +{ \
  168 + vtype vsrc1; \
  169 + vtype vdest; \
  170 + NEON_UNPACK(vtype, vsrc1, arg); \
  171 + NEON_DO##n; \
  172 + NEON_PACK(vtype, arg, vdest); \
  173 + return arg; \
  174 +}
  175 +
  176 +
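      +/* The saturation check relies on truncation: the result overflows the
      + narrow element type iff casting it to that type and back changes it. */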
  177 +#define NEON_USAT(dest, src1, src2, type) do { \
  178 + uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
  179 + if (tmp != (type)tmp) { \
  180 + SET_QC(); \
  181 + dest = ~0; \
  182 + } else { \
  183 + dest = tmp; \
  184 + }} while(0)
  185 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
  186 +NEON_VOP_ENV(qadd_u8, neon_u8, 4)
  187 +#undef NEON_FN
  188 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
  189 +NEON_VOP_ENV(qadd_u16, neon_u16, 2)
  190 +#undef NEON_FN
  191 +#undef NEON_USAT
  192 +
  193 +#define NEON_SSAT(dest, src1, src2, type) do { \
  194 + int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
  195 + if (tmp != (type)tmp) { \
  196 + SET_QC(); \
  197 + if (src2 > 0) { \
  198 + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
  199 + } else { \
  200 + tmp = 1 << (sizeof(type) * 8 - 1); \
  201 + } \
  202 + } \
  203 + dest = tmp; \
  204 + } while(0)
  205 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
  206 +NEON_VOP_ENV(qadd_s8, neon_s8, 4)
  207 +#undef NEON_FN
  208 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
  209 +NEON_VOP_ENV(qadd_s16, neon_s16, 2)
  210 +#undef NEON_FN
  211 +#undef NEON_SSAT
  212 +
  213 +#define NEON_USAT(dest, src1, src2, type) do { \
  214 + uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
  215 + if (tmp != (type)tmp) { \
  216 + SET_QC(); \
  217 + dest = 0; \
  218 + } else { \
  219 + dest = tmp; \
  220 + }} while(0)
  221 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
  222 +NEON_VOP_ENV(qsub_u8, neon_u8, 4)
  223 +#undef NEON_FN
  224 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
  225 +NEON_VOP_ENV(qsub_u16, neon_u16, 2)
  226 +#undef NEON_FN
  227 +#undef NEON_USAT
  228 +
  229 +#define NEON_SSAT(dest, src1, src2, type) do { \
  230 + int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
  231 + if (tmp != (type)tmp) { \
  232 + SET_QC(); \
  233 + if (src2 < 0) { \
  234 + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
  235 + } else { \
  236 + tmp = 1 << (sizeof(type) * 8 - 1); \
  237 + } \
  238 + } \
  239 + dest = tmp; \
  240 + } while(0)
  241 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
  242 +NEON_VOP_ENV(qsub_s8, neon_s8, 4)
  243 +#undef NEON_FN
  244 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
  245 +NEON_VOP_ENV(qsub_s16, neon_s16, 2)
  246 +#undef NEON_FN
  247 +#undef NEON_SSAT
  248 +
  249 +#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
  250 +NEON_VOP(hadd_s8, neon_s8, 4)
  251 +NEON_VOP(hadd_u8, neon_u8, 4)
  252 +NEON_VOP(hadd_s16, neon_s16, 2)
  253 +NEON_VOP(hadd_u16, neon_u16, 2)
  254 +#undef NEON_FN
  255 +
  256 +int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
  257 +{
  258 + int32_t dest;
  259 +
  260 + dest = (src1 >> 1) + (src2 >> 1);
  261 + if (src1 & src2 & 1)
  262 + dest++;
  263 + return dest;
  264 +}
  265 +
  266 +uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
  267 +{
  268 + uint32_t dest;
  269 +
  270 + dest = (src1 >> 1) + (src2 >> 1);
  271 + if (src1 & src2 & 1)
  272 + dest++;
  273 + return dest;
  274 +}
  275 +
  276 +#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
  277 +NEON_VOP(rhadd_s8, neon_s8, 4)
  278 +NEON_VOP(rhadd_u8, neon_u8, 4)
  279 +NEON_VOP(rhadd_s16, neon_s16, 2)
  280 +NEON_VOP(rhadd_u16, neon_u16, 2)
  281 +#undef NEON_FN
  282 +
  283 +int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
  284 +{
  285 + int32_t dest;
  286 +
  287 + dest = (src1 >> 1) + (src2 >> 1);
  288 + if ((src1 | src2) & 1)
  289 + dest++;
  290 + return dest;
  291 +}
  292 +
  293 +uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
  294 +{
  295 + uint32_t dest;
  296 +
  297 + dest = (src1 >> 1) + (src2 >> 1);
  298 + if ((src1 | src2) & 1)
  299 + dest++;
  300 + return dest;
  301 +}
  302 +
  303 +#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
  304 +NEON_VOP(hsub_s8, neon_s8, 4)
  305 +NEON_VOP(hsub_u8, neon_u8, 4)
  306 +NEON_VOP(hsub_s16, neon_s16, 2)
  307 +NEON_VOP(hsub_u16, neon_u16, 2)
  308 +#undef NEON_FN
  309 +
  310 +int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
  311 +{
  312 + int32_t dest;
  313 +
  314 + dest = (src1 >> 1) - (src2 >> 1);
  315 + if ((~src1) & src2 & 1)
  316 + dest--;
  317 + return dest;
  318 +}
  319 +
  320 +uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
  321 +{
  322 + uint32_t dest;
  323 +
  324 + dest = (src1 >> 1) - (src2 >> 1);
  325 + if ((~src1) & src2 & 1)
  326 + dest--;
  327 + return dest;
  328 +}
  329 +
  330 +#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
  331 +NEON_VOP(cgt_s8, neon_s8, 4)
  332 +NEON_VOP(cgt_u8, neon_u8, 4)
  333 +NEON_VOP(cgt_s16, neon_s16, 2)
  334 +NEON_VOP(cgt_u16, neon_u16, 2)
  335 +NEON_VOP(cgt_s32, neon_s32, 1)
  336 +NEON_VOP(cgt_u32, neon_u32, 1)
  337 +#undef NEON_FN
  338 +
  339 +#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
  340 +NEON_VOP(cge_s8, neon_s8, 4)
  341 +NEON_VOP(cge_u8, neon_u8, 4)
  342 +NEON_VOP(cge_s16, neon_s16, 2)
  343 +NEON_VOP(cge_u16, neon_u16, 2)
  344 +NEON_VOP(cge_s32, neon_s32, 1)
  345 +NEON_VOP(cge_u32, neon_u32, 1)
  346 +#undef NEON_FN
  347 +
  348 +#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
  349 +NEON_VOP(min_s8, neon_s8, 4)
  350 +NEON_VOP(min_u8, neon_u8, 4)
  351 +NEON_VOP(min_s16, neon_s16, 2)
  352 +NEON_VOP(min_u16, neon_u16, 2)
  353 +NEON_VOP(min_s32, neon_s32, 1)
  354 +NEON_VOP(min_u32, neon_u32, 1)
  355 +NEON_POP(pmin_s8, neon_s8, 4)
  356 +NEON_POP(pmin_u8, neon_u8, 4)
  357 +NEON_POP(pmin_s16, neon_s16, 2)
  358 +NEON_POP(pmin_u16, neon_u16, 2)
  359 +#undef NEON_FN
  360 +
  361 +#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
  362 +NEON_VOP(max_s8, neon_s8, 4)
  363 +NEON_VOP(max_u8, neon_u8, 4)
  364 +NEON_VOP(max_s16, neon_s16, 2)
  365 +NEON_VOP(max_u16, neon_u16, 2)
  366 +NEON_VOP(max_s32, neon_s32, 1)
  367 +NEON_VOP(max_u32, neon_u32, 1)
  368 +NEON_POP(pmax_s8, neon_s8, 4)
  369 +NEON_POP(pmax_u8, neon_u8, 4)
  370 +NEON_POP(pmax_s16, neon_s16, 2)
  371 +NEON_POP(pmax_u16, neon_u16, 2)
  372 +#undef NEON_FN
  373 +
  374 +#define NEON_FN(dest, src1, src2) \
  375 + dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
  376 +NEON_VOP(abd_s8, neon_s8, 4)
  377 +NEON_VOP(abd_u8, neon_u8, 4)
  378 +NEON_VOP(abd_s16, neon_s16, 2)
  379 +NEON_VOP(abd_u16, neon_u16, 2)
  380 +NEON_VOP(abd_s32, neon_s32, 1)
  381 +NEON_VOP(abd_u32, neon_u32, 1)
  382 +#undef NEON_FN
  383 +
  384 +#define NEON_FN(dest, src1, src2) do { \
  385 + int8_t tmp; \
  386 + tmp = (int8_t)src2; \
  387 + if (tmp >= sizeof(src1) * 8 || tmp <= -sizeof(src1) * 8) { \
  388 + dest = 0; \
  389 + } else if (tmp < 0) { \
  390 + dest = src1 >> -tmp; \
  391 + } else { \
  392 + dest = src1 << tmp; \
  393 + }} while (0)
  394 +NEON_VOP(shl_u8, neon_u8, 4)
  395 +NEON_VOP(shl_u16, neon_u16, 2)
  396 +NEON_VOP(shl_u32, neon_u32, 1)
  397 +#undef NEON_FN
  398 +
  399 +uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
  400 +{
  401 + int8_t shift = (int8_t)shiftop;
  402 + if (shift >= 64 || shift <= -64) {
  403 + val = 0;
  404 + } else if (shift < 0) {
  405 + val >>= -shift;
  406 + } else {
  407 + val <<= shift;
  408 + }
  409 + return val;
  410 +}
  411 +
  412 +#define NEON_FN(dest, src1, src2) do { \
  413 + int8_t tmp; \
  414 + tmp = (int8_t)src2; \
  415 + if (tmp >= sizeof(src1) * 8) { \
  416 + dest = 0; \
  417 + } else if (tmp <= -sizeof(src1) * 8) { \
  418 + dest = src1 >> (sizeof(src1) * 8 - 1); \
  419 + } else if (tmp < 0) { \
  420 + dest = src1 >> -tmp; \
  421 + } else { \
  422 + dest = src1 << tmp; \
  423 + }} while (0)
  424 +NEON_VOP(shl_s8, neon_s8, 4)
  425 +NEON_VOP(shl_s16, neon_s16, 2)
  426 +NEON_VOP(shl_s32, neon_s32, 1)
  427 +#undef NEON_FN
  428 +
  429 +uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
  430 +{
  431 + int8_t shift = (int8_t)shiftop;
  432 + int64_t val = valop;
  433 + if (shift >= 64) {
  434 + val = 0;
  435 + } else if (shift <= -64) {
  436 + val >>= 63;
  437 + } else if (shift < 0) {
  438 + val >>= -shift;
  439 + } else {
  440 + val <<= shift;
  441 + }
  442 + return val;
  443 +}
  444 +
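      +/* Rounding shifts add half of the least significant surviving bit,
      + i.e. 1 << (-shift - 1), before shifting right. */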
  445 +#define NEON_FN(dest, src1, src2) do { \
  446 + int8_t tmp; \
  447 + tmp = (int8_t)src2; \
  448 + if (tmp >= sizeof(src1) * 8) { \
  449 + dest = 0; \
  450 + } else if (tmp < -sizeof(src1) * 8) { \
  451 + dest = src1 >> (sizeof(src1) * 8 - 1); \
  452 + } else if (tmp == -sizeof(src1) * 8) { \
  453 + dest = src1 >> (sizeof(src1) * 8 - 1); \
  454 + dest++; \
  455 + dest >>= 1; \
  456 + } else if (tmp < 0) { \
  457 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  458 + } else { \
  459 + dest = src1 << tmp; \
  460 + }} while (0)
  461 +NEON_VOP(rshl_s8, neon_s8, 4)
  462 +NEON_VOP(rshl_s16, neon_s16, 2)
  463 +NEON_VOP(rshl_s32, neon_s32, 1)
  464 +#undef NEON_FN
  465 +
  466 +uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
  467 +{
  468 + int8_t shift = (int8_t)shiftop;
  469 + int64_t val = valop;
  470 + if (shift >= 64) {
  471 + val = 0;
  472 + } else if (shift < -64) {
  473 + val >>= 63;
  474 + } else if (shift == -64) {
  475 + val >>= 63;
  476 + val++;
  477 + val >>= 1;
  478 + } else if (shift < 0) {
  479 + val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
  480 + } else {
  481 + val <<= shift;
  482 + }
  483 + return val;
  484 +}
  485 +
  486 +#define NEON_FN(dest, src1, src2) do { \
  487 + int8_t tmp; \
  488 + tmp = (int8_t)src2; \
  489 + if (tmp >= sizeof(src1) * 8 || tmp < -sizeof(src1) * 8) { \
  490 + dest = 0; \
  491 + } else if (tmp == -sizeof(src1) * 8) { \
  492 + dest = src1 >> (sizeof(src1) * 8 - 1); \
  493 + } else if (tmp < 0) { \
  494 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  495 + } else { \
  496 + dest = src1 << tmp; \
  497 + }} while (0)
  498 +NEON_VOP(rshl_u8, neon_u8, 4)
  499 +NEON_VOP(rshl_u16, neon_u16, 2)
  500 +NEON_VOP(rshl_u32, neon_u32, 1)
  501 +#undef NEON_FN
  502 +
  503 +uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
  504 +{
  505 + int8_t shift = (uint8_t)shiftop;
  506 + if (shift >= 64 || shift < -64) {
  507 + val = 0;
  508 + } else if (shift == -64) {
  509 + /* Rounding a 1-bit result just preserves that bit. */
  510 + val >>= 63;
  511 + } else if (shift < 0) {
  512 + val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
  514 + } else {
  515 + val <<= shift;
  516 + }
  517 + return val;
  518 +}
  519 +
  520 +#define NEON_FN(dest, src1, src2) do { \
  521 + int8_t tmp; \
  522 + tmp = (int8_t)src2; \
  523 + if (tmp >= sizeof(src1) * 8) { \
  524 + if (src1) { \
  525 + SET_QC(); \
  526 + dest = ~0; \
  527 + } else { \
  528 + dest = 0; \
  529 + } \
  530 + } else if (tmp <= -sizeof(src1) * 8) { \
  531 + dest = 0; \
  532 + } else if (tmp < 0) { \
  533 + dest = src1 >> -tmp; \
  534 + } else { \
  535 + dest = src1 << tmp; \
  536 + if ((dest >> tmp) != src1) { \
  537 + SET_QC(); \
  538 + dest = ~0; \
  539 + } \
  540 + }} while (0)
  541 +NEON_VOP_ENV(qshl_u8, neon_u8, 4)
  542 +NEON_VOP_ENV(qshl_u16, neon_u16, 2)
  543 +NEON_VOP_ENV(qshl_u32, neon_u32, 1)
  544 +#undef NEON_FN
  545 +
  546 +uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
  547 +{
  548 + int8_t shift = (int8_t)shiftop;
  549 + if (shift >= 64) {
  550 + if (val) {
  551 + val = ~(uint64_t)0;
  552 + SET_QC();
  553 + } else {
  554 + val = 0;
  555 + }
  556 + } else if (shift <= -64) {
  557 + val = 0;
  558 + } else if (shift < 0) {
  559 + val >>= -shift;
  560 + } else {
  561 + uint64_t tmp = val;
  562 + val <<= shift;
  563 + if ((val >> shift) != tmp) {
  564 + SET_QC();
  565 + val = ~(uint64_t)0;
  566 + }
  567 + }
  568 + return val;
  569 +}
  570 +
  571 +#define NEON_FN(dest, src1, src2) do { \
  572 + int8_t tmp; \
  573 + tmp = (int8_t)src2; \
  574 + if (tmp >= sizeof(src1) * 8) { \
  575 + if (src1) \
  576 + SET_QC(); \
  577 + dest = src1 >> 31; \
  578 + } else if (tmp <= -sizeof(src1) * 8) { \
  579 + dest = src1 >> 31; \
  580 + } else if (tmp < 0) { \
  581 + dest = src1 >> -tmp; \
  582 + } else { \
  583 + dest = src1 << tmp; \
  584 + if ((dest >> tmp) != src1) { \
  585 + SET_QC(); \
  586 + dest = src1 >> 31; \
  587 + } \
  588 + }} while (0)
  589 +NEON_VOP_ENV(qshl_s8, neon_s8, 4)
  590 +NEON_VOP_ENV(qshl_s16, neon_s16, 2)
  591 +NEON_VOP_ENV(qshl_s32, neon_s32, 1)
  592 +#undef NEON_FN
  593 +
  594 +uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
  595 +{
  596 + int8_t shift = (uint8_t)shiftop;
  597 + int64_t val = valop;
  598 + if (shift >= 64) {
  599 + if (val) {
  600 + SET_QC();
  601 + val = (val >> 63) ^ ~SIGNBIT64;
  602 + }
  603 + } else if (shift <= -64) {
  604 + val >>= 63;
  605 + } else if (shift < 0) {
  606 + val >>= -shift;
  607 + } else {
  608 + int64_t tmp = val;
  609 + val <<= shift;
  610 + if ((val >> shift) != tmp) {
  611 + SET_QC();
  612 + val = (tmp >> 63) ^ ~SIGNBIT64;
  613 + }
  614 + }
  615 + return val;
  616 +}
  617 +
  618 +
  619 +/* FIXME: This is wrong. */
  620 +#define NEON_FN(dest, src1, src2) do { \
  621 + int8_t tmp; \
  622 + tmp = (int8_t)src2; \
  623 + if (tmp < 0) { \
  624 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  625 + } else { \
  626 + dest = src1 << tmp; \
  627 + if ((dest >> tmp) != src1) { \
  628 + SET_QC(); \
  629 + dest = ~0; \
  630 + } \
  631 + }} while (0)
  632 +NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
  633 +NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
  634 +NEON_VOP_ENV(qrshl_u32, neon_u32, 1)
  635 +#undef NEON_FN
  636 +
  637 +uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
  638 +{
  639 + int8_t shift = (int8_t)shiftop;
  640 + if (shift < 0) {
  641 + val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
  642 + } else {
  643 + uint64_t tmp = val;
  644 + val <<= shift;
  645 + if ((val >> shift) != tmp) {
  646 + SET_QC();
  647 + val = ~0;
  648 + }
  649 + }
  650 + return val;
  651 +}
  652 +
  653 +#define NEON_FN(dest, src1, src2) do { \
  654 + int8_t tmp; \
  655 + tmp = (int8_t)src2; \
  656 + if (tmp < 0) { \
  657 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  658 + } else { \
  659 + dest = src1 << tmp; \
  660 + if ((dest >> tmp) != src1) { \
  661 + SET_QC(); \
  662 + dest = src1 >> 31; \
  663 + } \
  664 + }} while (0)
  665 +NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
  666 +NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
  667 +NEON_VOP_ENV(qrshl_s32, neon_s32, 1)
  668 +#undef NEON_FN
  669 +
  670 +uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
  671 +{
  672 + int8_t shift = (uint8_t)shiftop;
  673 + int64_t val = valop;
  674 +
  675 + if (shift < 0) {
  676 + val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
  677 + } else {
  678 + int64_t tmp = val;
  679 + val <<= shift;
  680 + if ((val >> shift) != tmp) {
  681 + SET_QC();
  682 + val = (tmp >> 63) ^ ~SIGNBIT64;
  683 + }
  684 + }
  685 + return val;
  686 +}
  687 +
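      +/* Lane-wise add without carries crossing lane boundaries: clear each
      + lane's top bit, add, then fold the top bits back in with XOR. */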
  688 +uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
  689 +{
  690 + uint32_t mask;
  691 + mask = (a ^ b) & 0x80808080u;
  692 + a &= ~0x80808080u;
  693 + b &= ~0x80808080u;
  694 + return (a + b) ^ mask;
  695 +}
  696 +
  697 +uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
  698 +{
  699 + uint32_t mask;
  700 + mask = (a ^ b) & 0x80008000u;
  701 + a &= ~0x80008000u;
  702 + b &= ~0x80008000u;
  703 + return (a + b) ^ mask;
  704 +}
  705 +
  706 +#define NEON_FN(dest, src1, src2) dest = src1 + src2
  707 +NEON_POP(padd_u8, neon_u8, 4)
  708 +NEON_POP(padd_u16, neon_u16, 2)
  709 +#undef NEON_FN
  710 +
  711 +#define NEON_FN(dest, src1, src2) dest = src1 - src2
  712 +NEON_VOP(sub_u8, neon_u8, 4)
  713 +NEON_VOP(sub_u16, neon_u16, 2)
  714 +#undef NEON_FN
  715 +
  716 +#define NEON_FN(dest, src1, src2) dest = src1 * src2
  717 +NEON_VOP(mul_u8, neon_u8, 4)
  718 +NEON_VOP(mul_u16, neon_u16, 2)
  719 +#undef NEON_FN
  720 +
  721 +/* Polynomial multiplication is like integer multiplication except the
  722 + partial products are XORed, not added. */
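      +/* For example, 0x03 * 0x03 = (x + 1)(x + 1) = x^2 + 1 = 0x05 over GF(2),
      + because the two middle terms cancel under XOR. */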
  723 +uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
  724 +{
  725 + uint32_t mask;
  726 + uint32_t result;
  727 + result = 0;
  728 + while (op1) {
  729 + mask = 0;
  730 + if (op1 & 1)
  731 + mask |= 0xff;
  732 + if (op1 & (1 << 8))
  733 + mask |= (0xff << 8);
  734 + if (op1 & (1 << 16))
  735 + mask |= (0xff << 16);
  736 + if (op1 & (1 << 24))
  737 + mask |= (0xff << 24);
  738 + result ^= op2 & mask;
  739 + op1 = (op1 >> 1) & 0x7f7f7f7f;
  740 + op2 = (op2 << 1) & 0xfefefefe;
  741 + }
  742 + return result;
  743 +}
  744 +
  745 +#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
  746 +NEON_VOP(tst_u8, neon_u8, 4)
  747 +NEON_VOP(tst_u16, neon_u16, 2)
  748 +NEON_VOP(tst_u32, neon_u32, 1)
  749 +#undef NEON_FN
  750 +
  751 +#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
  752 +NEON_VOP(ceq_u8, neon_u8, 4)
  753 +NEON_VOP(ceq_u16, neon_u16, 2)
  754 +NEON_VOP(ceq_u32, neon_u32, 1)
  755 +#undef NEON_FN
  756 +
  757 +#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
  758 +NEON_VOP1(abs_s8, neon_s8, 4)
  759 +NEON_VOP1(abs_s16, neon_s16, 2)
  760 +#undef NEON_FN
  761 +
  762 +/* Count Leading Sign/Zero Bits. */
  763 +static inline int do_clz8(uint8_t x)
  764 +{
  765 + int n;
  766 + for (n = 8; x; n--)
  767 + x >>= 1;
  768 + return n;
  769 +}
  770 +
  771 +static inline int do_clz16(uint16_t x)
  772 +{
  773 + int n;
  774 + for (n = 16; x; n--)
  775 + x >>= 1;
  776 + return n;
  777 +}
  778 +
  779 +#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
  780 +NEON_VOP1(clz_u8, neon_u8, 4)
  781 +#undef NEON_FN
  782 +
  783 +#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
  784 +NEON_VOP1(clz_u16, neon_u16, 2)
  785 +#undef NEON_FN
  786 +
  787 +#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
  788 +NEON_VOP1(cls_s8, neon_s8, 4)
  789 +#undef NEON_FN
  790 +
  791 +#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
  792 +NEON_VOP1(cls_s16, neon_s16, 2)
  793 +#undef NEON_FN
  794 +
  795 +uint32_t HELPER(neon_cls_s32)(uint32_t x)
  796 +{
  797 + int count;
  798 + if ((int32_t)x < 0)
  799 + x = ~x;
  800 + for (count = 32; x; count--)
  801 + x = x >> 1;
  802 + return count - 1;
  803 +}
  804 +
  805 +/* Bit count. */
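      +/* Parallel (SWAR) population count, accumulated per byte. */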
  806 +uint32_t HELPER(neon_cnt_u8)(uint32_t x)
  807 +{
  808 + x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
  809 + x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
  810 + x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
  811 + return x;
  812 +}
  813 +
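      +/* (tmp ^ (tmp << 1)) & SIGNBIT is set exactly when the top two bits of
      + tmp differ, i.e. when doubling the product would change its sign;
      + for 16-bit inputs that only happens for 0x8000 * 0x8000. */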
  814 +#define NEON_QDMULH16(dest, src1, src2, round) do { \
  815 + uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
  816 + if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
  817 + SET_QC(); \
  818 + tmp = (tmp >> 31) ^ ~SIGNBIT; \
  819 + } \
  820 + tmp <<= 1; \
  821 + if (round) { \
  822 + int32_t old = tmp; \
  823 + tmp += 1 << 15; \
  824 + if ((int32_t)tmp < old) { \
  825 + SET_QC(); \
  826 + tmp = SIGNBIT - 1; \
  827 + } \
  828 + } \
  829 + dest = tmp >> 16; \
  830 + } while(0)
  831 +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
  832 +NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
  833 +#undef NEON_FN
  834 +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
  835 +NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
  836 +#undef NEON_FN
  837 +#undef NEON_QDMULH16
  838 +
  839 +#define NEON_QDMULH32(dest, src1, src2, round) do { \
  840 + uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
  841 + if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
  842 + SET_QC(); \
  843 + tmp = (tmp >> 63) ^ ~SIGNBIT64; \
  844 + } else { \
  845 + tmp <<= 1; \
  846 + } \
  847 + if (round) { \
  848 + int64_t old = tmp; \
  849 + tmp += (int64_t)1 << 31; \
  850 + if ((int64_t)tmp < old) { \
  851 + SET_QC(); \
  852 + tmp = SIGNBIT64 - 1; \
  853 + } \
  854 + } \
  855 + dest = tmp >> 32; \
  856 + } while(0)
  857 +#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
  858 +NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
  859 +#undef NEON_FN
  860 +#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
  861 +NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
  862 +#undef NEON_FN
  863 +#undef NEON_QDMULH32
  864 +
  865 +uint32_t HELPER(neon_narrow_u8)(uint64_t x)
  866 +{
  867 + return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
  868 + | ((x >> 24) & 0xff000000u);
  869 +}
  870 +
  871 +uint32_t HELPER(neon_narrow_u16)(uint64_t x)
  872 +{
  873 + return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
  874 +}
  875 +
  876 +uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
  877 +{
  878 + return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
  879 + | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
  880 +}
  881 +
  882 +uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
  883 +{
  884 + return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
  885 +}
  886 +
  887 +uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
  888 +{
  889 + x &= 0xff80ff80ff80ff80ull;
  890 + x += 0x0080008000800080ull;
  891 + return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
  892 + | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
  893 +}
  894 +
  895 +uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
  896 +{
  897 + x &= 0xffff8000ffff8000ull;
  898 + x += 0x0000800000008000ull;
  899 + return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
  900 +}
  901 +
  902 +uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
  903 +{
  904 + uint16_t s;
  905 + uint8_t d;
  906 + uint32_t res = 0;
  907 +#define SAT8(n) \
  908 + s = x >> n; \
  909 + if (s > 0xff) { \
  910 + d = 0xff; \
  911 + SET_QC(); \
  912 + } else { \
  913 + d = s; \
  914 + } \
  915 + res |= (uint32_t)d << (n / 2);
  916 +
  917 + SAT8(0);
  918 + SAT8(16);
  919 + SAT8(32);
  920 + SAT8(48);
  921 +#undef SAT8
  922 + return res;
  923 +}
  924 +
  925 +uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
  926 +{
  927 + int16_t s;
  928 + uint8_t d;
  929 + uint32_t res = 0;
  930 +#define SAT8(n) \
  931 + s = x >> n; \
  932 + if (s != (int8_t)s) { \
  933 + d = (s >> 15) ^ 0x7f; \
  934 + SET_QC(); \
  935 + } else { \
  936 + d = s; \
  937 + } \
  938 + res |= (uint32_t)d << (n / 2);
  939 +
  940 + SAT8(0);
  941 + SAT8(16);
  942 + SAT8(32);
  943 + SAT8(48);
  944 +#undef SAT8
  945 + return res;
  946 +}
  947 +
  948 +uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
  949 +{
  950 + uint32_t high;
  951 + uint32_t low;
  952 + low = x;
  953 + if (low > 0xffff) {
  954 + low = 0xffff;
  955 + SET_QC();
  956 + }
  957 + high = x >> 32;
  958 + if (high > 0xffff) {
  959 + high = 0xffff;
  960 + SET_QC();
  961 + }
  962 + return low | (high << 16);
  963 +}
  964 +
  965 +uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
  966 +{
  967 + int32_t low;
  968 + int32_t high;
  969 + low = x;
  970 + if (low != (int16_t)low) {
  971 + low = (low >> 31) ^ 0x7fff;
  972 + SET_QC();
  973 + }
  974 + high = x >> 32;
  975 + if (high != (int16_t)high) {
  976 + high = (high >> 31) ^ 0x7fff;
  977 + SET_QC();
  978 + }
  979 + return (uint16_t)low | (high << 16);
  980 +}
  981 +
  982 +uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
  983 +{
  984 + if (x > 0xffffffffu) {
  985 + SET_QC();
  986 + return 0xffffffffu;
  987 + }
  988 + return x;
  989 +}
  990 +
  991 +uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x)
  992 +{
  993 + if ((int64_t)x != (int32_t)x) {
  994 + SET_QC();
  995 + return (x >> 63) ^ 0x7fffffff;
  996 + }
  997 + return x;
  998 +}
  999 +
  1000 +uint64_t HELPER(neon_widen_u8)(uint32_t x)
  1001 +{
  1002 + uint64_t tmp;
  1003 + uint64_t ret;
  1004 + ret = (uint8_t)x;
  1005 + tmp = (uint8_t)(x >> 8);
  1006 + ret |= tmp << 16;
  1007 + tmp = (uint8_t)(x >> 16);
  1008 + ret |= tmp << 32;
  1009 + tmp = (uint8_t)(x >> 24);
  1010 + ret |= tmp << 48;
  1011 + return ret;
  1012 +}
  1013 +
  1014 +uint64_t HELPER(neon_widen_s8)(uint32_t x)
  1015 +{
  1016 + uint64_t tmp;
  1017 + uint64_t ret;
  1018 + ret = (uint16_t)(int8_t)x;
  1019 + tmp = (uint16_t)(int8_t)(x >> 8);
  1020 + ret |= tmp << 16;
  1021 + tmp = (uint16_t)(int8_t)(x >> 16);
  1022 + ret |= tmp << 32;
  1023 + tmp = (uint16_t)(int8_t)(x >> 24);
  1024 + ret |= tmp << 48;
  1025 + return ret;
  1026 +}
  1027 +
  1028 +uint64_t HELPER(neon_widen_u16)(uint32_t x)
  1029 +{
  1030 + uint64_t high = (uint16_t)(x >> 16);
  1031 + return ((uint16_t)x) | (high << 32);
  1032 +}
  1033 +
  1034 +uint64_t HELPER(neon_widen_s16)(uint32_t x)
  1035 +{
  1036 + uint64_t high = (int16_t)(x >> 16);
  1037 + return ((uint32_t)(int16_t)x) | (high << 32);
  1038 +}
  1039 +
  1040 +uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
  1041 +{
  1042 + uint64_t mask;
  1043 + mask = (a ^ b) & 0x8000800080008000ull;
  1044 + a &= ~0x8000800080008000ull;
  1045 + b &= ~0x8000800080008000ull;
  1046 + return (a + b) ^ mask;
  1047 +}
  1048 +
  1049 +uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
  1050 +{
  1051 + uint64_t mask;
  1052 + mask = (a ^ b) & 0x8000000080000000ull;
  1053 + a &= ~0x8000000080000000ull;
  1054 + b &= ~0x8000000080000000ull;
  1055 + return (a + b) ^ mask;
  1056 +}
  1057 +
  1058 +uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
  1059 +{
  1060 + uint64_t tmp;
  1061 + uint64_t tmp2;
  1062 +
  1063 + tmp = a & 0x0000ffff0000ffffull;
  1064 + tmp += (a >> 16) & 0x0000ffff0000ffffull;
  1065 + tmp2 = b & 0xffff0000ffff0000ull;
  1066 + tmp2 += (b << 16) & 0xffff0000ffff0000ull;
  1067 + return ( tmp & 0xffff)
  1068 + | ((tmp >> 16) & 0xffff0000ull)
  1069 + | ((tmp2 << 16) & 0xffff00000000ull)
  1070 + | ( tmp2 & 0xffff000000000000ull);
  1071 +}
  1072 +
  1073 +uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
  1074 +{
  1075 + uint32_t low = a + (a >> 32);
  1076 + uint32_t high = b + (b >> 32);
  1077 + return low + ((uint64_t)high << 32);
  1078 +}
  1079 +
  1080 +uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
  1081 +{
  1082 + uint64_t mask;
  1083 + mask = (a ^ ~b) & 0x8000800080008000ull;
  1084 + a |= 0x8000800080008000ull;
  1085 + b &= ~0x8000800080008000ull;
  1086 + return (a - b) ^ mask;
  1087 +}
  1088 +
  1089 +uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
  1090 +{
  1091 + uint64_t mask;
  1092 + mask = (a ^ ~b) & 0x8000000080000000ull;
  1093 + a |= 0x8000000080000000ull;
  1094 + b &= ~0x8000000080000000ull;
  1095 + return (a - b) ^ mask;
  1096 +}
  1097 +
  1098 +uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b)
  1099 +{
  1100 + uint32_t x, y;
  1101 + uint32_t low, high;
  1102 +
  1103 + x = a;
  1104 + y = b;
  1105 + low = x + y;
  1106 + if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
  1107 + SET_QC();
  1108 + low = ((int32_t)x >> 31) ^ ~SIGNBIT;
  1109 + }
  1110 + x = a >> 32;
  1111 + y = b >> 32;
  1112 + high = x + y;
  1113 + if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
  1114 + SET_QC();
  1115 + high = ((int32_t)x >> 31) ^ ~SIGNBIT;
  1116 + }
  1117 + return low | ((uint64_t)high << 32);
  1118 +}
  1119 +
  1120 +uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b)
  1121 +{
  1122 + uint64_t result;
  1123 +
  1124 + result = a + b;
  1125 + if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
  1126 + SET_QC();
  1127 + result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
  1128 + }
  1129 + return result;
  1130 +}
  1131 +
  1132 +#define DO_ABD(dest, x, y, type) do { \
  1133 + type tmp_x = x; \
  1134 + type tmp_y = y; \
  1135 + dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
  1136 + } while(0)
  1137 +
  1138 +uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
  1139 +{
  1140 + uint64_t tmp;
  1141 + uint64_t result;
  1142 + DO_ABD(result, a, b, uint8_t);
  1143 + DO_ABD(tmp, a >> 8, b >> 8, uint8_t);
  1144 + result |= tmp << 16;
  1145 + DO_ABD(tmp, a >> 16, b >> 16, uint8_t);
  1146 + result |= tmp << 32;
  1147 + DO_ABD(tmp, a >> 24, b >> 24, uint8_t);
  1148 + result |= tmp << 48;
  1149 + return result;
  1150 +}
  1151 +
  1152 +uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
  1153 +{
  1154 + uint64_t tmp;
  1155 + uint64_t result;
  1156 + DO_ABD(result, a, b, int8_t);
  1157 + DO_ABD(tmp, a >> 8, b >> 8, int8_t);
  1158 + result |= tmp << 16;
  1159 + DO_ABD(tmp, a >> 16, b >> 16, int8_t);
  1160 + result |= tmp << 32;
  1161 + DO_ABD(tmp, a >> 24, b >> 24, int8_t);
  1162 + result |= tmp << 48;
  1163 + return result;
  1164 +}
  1165 +
  1166 +uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
  1167 +{
  1168 + uint64_t tmp;
  1169 + uint64_t result;
  1170 + DO_ABD(result, a, b, uint16_t);
  1171 + DO_ABD(tmp, a >> 16, b >> 16, uint16_t);
  1172 + return result | (tmp << 32);
  1173 +}
  1174 +
  1175 +uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
  1176 +{
  1177 + uint64_t tmp;
  1178 + uint64_t result;
  1179 + DO_ABD(result, a, b, int16_t);
  1180 + DO_ABD(tmp, a >> 16, b >> 16, int16_t);
  1181 + return result | (tmp << 32);
  1182 +}
  1183 +
  1184 +uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
  1185 +{
  1186 + uint64_t result;
  1187 + DO_ABD(result, a, b, uint32_t);
  1188 + return result;
  1189 +}
  1190 +
  1191 +uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
  1192 +{
  1193 + uint64_t result;
  1194 + DO_ABD(result, a, b, int32_t);
  1195 + return result;
  1196 +}
  1197 +#undef DO_ABD
  1198 +
  1199 +/* Widening multiply. Named type is the source type. */
  1200 +#define DO_MULL(dest, x, y, type1, type2) do { \
  1201 + type1 tmp_x = x; \
  1202 + type1 tmp_y = y; \
  1203 + dest = (type2)((type2)tmp_x * (type2)tmp_y); \
  1204 + } while(0)
  1205 +
  1206 +uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
  1207 +{
  1208 + uint64_t tmp;
  1209 + uint64_t result;
  1210 +
  1211 + DO_MULL(result, a, b, uint8_t, uint16_t);
  1212 + DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
  1213 + result |= tmp << 16;
  1214 + DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
  1215 + result |= tmp << 32;
  1216 + DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
  1217 + result |= tmp << 48;
  1218 + return result;
  1219 +}
  1220 +
  1221 +uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
  1222 +{
  1223 + uint64_t tmp;
  1224 + uint64_t result;
  1225 +
  1226 + DO_MULL(result, a, b, int8_t, uint16_t);
  1227 + DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
  1228 + result |= tmp << 16;
  1229 + DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
  1230 + result |= tmp << 32;
  1231 + DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
  1232 + result |= tmp << 48;
  1233 + return result;
  1234 +}
  1235 +
  1236 +uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
  1237 +{
  1238 + uint64_t tmp;
  1239 + uint64_t result;
  1240 +
  1241 + DO_MULL(result, a, b, uint16_t, uint32_t);
  1242 + DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
  1243 + return result | (tmp << 32);
  1244 +}
  1245 +
  1246 +uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
  1247 +{
  1248 + uint64_t tmp;
  1249 + uint64_t result;
  1250 +
  1251 + DO_MULL(result, a, b, int16_t, uint32_t);
  1252 + DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
  1253 + return result | (tmp << 32);
  1254 +}
  1255 +
  1256 +uint64_t HELPER(neon_negl_u16)(uint64_t x)
  1257 +{
  1258 + uint16_t tmp;
  1259 + uint64_t result;
  1260 + result = (uint16_t)-x;
  1261 + tmp = -(x >> 16);
  1262 + result |= (uint64_t)tmp << 16;
  1263 + tmp = -(x >> 32);
  1264 + result |= (uint64_t)tmp << 32;
  1265 + tmp = -(x >> 48);
  1266 + result |= (uint64_t)tmp << 48;
  1267 + return result;
  1268 +}
  1269 +
  1271 +uint64_t HELPER(neon_negl_u32)(uint64_t x)
  1272 +{
  1273 + uint32_t low = -x;
  1274 + uint32_t high = -(x >> 32);
  1275 + return low | ((uint64_t)high << 32);
  1276 +}
  1277 +
  1278 +/* FIXME: There should be a native op for this. */
  1279 +uint64_t HELPER(neon_negl_u64)(uint64_t x)
  1280 +{
  1281 + return -x;
  1282 +}
  1283 +
  1284 +/* Saturating sign manipulation. */
  1285 +/* ??? Make these use NEON_VOP1 */
  1286 +#define DO_QABS8(x) do { \
  1287 + if (x == (int8_t)0x80) { \
  1288 + x = 0x7f; \
  1289 + SET_QC(); \
  1290 + } else if (x < 0) { \
  1291 + x = -x; \
  1292 + }} while (0)
  1293 +uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x)
  1294 +{
  1295 + neon_s8 vec;
  1296 + NEON_UNPACK(neon_s8, vec, x);
  1297 + DO_QABS8(vec.v1);
  1298 + DO_QABS8(vec.v2);
  1299 + DO_QABS8(vec.v3);
  1300 + DO_QABS8(vec.v4);
  1301 + NEON_PACK(neon_s8, x, vec);
  1302 + return x;
  1303 +}
  1304 +#undef DO_QABS8
  1305 +
  1306 +#define DO_QNEG8(x) do { \
  1307 + if (x == (int8_t)0x80) { \
  1308 + x = 0x7f; \
  1309 + SET_QC(); \
  1310 + } else { \
  1311 + x = -x; \
  1312 + }} while (0)
  1313 +uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x)
  1314 +{
  1315 + neon_s8 vec;
  1316 + NEON_UNPACK(neon_s8, vec, x);
  1317 + DO_QNEG8(vec.v1);
  1318 + DO_QNEG8(vec.v2);
  1319 + DO_QNEG8(vec.v3);
  1320 + DO_QNEG8(vec.v4);
  1321 + NEON_PACK(neon_s8, x, vec);
  1322 + return x;
  1323 +}
  1324 +#undef DO_QNEG8
  1325 +
  1326 +#define DO_QABS16(x) do { \
  1327 + if (x == (int16_t)0x8000) { \
  1328 + x = 0x7fff; \
  1329 + SET_QC(); \
  1330 + } else if (x < 0) { \
  1331 + x = -x; \
  1332 + }} while (0)
  1333 +uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x)
  1334 +{
  1335 + neon_s16 vec;
  1336 + NEON_UNPACK(neon_s16, vec, x);
  1337 + DO_QABS16(vec.v1);
  1338 + DO_QABS16(vec.v2);
  1339 + NEON_PACK(neon_s16, x, vec);
  1340 + return x;
  1341 +}
  1342 +#undef DO_QABS16
  1343 +
  1344 +#define DO_QNEG16(x) do { \
  1345 + if (x == (int16_t)0x8000) { \
  1346 + x = 0x7fff; \
  1347 + SET_QC(); \
  1348 + } else { \
  1349 + x = -x; \
  1350 + }} while (0)
  1351 +uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x)
  1352 +{
  1353 + neon_s16 vec;
  1354 + NEON_UNPACK(neon_s16, vec, x);
  1355 + DO_QNEG16(vec.v1);
  1356 + DO_QNEG16(vec.v2);
  1357 + NEON_PACK(neon_s16, x, vec);
  1358 + return x;
  1359 +}
  1360 +#undef DO_QNEG16
  1361 +
  1362 +uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x)
  1363 +{
  1364 + if (x == SIGNBIT) {
  1365 + SET_QC();
  1366 + x = ~SIGNBIT;
  1367 + } else if ((int32_t)x < 0) {
  1368 + x = -x;
  1369 + }
  1370 + return x;
  1371 +}
  1372 +
  1373 +uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x)
  1374 +{
  1375 + if (x == SIGNBIT) {
  1376 + SET_QC();
  1377 + x = ~SIGNBIT;
  1378 + } else {
  1379 + x = -x;
  1380 + }
  1381 + return x;
  1382 +}
  1383 +
  1384 +/* NEON Float helpers. */
  1385 +uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
  1386 +{
  1387 + float32 f0 = vfp_itos(a);
  1388 + float32 f1 = vfp_itos(b);
  1389 + return (float32_compare_quiet(f0, f1, NFS) == -1) ? a : b;
  1390 +}
  1391 +
  1392 +uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
  1393 +{
  1394 + float32 f0 = vfp_itos(a);
  1395 + float32 f1 = vfp_itos(b);
  1396 + return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b;
  1397 +}
  1398 +
  1399 +uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
  1400 +{
  1401 + float32 f0 = vfp_itos(a);
  1402 + float32 f1 = vfp_itos(b);
  1403 + return vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
  1404 + ? float32_sub(f0, f1, NFS)
  1405 + : float32_sub(f1, f0, NFS));
  1406 +}
  1407 +
  1408 +uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
  1409 +{
  1410 + return vfp_stoi(float32_add(vfp_itos(a), vfp_itos(b), NFS));
  1411 +}
  1412 +
  1413 +uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
  1414 +{
  1415 + return vfp_stoi(float32_sub(vfp_itos(a), vfp_itos(b), NFS));
  1416 +}
  1417 +
  1418 +uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
  1419 +{
  1420 + return vfp_stoi(float32_mul(vfp_itos(a), vfp_itos(b), NFS));
  1421 +}
  1422 +
  1423 +/* Floating point comparisons produce an integer result. */
  1424 +#define NEON_VOP_FCMP(name, cmp) \
  1425 +uint32_t HELPER(neon_##name)(uint32_t a, uint32_t b) \
  1426 +{ \
  1427 + if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) cmp 0) \
  1428 + return ~0; \
  1429 + else \
  1430 + return 0; \
  1431 +}
  1432 +
  1433 +NEON_VOP_FCMP(ceq_f32, ==)
  1434 +NEON_VOP_FCMP(cge_f32, >=)
  1435 +NEON_VOP_FCMP(cgt_f32, >)
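/* Editor's note (not part of this commit): for reference,
   NEON_VOP_FCMP(cgt_f32, >) above expands into a helper equivalent to: */
uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b)
{
    if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) > 0)
        return ~0;            /* all-ones result means "true" for the lane */
    else
        return 0;
}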
  1436 +
  1437 +uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
  1438 +{
  1439 + float32 f0 = float32_abs(vfp_itos(a));
  1440 + float32 f1 = float32_abs(vfp_itos(b));
  1441 + return (float32_compare_quiet(f0, f1, NFS) >= 0) ? ~0 : 0;
  1442 +}
  1443 +
  1444 +uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
  1445 +{
  1446 + float32 f0 = float32_abs(vfp_itos(a));
  1447 + float32 f1 = float32_abs(vfp_itos(b));
  1448 + return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0;
  1449 +}
... ...
target-arm/op.c
... ... @@ -32,7 +32,5 @@
32 32 #include "op_mem.h"
33 33 #endif
34 34  
35   -#include "op_neon.h"
36   -
37 35 /* iwMMXt support */
38 36 #include "op_iwmmxt.c"
... ...
target-arm/op_helper.c
... ... @@ -20,6 +20,9 @@
20 20 #include "exec.h"
21 21 #include "helpers.h"
22 22  
  23 +#define SIGNBIT (uint32_t)0x80000000
  24 +#define SIGNBIT64 ((uint64_t)1 << 63)
  25 +
23 26 void raise_exception(int tt)
24 27 {
25 28 env->exception_index = tt;
... ... @@ -116,7 +119,8 @@ void tlb_fill (target_ulong addr, int is_write, int mmu_idx, void *retaddr)
116 119 }
117 120 #endif
118 121  
119   -#define SIGNBIT (uint32_t)0x80000000
  122 +/* FIXME: Pass an explicit pointer to QF to CPUState, and move saturating
  123 + instructions into helper.c */
120 124 uint32_t HELPER(add_setq)(uint32_t a, uint32_t b)
121 125 {
122 126 uint32_t res = a + b;
... ... @@ -451,3 +455,114 @@ uint32_t HELPER(ror_cc)(uint32_t x, uint32_t i)
451 455 }
452 456 }
453 457  
  458 +uint64_t HELPER(neon_add_saturate_s64)(uint64_t src1, uint64_t src2)
  459 +{
  460 + uint64_t res;
  461 +
  462 + res = src1 + src2;
  463 + if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
  464 + env->QF = 1;
  465 + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
  466 + }
  467 + return res;
  468 +}
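/* Editor's illustration (not part of this commit): the XOR test above flags
   signed overflow exactly when src1 and src2 share a sign that the sum does
   not, and the saturated result follows the sign of src1 ((int64_t)src1 >> 63
   is 0 or ~0, so XOR with ~SIGNBIT64 yields INT64_MAX or INT64_MIN). A
   hypothetical sanity check, assuming <assert.h> and an initialised env
   (the helper sets env->QF): */
static void check_neon_add_saturate_s64(void)
{
    /* INT64_MAX + 1 overflows positively: QF set, result clamps to INT64_MAX. */
    assert(HELPER(neon_add_saturate_s64)(0x7fffffffffffffffULL, 1) ==
           0x7fffffffffffffffULL);
    /* INT64_MIN + (-1) overflows negatively: QF set, result clamps to INT64_MIN. */
    assert(HELPER(neon_add_saturate_s64)(0x8000000000000000ULL,
                                         0xffffffffffffffffULL) ==
           0x8000000000000000ULL);
}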
  469 +
  470 +uint64_t HELPER(neon_add_saturate_u64)(uint64_t src1, uint64_t src2)
  471 +{
  472 + uint64_t res;
  473 +
  474 + res = src1 + src2;
  475 + if (res < src1) {
  476 + env->QF = 1;
  477 + res = ~(uint64_t)0;
  478 + }
  479 + return res;
  480 +}
  481 +
  482 +uint64_t HELPER(neon_sub_saturate_s64)(uint64_t src1, uint64_t src2)
  483 +{
  484 + uint64_t res;
  485 +
  486 + res = src1 - src2;
  487 + if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
  488 + env->QF = 1;
  489 + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
  490 + }
  491 + return res;
  492 +}
  493 +
  494 +uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2)
  495 +{
  496 + uint64_t res;
  497 +
  498 + if (src1 < src2) {
  499 + env->QF = 1;
  500 + res = 0;
  501 + } else {
  502 + res = src1 - src2;
  503 + }
  504 + return res;
  505 +}
  506 +
  507 +/* These need to return a pair of values, so still use T0/T1. */
  508 +/* Transpose. Argument order is rather strange to avoid special casing
  509 + the translation code.
  510 + On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */
  511 +void HELPER(neon_trn_u8)(void)
  512 +{
  513 + uint32_t rd;
  514 + uint32_t rm;
  515 + rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
  516 + rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
  517 + T0 = rd;
  518 + T1 = rm;
  519 + FORCE_RET();
  520 +}
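/* Editor's worked example (not part of this commit): with input
   T0 = rm = 0x44332211 and T1 = rd = 0x88776655 the code above leaves
   T0 = rd = 0x33771155 and T1 = rm = 0x44882266, i.e. byte 2i+1 of rd is
   exchanged with byte 2i of rm, which is the VTRN.8 element swap. */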
  521 +
  522 +void HELPER(neon_trn_u16)(void)
  523 +{
  524 + uint32_t rd;
  525 + uint32_t rm;
  526 + rd = (T0 << 16) | (T1 & 0xffff);
  527 + rm = (T1 >> 16) | (T0 & 0xffff0000);
  528 + T0 = rd;
  529 + T1 = rm;
  530 + FORCE_RET();
  531 +}
  532 +
  533 +/* Worker routines for zip and unzip. */
  534 +void HELPER(neon_unzip_u8)(void)
  535 +{
  536 + uint32_t rd;
  537 + uint32_t rm;
  538 + rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
  539 + | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
  540 + rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
  541 + | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
  542 + T0 = rd;
  543 + T1 = rm;
  544 + FORCE_RET();
  545 +}
  546 +
  547 +void HELPER(neon_zip_u8)(void)
  548 +{
  549 + uint32_t rd;
  550 + uint32_t rm;
  551 + rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
  552 + | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
  553 + rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
  554 + | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
  555 + T0 = rd;
  556 + T1 = rm;
  557 + FORCE_RET();
  558 +}
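/* Editor's worked example (not part of this commit): zip interleaves the
   bytes of its two inputs. With T0 = 0x03020100 and T1 = 0x07060504 the code
   above leaves T0 = 0x05010400 and T1 = 0x07030602, i.e. the byte sequence
   00 04 01 05 02 06 03 07. neon_unzip_u8 above performs the inverse step,
   gathering even-numbered bytes into T0 and odd-numbered bytes into T1. */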
  559 +
  560 +void HELPER(neon_zip_u16)(void)
  561 +{
  562 + uint32_t tmp;
  563 +
  564 + tmp = (T0 & 0xffff) | (T1 << 16);
  565 + T1 = (T1 & 0xffff0000) | (T0 >> 16);
  566 + T0 = tmp;
  567 + FORCE_RET();
  568 +}
... ...
target-arm/op_neon.h deleted 100644 → 0
1   -/*
2   - * ARM NEON vector operations.
3   - *
4   - * Copyright (c) 2007 CodeSourcery.
5   - * Written by Paul Brook
6   - *
7   - * This code is licenced under the GPL.
8   - */
9   -/* Note that for NEON an "l" prefix means it is a wide operation, unlike
10   - scalar arm ops where it means a word size operation. */
11   -
12   -#define SIGNBIT (uint32_t)0x80000000
13   -/* ??? NEON ops should probably have their own float status. */
14   -#define NFS &env->vfp.fp_status
15   -#define NEON_OP(name) void OPPROTO op_neon_##name (void)
16   -
17   -/* Helper routines to perform bitwise copies between float and int. */
18   -static inline float32 vfp_itos(uint32_t i)
19   -{
20   - union {
21   - uint32_t i;
22   - float32 s;
23   - } v;
24   -
25   - v.i = i;
26   - return v.s;
27   -}
28   -
29   -static inline uint32_t vfp_stoi(float32 s)
30   -{
31   - union {
32   - uint32_t i;
33   - float32 s;
34   - } v;
35   -
36   - v.s = s;
37   - return v.i;
38   -}
39   -
40   -NEON_OP(getreg_T0)
41   -{
42   - T0 = *(uint32_t *)((char *) env + PARAM1);
43   -}
44   -
45   -NEON_OP(getreg_T1)
46   -{
47   - T1 = *(uint32_t *)((char *) env + PARAM1);
48   -}
49   -
50   -NEON_OP(setreg_T0)
51   -{
52   - *(uint32_t *)((char *) env + PARAM1) = T0;
53   -}
54   -
55   -NEON_OP(setreg_T1)
56   -{
57   - *(uint32_t *)((char *) env + PARAM1) = T1;
58   -}
59   -
60   -#define NEON_TYPE1(name, type) \
61   -typedef struct \
62   -{ \
63   - type v1; \
64   -} neon_##name;
65   -#ifdef WORDS_BIGENDIAN
66   -#define NEON_TYPE2(name, type) \
67   -typedef struct \
68   -{ \
69   - type v2; \
70   - type v1; \
71   -} neon_##name;
72   -#define NEON_TYPE4(name, type) \
73   -typedef struct \
74   -{ \
75   - type v4; \
76   - type v3; \
77   - type v2; \
78   - type v1; \
79   -} neon_##name;
80   -#else
81   -#define NEON_TYPE2(name, type) \
82   -typedef struct \
83   -{ \
84   - type v1; \
85   - type v2; \
86   -} neon_##name;
87   -#define NEON_TYPE4(name, type) \
88   -typedef struct \
89   -{ \
90   - type v1; \
91   - type v2; \
92   - type v3; \
93   - type v4; \
94   -} neon_##name;
95   -#endif
96   -
97   -NEON_TYPE4(s8, int8_t)
98   -NEON_TYPE4(u8, uint8_t)
99   -NEON_TYPE2(s16, int16_t)
100   -NEON_TYPE2(u16, uint16_t)
101   -NEON_TYPE1(s32, int32_t)
102   -NEON_TYPE1(u32, uint32_t)
103   -#undef NEON_TYPE4
104   -#undef NEON_TYPE2
105   -#undef NEON_TYPE1
106   -
107   -/* Copy from a uint32_t to a vector structure type. */
108   -#define NEON_UNPACK(vtype, dest, val) do { \
109   - union { \
110   - vtype v; \
111   - uint32_t i; \
112   - } conv_u; \
113   - conv_u.i = (val); \
114   - dest = conv_u.v; \
115   - } while(0)
116   -
117   -/* Copy from a vector structure type to a uint32_t. */
118   -#define NEON_PACK(vtype, dest, val) do { \
119   - union { \
120   - vtype v; \
121   - uint32_t i; \
122   - } conv_u; \
123   - conv_u.v = (val); \
124   - dest = conv_u.i; \
125   - } while(0)
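/* Editor's illustration (not part of this commit): how the pack/unpack pair
   reinterprets one 32-bit value as lanes; the new helpers in neon_helper.c
   above keep using the same NEON_UNPACK/NEON_PACK idiom. A hypothetical use: */
static uint32_t negate_low_byte_example(uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);   /* x = 0x04030201 -> v1..v4 = 1,2,3,4 */
    vec.v1 = -vec.v1;               /* negate the lowest lane only */
    NEON_PACK(neon_s8, x, vec);     /* repack -> 0x040302ff */
    return x;
}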
126   -
127   -#define NEON_DO1 \
128   - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
129   -#define NEON_DO2 \
130   - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
131   - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
132   -#define NEON_DO4 \
133   - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
134   - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
135   - NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
136   - NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
137   -
138   -#define NEON_VOP(name, vtype, n) \
139   -NEON_OP(name) \
140   -{ \
141   - vtype vsrc1; \
142   - vtype vsrc2; \
143   - vtype vdest; \
144   - NEON_UNPACK(vtype, vsrc1, T0); \
145   - NEON_UNPACK(vtype, vsrc2, T1); \
146   - NEON_DO##n; \
147   - NEON_PACK(vtype, T0, vdest); \
148   - FORCE_RET(); \
149   -}
150   -
151   -#define NEON_VOP1(name, vtype, n) \
152   -NEON_OP(name) \
153   -{ \
154   - vtype vsrc1; \
155   - vtype vdest; \
156   - NEON_UNPACK(vtype, vsrc1, T0); \
157   - NEON_DO##n; \
158   - NEON_PACK(vtype, T0, vdest); \
159   - FORCE_RET(); \
160   -}
161   -
162   -/* Pairwise operations. */
163   -/* For 32-bit elements each segment only contains a single element, so
164   - the elementwise and pairwise operations are the same. */
165   -#define NEON_PDO2 \
166   - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
167   - NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
168   -#define NEON_PDO4 \
169   - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
170   - NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
171   - NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
172   - NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
173   -
174   -#define NEON_POP(name, vtype, n) \
175   -NEON_OP(name) \
176   -{ \
177   - vtype vsrc1; \
178   - vtype vsrc2; \
179   - vtype vdest; \
180   - NEON_UNPACK(vtype, vsrc1, T0); \
181   - NEON_UNPACK(vtype, vsrc2, T1); \
182   - NEON_PDO##n; \
183   - NEON_PACK(vtype, T0, vdest); \
184   - FORCE_RET(); \
185   -}
186   -
187   -#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
188   -NEON_VOP(hadd_s8, neon_s8, 4)
189   -NEON_VOP(hadd_u8, neon_u8, 4)
190   -NEON_VOP(hadd_s16, neon_s16, 2)
191   -NEON_VOP(hadd_u16, neon_u16, 2)
192   -#undef NEON_FN
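/* Editor's note (not part of this commit): NEON_VOP(hadd_u8, neon_u8, 4)
   above expanded into a dyngen-style op over the T0/T1 globals, roughly: */
void OPPROTO op_neon_hadd_u8 (void)
{
    neon_u8 vsrc1;
    neon_u8 vsrc2;
    neon_u8 vdest;
    NEON_UNPACK(neon_u8, vsrc1, T0);
    NEON_UNPACK(neon_u8, vsrc2, T1);
    vdest.v1 = (vsrc1.v1 + vsrc2.v1) >> 1;   /* NEON_FN: halving add per lane */
    vdest.v2 = (vsrc1.v2 + vsrc2.v2) >> 1;
    vdest.v3 = (vsrc1.v3 + vsrc2.v3) >> 1;
    vdest.v4 = (vsrc1.v4 + vsrc2.v4) >> 1;
    NEON_PACK(neon_u8, T0, vdest);
    FORCE_RET();
}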
193   -
194   -NEON_OP(hadd_s32)
195   -{
196   - int32_t src1 = T0;
197   - int32_t src2 = T1;
198   - int32_t dest;
199   -
200   - dest = (src1 >> 1) + (src2 >> 1);
201   - if (src1 & src2 & 1)
202   - dest++;
203   - T0 = dest;
204   - FORCE_RET();
205   -}
206   -
207   -NEON_OP(hadd_u32)
208   -{
209   - uint32_t src1 = T0;
210   - uint32_t src2 = T1;
211   - uint32_t dest;
212   -
213   - dest = (src1 >> 1) + (src2 >> 1);
214   - if (src1 & src2 & 1)
215   - dest++;
216   - T0 = dest;
217   - FORCE_RET();
218   -}
219   -
220   -#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
221   -NEON_VOP(rhadd_s8, neon_s8, 4)
222   -NEON_VOP(rhadd_u8, neon_u8, 4)
223   -NEON_VOP(rhadd_s16, neon_s16, 2)
224   -NEON_VOP(rhadd_u16, neon_u16, 2)
225   -#undef NEON_FN
226   -
227   -NEON_OP(rhadd_s32)
228   -{
229   - int32_t src1 = T0;
230   - int32_t src2 = T1;
231   - int32_t dest;
232   -
233   - dest = (src1 >> 1) + (src2 >> 1);
234   - if ((src1 | src2) & 1)
235   - dest++;
236   - T0 = dest;
237   - FORCE_RET();
238   -}
239   -
240   -NEON_OP(rhadd_u32)
241   -{
242   - uint32_t src1 = T0;
243   - uint32_t src2 = T1;
244   - uint32_t dest;
245   -
246   - dest = (src1 >> 1) + (src2 >> 1);
247   - if ((src1 | src2) & 1)
248   - dest++;
249   - T0 = dest;
250   - FORCE_RET();
251   -}
252   -
253   -#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
254   -NEON_VOP(hsub_s8, neon_s8, 4)
255   -NEON_VOP(hsub_u8, neon_u8, 4)
256   -NEON_VOP(hsub_s16, neon_s16, 2)
257   -NEON_VOP(hsub_u16, neon_u16, 2)
258   -#undef NEON_FN
259   -
260   -NEON_OP(hsub_s32)
261   -{
262   - int32_t src1 = T0;
263   - int32_t src2 = T1;
264   - int32_t dest;
265   -
266   - dest = (src1 >> 1) - (src2 >> 1);
267   - if ((~src1) & src2 & 1)
268   - dest--;
269   - T0 = dest;
270   - FORCE_RET();
271   -}
272   -
273   -NEON_OP(hsub_u32)
274   -{
275   - uint32_t src1 = T0;
276   - uint32_t src2 = T1;
277   - uint32_t dest;
278   -
279   - dest = (src1 >> 1) - (src2 >> 1);
280   - if ((~src1) & src2 & 1)
281   - dest--;
282   - T0 = dest;
283   - FORCE_RET();
284   -}
285   -
286   -#define NEON_USAT(dest, src1, src2, type) do { \
287   - uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
288   - if (tmp != (type)tmp) { \
289   - env->QF = 1; \
290   - dest = ~0; \
291   - } else { \
292   - dest = tmp; \
293   - }} while(0)
294   -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
295   -NEON_VOP(qadd_u8, neon_u8, 4)
296   -#undef NEON_FN
297   -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
298   -NEON_VOP(qadd_u16, neon_u16, 2)
299   -#undef NEON_FN
300   -#undef NEON_USAT
301   -
302   -#define NEON_SSAT(dest, src1, src2, type) do { \
303   - int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
304   - if (tmp != (type)tmp) { \
305   - env->QF = 1; \
306   - if (src2 > 0) { \
307   - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
308   - } else { \
309   - tmp = 1 << (sizeof(type) * 8 - 1); \
310   - } \
311   - } \
312   - dest = tmp; \
313   - } while(0)
314   -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
315   -NEON_VOP(qadd_s8, neon_s8, 4)
316   -#undef NEON_FN
317   -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
318   -NEON_VOP(qadd_s16, neon_s16, 2)
319   -#undef NEON_FN
320   -#undef NEON_SSAT
321   -
322   -#define NEON_USAT(dest, src1, src2, type) do { \
323   - uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
324   - if (tmp != (type)tmp) { \
325   - env->QF = 1; \
326   - dest = 0; \
327   - } else { \
328   - dest = tmp; \
329   - }} while(0)
330   -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
331   -NEON_VOP(qsub_u8, neon_u8, 4)
332   -#undef NEON_FN
333   -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
334   -NEON_VOP(qsub_u16, neon_u16, 2)
335   -#undef NEON_FN
336   -#undef NEON_USAT
337   -
338   -#define NEON_SSAT(dest, src1, src2, type) do { \
339   - int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
340   - if (tmp != (type)tmp) { \
341   - env->QF = 1; \
342   - if (src2 < 0) { \
343   - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
344   - } else { \
345   - tmp = 1 << (sizeof(type) * 8 - 1); \
346   - } \
347   - } \
348   - dest = tmp; \
349   - } while(0)
350   -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
351   -NEON_VOP(qsub_s8, neon_s8, 4)
352   -#undef NEON_FN
353   -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
354   -NEON_VOP(qsub_s16, neon_s16, 2)
355   -#undef NEON_FN
356   -#undef NEON_SSAT
357   -
358   -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
359   -NEON_VOP(cgt_s8, neon_s8, 4)
360   -NEON_VOP(cgt_u8, neon_u8, 4)
361   -NEON_VOP(cgt_s16, neon_s16, 2)
362   -NEON_VOP(cgt_u16, neon_u16, 2)
363   -NEON_VOP(cgt_s32, neon_s32, 1)
364   -NEON_VOP(cgt_u32, neon_u32, 1)
365   -#undef NEON_FN
366   -
367   -#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
368   -NEON_VOP(cge_s8, neon_s8, 4)
369   -NEON_VOP(cge_u8, neon_u8, 4)
370   -NEON_VOP(cge_s16, neon_s16, 2)
371   -NEON_VOP(cge_u16, neon_u16, 2)
372   -NEON_VOP(cge_s32, neon_s32, 1)
373   -NEON_VOP(cge_u32, neon_u32, 1)
374   -#undef NEON_FN
375   -
376   -#define NEON_FN(dest, src1, src2) do { \
377   - int8_t tmp; \
378   - tmp = (int8_t)src2; \
379   - if (tmp < 0) { \
380   - dest = src1 >> -tmp; \
381   - } else { \
382   - dest = src1 << tmp; \
383   - }} while (0)
384   -NEON_VOP(shl_s8, neon_s8, 4)
385   -NEON_VOP(shl_u8, neon_u8, 4)
386   -NEON_VOP(shl_s16, neon_s16, 2)
387   -NEON_VOP(shl_u16, neon_u16, 2)
388   -NEON_VOP(shl_s32, neon_s32, 1)
389   -NEON_VOP(shl_u32, neon_u32, 1)
390   -#undef NEON_FN
391   -
392   -NEON_OP(shl_u64)
393   -{
394   - int8_t shift = env->vfp.scratch[0];
395   - uint64_t val = T0 | ((uint64_t)T1 << 32);
396   - if (shift < 0) {
397   - val >>= -shift;
398   - } else {
399   - val <<= shift;
400   - }
401   - T0 = val;
402   - T1 = val >> 32;
403   - FORCE_RET();
404   -}
405   -
406   -NEON_OP(shl_s64)
407   -{
408   - int8_t shift = env->vfp.scratch[0];
409   - int64_t val = T0 | ((uint64_t)T1 << 32);
410   - if (shift < 0) {
411   - val >>= -shift;
412   - } else {
413   - val <<= shift;
414   - }
415   - T0 = val;
416   - T1 = val >> 32;
417   - FORCE_RET();
418   -}
419   -
420   -#define NEON_FN(dest, src1, src2) do { \
421   - int8_t tmp; \
422   - tmp = (int8_t)src1; \
423   - if (tmp < 0) { \
424   - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
425   - } else { \
426   - dest = src2 << tmp; \
427   - }} while (0)
428   -
429   -NEON_VOP(rshl_s8, neon_s8, 4)
430   -NEON_VOP(rshl_u8, neon_u8, 4)
431   -NEON_VOP(rshl_s16, neon_s16, 2)
432   -NEON_VOP(rshl_u16, neon_u16, 2)
433   -NEON_VOP(rshl_s32, neon_s32, 1)
434   -NEON_VOP(rshl_u32, neon_u32, 1)
435   -#undef NEON_FN
436   -
437   -NEON_OP(rshl_u64)
438   -{
439   - int8_t shift = env->vfp.scratch[0];
440   - uint64_t val = T0 | ((uint64_t)T1 << 32);
441   - if (shift < 0) {
442   - val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
443   - val >>= -shift;
444   - } else {
445   - val <<= shift;
446   - }
447   - T0 = val;
448   - T1 = val >> 32;
449   - FORCE_RET();
450   -}
451   -
452   -NEON_OP(rshl_s64)
453   -{
454   - int8_t shift = env->vfp.scratch[0];
455   - int64_t val = T0 | ((uint64_t)T1 << 32);
456   - if (shift < 0) {
457   - val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
458   - } else {
459   - val <<= shift;
460   - }
461   - T0 = val;
462   - T1 = val >> 32;
463   - FORCE_RET();
464   -}
465   -
466   -#define NEON_FN(dest, src1, src2) do { \
467   - int8_t tmp; \
468   - tmp = (int8_t)src1; \
469   - if (tmp < 0) { \
470   - dest = src2 >> -tmp; \
471   - } else { \
472   - dest = src2 << tmp; \
473   - if ((dest >> tmp) != src2) { \
474   - env->QF = 1; \
475   - dest = ~0; \
476   - } \
477   - }} while (0)
478   -NEON_VOP(qshl_s8, neon_s8, 4)
479   -NEON_VOP(qshl_s16, neon_s16, 2)
480   -NEON_VOP(qshl_s32, neon_s32, 1)
481   -#undef NEON_FN
482   -
483   -NEON_OP(qshl_s64)
484   -{
485   - int8_t shift = env->vfp.scratch[0];
486   - int64_t val = T0 | ((uint64_t)T1 << 32);
487   - if (shift < 0) {
488   - val >>= -shift;
489   - } else {
490   - int64_t tmp = val;
491   - val <<= shift;
492   - if ((val >> shift) != tmp) {
493   - env->QF = 1;
494   - val = (tmp >> 63) ^ 0x7fffffffffffffffULL;
495   - }
496   - }
497   - T0 = val;
498   - T1 = val >> 32;
499   - FORCE_RET();
500   -}
501   -
502   -#define NEON_FN(dest, src1, src2) do { \
503   - int8_t tmp; \
504   - tmp = (int8_t)src1; \
505   - if (tmp < 0) { \
506   - dest = src2 >> -tmp; \
507   - } else { \
508   - dest = src2 << tmp; \
509   - if ((dest >> tmp) != src2) { \
510   - env->QF = 1; \
511   - dest = src2 >> 31; \
512   - } \
513   - }} while (0)
514   -NEON_VOP(qshl_u8, neon_u8, 4)
515   -NEON_VOP(qshl_u16, neon_u16, 2)
516   -NEON_VOP(qshl_u32, neon_u32, 1)
517   -#undef NEON_FN
518   -
519   -NEON_OP(qshl_u64)
520   -{
521   - int8_t shift = env->vfp.scratch[0];
522   - uint64_t val = T0 | ((uint64_t)T1 << 32);
523   - if (shift < 0) {
524   - val >>= -shift;
525   - } else {
526   - uint64_t tmp = val;
527   - val <<= shift;
528   - if ((val >> shift) != tmp) {
529   - env->QF = 1;
530   - val = ~(uint64_t)0;
531   - }
532   - }
533   - T0 = val;
534   - T1 = val >> 32;
535   - FORCE_RET();
536   -}
537   -
538   -#define NEON_FN(dest, src1, src2) do { \
539   - int8_t tmp; \
540   - tmp = (int8_t)src1; \
541   - if (tmp < 0) { \
542   - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
543   - } else { \
544   - dest = src2 << tmp; \
545   - if ((dest >> tmp) != src2) { \
546   - dest = ~0; \
547   - } \
548   - }} while (0)
549   -NEON_VOP(qrshl_s8, neon_s8, 4)
550   -NEON_VOP(qrshl_s16, neon_s16, 2)
551   -NEON_VOP(qrshl_s32, neon_s32, 1)
552   -#undef NEON_FN
553   -
554   -#define NEON_FN(dest, src1, src2) do { \
555   - int8_t tmp; \
556   - tmp = (int8_t)src1; \
557   - if (tmp < 0) { \
558   - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
559   - } else { \
560   - dest = src2 << tmp; \
561   - if ((dest >> tmp) != src2) { \
562   - env->QF = 1; \
563   - dest = src2 >> 31; \
564   - } \
565   - }} while (0)
566   -NEON_VOP(qrshl_u8, neon_u8, 4)
567   -NEON_VOP(qrshl_u16, neon_u16, 2)
568   -NEON_VOP(qrshl_u32, neon_u32, 1)
569   -#undef NEON_FN
570   -
571   -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
572   -NEON_VOP(max_s8, neon_s8, 4)
573   -NEON_VOP(max_u8, neon_u8, 4)
574   -NEON_VOP(max_s16, neon_s16, 2)
575   -NEON_VOP(max_u16, neon_u16, 2)
576   -NEON_VOP(max_s32, neon_s32, 1)
577   -NEON_VOP(max_u32, neon_u32, 1)
578   -NEON_POP(pmax_s8, neon_s8, 4)
579   -NEON_POP(pmax_u8, neon_u8, 4)
580   -NEON_POP(pmax_s16, neon_s16, 2)
581   -NEON_POP(pmax_u16, neon_u16, 2)
582   -#undef NEON_FN
583   -
584   -NEON_OP(max_f32)
585   -{
586   - float32 f0 = vfp_itos(T0);
587   - float32 f1 = vfp_itos(T1);
588   - T0 = (float32_compare_quiet(f0, f1, NFS) == 1) ? T0 : T1;
589   - FORCE_RET();
590   -}
591   -
592   -#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
593   -NEON_VOP(min_s8, neon_s8, 4)
594   -NEON_VOP(min_u8, neon_u8, 4)
595   -NEON_VOP(min_s16, neon_s16, 2)
596   -NEON_VOP(min_u16, neon_u16, 2)
597   -NEON_VOP(min_s32, neon_s32, 1)
598   -NEON_VOP(min_u32, neon_u32, 1)
599   -NEON_POP(pmin_s8, neon_s8, 4)
600   -NEON_POP(pmin_u8, neon_u8, 4)
601   -NEON_POP(pmin_s16, neon_s16, 2)
602   -NEON_POP(pmin_u16, neon_u16, 2)
603   -#undef NEON_FN
604   -
605   -NEON_OP(min_f32)
606   -{
607   - float32 f0 = vfp_itos(T0);
608   - float32 f1 = vfp_itos(T1);
609   - T0 = (float32_compare_quiet(f0, f1, NFS) == -1) ? T0 : T1;
610   - FORCE_RET();
611   -}
612   -
613   -#define NEON_FN(dest, src1, src2) \
614   - dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
615   -NEON_VOP(abd_s8, neon_s8, 4)
616   -NEON_VOP(abd_u8, neon_u8, 4)
617   -NEON_VOP(abd_s16, neon_s16, 2)
618   -NEON_VOP(abd_u16, neon_u16, 2)
619   -NEON_VOP(abd_s32, neon_s32, 1)
620   -NEON_VOP(abd_u32, neon_u32, 1)
621   -#undef NEON_FN
622   -
623   -NEON_OP(abd_f32)
624   -{
625   - float32 f0 = vfp_itos(T0);
626   - float32 f1 = vfp_itos(T1);
627   - T0 = vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
628   - ? float32_sub(f0, f1, NFS)
629   - : float32_sub(f1, f0, NFS));
630   - FORCE_RET();
631   -}
632   -
633   -#define NEON_FN(dest, src1, src2) dest = src1 + src2
634   -NEON_VOP(add_u8, neon_u8, 4)
635   -NEON_VOP(add_u16, neon_u16, 2)
636   -NEON_POP(padd_u8, neon_u8, 4)
637   -NEON_POP(padd_u16, neon_u16, 2)
638   -#undef NEON_FN
639   -
640   -NEON_OP(add_f32)
641   -{
642   - T0 = vfp_stoi(float32_add(vfp_itos(T0), vfp_itos(T1), NFS));
643   - FORCE_RET();
644   -}
645   -
646   -#define NEON_FN(dest, src1, src2) dest = src1 - src2
647   -NEON_VOP(sub_u8, neon_u8, 4)
648   -NEON_VOP(sub_u16, neon_u16, 2)
649   -#undef NEON_FN
650   -
651   -NEON_OP(sub_f32)
652   -{
653   - T0 = vfp_stoi(float32_sub(vfp_itos(T0), vfp_itos(T1), NFS));
654   - FORCE_RET();
655   -}
656   -
657   -#define NEON_FN(dest, src1, src2) dest = src2 - src1
658   -NEON_VOP(rsb_u8, neon_u8, 4)
659   -NEON_VOP(rsb_u16, neon_u16, 2)
660   -#undef NEON_FN
661   -
662   -NEON_OP(rsb_f32)
663   -{
664   - T0 = vfp_stoi(float32_sub(vfp_itos(T1), vfp_itos(T0), NFS));
665   - FORCE_RET();
666   -}
667   -
668   -#define NEON_FN(dest, src1, src2) dest = src1 * src2
669   -NEON_VOP(mul_u8, neon_u8, 4)
670   -NEON_VOP(mul_u16, neon_u16, 2)
671   -#undef NEON_FN
672   -
673   -NEON_OP(mul_f32)
674   -{
675   - T0 = vfp_stoi(float32_mul(vfp_itos(T0), vfp_itos(T1), NFS));
676   - FORCE_RET();
677   -}
678   -
679   -NEON_OP(mul_p8)
680   -{
681   - T0 = helper_neon_mul_p8(T0, T1);
682   -}
683   -
684   -#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
685   -NEON_VOP(tst_u8, neon_u8, 4)
686   -NEON_VOP(tst_u16, neon_u16, 2)
687   -NEON_VOP(tst_u32, neon_u32, 1)
688   -#undef NEON_FN
689   -
690   -#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
691   -NEON_VOP(ceq_u8, neon_u8, 4)
692   -NEON_VOP(ceq_u16, neon_u16, 2)
693   -NEON_VOP(ceq_u32, neon_u32, 1)
694   -#undef NEON_FN
695   -
696   -#define NEON_QDMULH16(dest, src1, src2, round) do { \
697   - uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
698   - if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
699   - env->QF = 1; \
700   - tmp = (tmp >> 31) ^ ~SIGNBIT; \
701   - } \
702   - tmp <<= 1; \
703   - if (round) { \
704   - int32_t old = tmp; \
705   - tmp += 1 << 15; \
706   - if ((int32_t)tmp < old) { \
707   - env->QF = 1; \
708   - tmp = SIGNBIT - 1; \
709   - } \
710   - } \
711   - dest = tmp >> 16; \
712   - } while(0)
713   -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
714   -NEON_VOP(qdmulh_s16, neon_s16, 2)
715   -#undef NEON_FN
716   -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
717   -NEON_VOP(qrdmulh_s16, neon_s16, 2)
718   -#undef NEON_FN
719   -#undef NEON_QDMULH16
720   -
721   -#define SIGNBIT64 ((uint64_t)1 << 63)
722   -#define NEON_QDMULH32(dest, src1, src2, round) do { \
723   - uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
724   - if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
725   - env->QF = 1; \
726   - tmp = (tmp >> 63) ^ ~SIGNBIT64; \
727   - } else { \
728   - tmp <<= 1; \
729   - } \
730   - if (round) { \
731   - int64_t old = tmp; \
732   - tmp += (int64_t)1 << 31; \
733   - if ((int64_t)tmp < old) { \
734   - env->QF = 1; \
735   - tmp = SIGNBIT64 - 1; \
736   - } \
737   - } \
738   - dest = tmp >> 32; \
739   - } while(0)
740   -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
741   -NEON_VOP(qdmulh_s32, neon_s32, 1)
742   -#undef NEON_FN
743   -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
744   -NEON_VOP(qrdmulh_s32, neon_s32, 1)
745   -#undef NEON_FN
746   -#undef NEON_QDMULH32
747   -
748   -/* Floating point comparisons produce an integer result. */
749   -#define NEON_VOP_FCMP(name, cmp) \
750   -NEON_OP(name) \
751   -{ \
752   - if (float32_compare_quiet(vfp_itos(T0), vfp_itos(T1), NFS) cmp 0) \
753   - T0 = -1; \
754   - else \
755   - T0 = 0; \
756   - FORCE_RET(); \
757   -}
758   -
759   -NEON_VOP_FCMP(ceq_f32, ==)
760   -NEON_VOP_FCMP(cge_f32, >=)
761   -NEON_VOP_FCMP(cgt_f32, >)
762   -
763   -NEON_OP(acge_f32)
764   -{
765   - float32 f0 = float32_abs(vfp_itos(T0));
766   - float32 f1 = float32_abs(vfp_itos(T1));
767   - T0 = (float32_compare_quiet(f0, f1,NFS) >= 0) ? -1 : 0;
768   - FORCE_RET();
769   -}
770   -
771   -NEON_OP(acgt_f32)
772   -{
773   - float32 f0 = float32_abs(vfp_itos(T0));
774   - float32 f1 = float32_abs(vfp_itos(T1));
775   - T0 = (float32_compare_quiet(f0, f1, NFS) > 0) ? -1 : 0;
776   - FORCE_RET();
777   -}
778   -
779   -/* Narrowing instructions. The named type is the destination type. */
780   -NEON_OP(narrow_u8)
781   -{
782   - T0 = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
783   - | ((T1 << 16) & 0xff0000) | (T1 << 24);
784   - FORCE_RET();
785   -}
786   -
787   -NEON_OP(narrow_sat_u8)
788   -{
789   - neon_u16 src;
790   - neon_u8 dest;
791   -#define SAT8(d, s) \
792   - if (s > 0xff) { \
793   - d = 0xff; \
794   - env->QF = 1; \
795   - } else { \
796   - d = s; \
797   - }
798   -
799   - NEON_UNPACK(neon_u16, src, T0);
800   - SAT8(dest.v1, src.v1);
801   - SAT8(dest.v2, src.v2);
802   - NEON_UNPACK(neon_u16, src, T1);
803   - SAT8(dest.v3, src.v1);
804   - SAT8(dest.v4, src.v2);
805   - NEON_PACK(neon_u8, T0, dest);
806   - FORCE_RET();
807   -#undef SAT8
808   -}
809   -
810   -NEON_OP(narrow_sat_s8)
811   -{
812   - neon_s16 src;
813   - neon_s8 dest;
814   -#define SAT8(d, s) \
815   - if (s != (uint8_t)s) { \
816   - d = (s >> 15) ^ 0x7f; \
817   - env->QF = 1; \
818   - } else { \
819   - d = s; \
820   - }
821   -
822   - NEON_UNPACK(neon_s16, src, T0);
823   - SAT8(dest.v1, src.v1);
824   - SAT8(dest.v2, src.v2);
825   - NEON_UNPACK(neon_s16, src, T1);
826   - SAT8(dest.v3, src.v1);
827   - SAT8(dest.v4, src.v2);
828   - NEON_PACK(neon_s8, T0, dest);
829   - FORCE_RET();
830   -#undef SAT8
831   -}
832   -
833   -NEON_OP(narrow_u16)
834   -{
835   - T0 = (T0 & 0xffff) | (T1 << 16);
836   -}
837   -
838   -NEON_OP(narrow_sat_u16)
839   -{
840   - if (T0 > 0xffff) {
841   - T0 = 0xffff;
842   - env->QF = 1;
843   - }
844   - if (T1 > 0xffff) {
845   - T1 = 0xffff;
846   - env->QF = 1;
847   - }
848   - T0 |= T1 << 16;
849   - FORCE_RET();
850   -}
851   -
852   -NEON_OP(narrow_sat_s16)
853   -{
854   - if ((int32_t)T0 != (int16_t)T0) {
855   - T0 = ((int32_t)T0 >> 31) ^ 0x7fff;
856   - env->QF = 1;
857   - }
858   - if ((int32_t)T1 != (int16_t) T1) {
859   - T1 = ((int32_t)T1 >> 31) ^ 0x7fff;
860   - env->QF = 1;
861   - }
862   - T0 = (uint16_t)T0 | (T1 << 16);
863   - FORCE_RET();
864   -}
865   -
866   -NEON_OP(narrow_sat_u32)
867   -{
868   - if (T1) {
869   - T0 = 0xffffffffu;
870   - env->QF = 1;
871   - }
872   - FORCE_RET();
873   -}
874   -
875   -NEON_OP(narrow_sat_s32)
876   -{
877   - int32_t sign = (int32_t)T1 >> 31;
878   -
879   - if ((int32_t)T1 != sign) {
880   - T0 = sign ^ 0x7fffffff;
881   - env->QF = 1;
882   - }
883   - FORCE_RET();
884   -}
885   -
886   -/* Narrowing instructions. Named type is the narrow type. */
887   -NEON_OP(narrow_high_u8)
888   -{
889   - T0 = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
890   - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
891   - FORCE_RET();
892   -}
893   -
894   -NEON_OP(narrow_high_u16)
895   -{
896   - T0 = (T0 >> 16) | (T1 & 0xffff0000);
897   - FORCE_RET();
898   -}
899   -
900   -NEON_OP(narrow_high_round_u8)
901   -{
902   - T0 = (((T0 + 0x80) >> 8) & 0xff) | (((T0 + 0x800000) >> 16) & 0xff00)
903   - | (((T1 + 0x80) << 8) & 0xff0000) | ((T1 + 0x800000) & 0xff000000);
904   - FORCE_RET();
905   -}
906   -
907   -NEON_OP(narrow_high_round_u16)
908   -{
909   - T0 = ((T0 + 0x8000) >> 16) | ((T1 + 0x8000) & 0xffff0000);
910   - FORCE_RET();
911   -}
912   -
913   -NEON_OP(narrow_high_round_u32)
914   -{
915   - if (T0 >= 0x80000000u)
916   - T0 = T1 + 1;
917   - else
918   - T0 = T1;
919   - FORCE_RET();
920   -}
921   -
922   -/* Widening instructions. Named type is source type. */
923   -NEON_OP(widen_s8)
924   -{
925   - uint32_t src;
926   -
927   - src = T0;
928   - T0 = (uint16_t)(int8_t)src | ((int8_t)(src >> 8) << 16);
929   - T1 = (uint16_t)(int8_t)(src >> 16) | ((int8_t)(src >> 24) << 16);
930   -}
931   -
932   -NEON_OP(widen_u8)
933   -{
934   - T1 = ((T0 >> 8) & 0xff0000) | ((T0 >> 16) & 0xff);
935   - T0 = ((T0 << 8) & 0xff0000) | (T0 & 0xff);
936   -}
937   -
938   -NEON_OP(widen_s16)
939   -{
940   - int32_t src;
941   -
942   - src = T0;
943   - T0 = (int16_t)src;
944   - T1 = src >> 16;
945   -}
946   -
947   -NEON_OP(widen_u16)
948   -{
949   - T1 = T0 >> 16;
950   - T0 &= 0xffff;
951   -}
952   -
953   -NEON_OP(widen_s32)
954   -{
955   - T1 = (int32_t)T0 >> 31;
956   - FORCE_RET();
957   -}
958   -
959   -NEON_OP(widen_high_u8)
960   -{
961   - T1 = (T0 & 0xff000000) | ((T0 >> 8) & 0xff00);
962   - T0 = ((T0 << 16) & 0xff000000) | ((T0 << 8) & 0xff00);
963   -}
964   -
965   -NEON_OP(widen_high_u16)
966   -{
967   - T1 = T0 & 0xffff0000;
968   - T0 <<= 16;
969   -}
970   -
971   -/* Long operations. The type is the wide type. */
972   -NEON_OP(shll_u16)
973   -{
974   - int shift = PARAM1;
975   - uint32_t mask;
976   -
977   - mask = 0xffff >> (16 - shift);
978   - mask |= mask << 16;
979   - mask = ~mask;
980   -
981   - T0 = (T0 << shift) & mask;
982   - T1 = (T1 << shift) & mask;
983   - FORCE_RET();
984   -}
985   -
986   -NEON_OP(shll_u64)
987   -{
988   - int shift = PARAM1;
989   -
990   - T1 <<= shift;
991   - T1 |= T0 >> (32 - shift);
992   - T0 <<= shift;
993   - FORCE_RET();
994   -}
995   -
996   -NEON_OP(addl_u16)
997   -{
998   - uint32_t tmp;
999   - uint32_t high;
1000   -
1001   - tmp = env->vfp.scratch[0];
1002   - high = (T0 >> 16) + (tmp >> 16);
1003   - T0 = (uint16_t)(T0 + tmp);
1004   - T0 |= (high << 16);
1005   - tmp = env->vfp.scratch[1];
1006   - high = (T1 >> 16) + (tmp >> 16);
1007   - T1 = (uint16_t)(T1 + tmp);
1008   - T1 |= (high << 16);
1009   - FORCE_RET();
1010   -}
1011   -
1012   -NEON_OP(addl_u32)
1013   -{
1014   - T0 += env->vfp.scratch[0];
1015   - T1 += env->vfp.scratch[1];
1016   - FORCE_RET();
1017   -}
1018   -
1019   -NEON_OP(addl_u64)
1020   -{
1021   - uint64_t tmp;
1022   - tmp = T0 | ((uint64_t)T1 << 32);
1023   - tmp += env->vfp.scratch[0];
1024   - tmp += (uint64_t)env->vfp.scratch[1] << 32;
1025   - T0 = tmp;
1026   - T1 = tmp >> 32;
1027   - FORCE_RET();
1028   -}
1029   -
1030   -NEON_OP(subl_u16)
1031   -{
1032   - uint32_t tmp;
1033   - uint32_t high;
1034   -
1035   - tmp = env->vfp.scratch[0];
1036   - high = (T0 >> 16) - (tmp >> 16);
1037   - T0 = (uint16_t)(T0 - tmp);
1038   - T0 |= (high << 16);
1039   - tmp = env->vfp.scratch[1];
1040   - high = (T1 >> 16) - (tmp >> 16);
1041   - T1 = (uint16_t)(T1 - tmp);
1042   - T1 |= (high << 16);
1043   - FORCE_RET();
1044   -}
1045   -
1046   -NEON_OP(subl_u32)
1047   -{
1048   - T0 -= env->vfp.scratch[0];
1049   - T1 -= env->vfp.scratch[1];
1050   - FORCE_RET();
1051   -}
1052   -
1053   -NEON_OP(subl_u64)
1054   -{
1055   - uint64_t tmp;
1056   - tmp = T0 | ((uint64_t)T1 << 32);
1057   - tmp -= env->vfp.scratch[0];
1058   - tmp -= (uint64_t)env->vfp.scratch[1] << 32;
1059   - T0 = tmp;
1060   - T1 = tmp >> 32;
1061   - FORCE_RET();
1062   -}
1063   -
1064   -#define DO_ABD(dest, x, y, type) do { \
1065   - type tmp_x = x; \
1066   - type tmp_y = y; \
1067   - dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
1068   - } while(0)
1069   -
1070   -NEON_OP(abdl_u16)
1071   -{
1072   - uint32_t tmp;
1073   - uint32_t low;
1074   - uint32_t high;
1075   -
1076   - DO_ABD(low, T0, T1, uint8_t);
1077   - DO_ABD(tmp, T0 >> 8, T1 >> 8, uint8_t);
1078   - low |= tmp << 16;
1079   - DO_ABD(high, T0 >> 16, T1 >> 16, uint8_t);
1080   - DO_ABD(tmp, T0 >> 24, T1 >> 24, uint8_t);
1081   - high |= tmp << 16;
1082   - T0 = low;
1083   - T1 = high;
1084   - FORCE_RET();
1085   -}
1086   -
1087   -NEON_OP(abdl_s16)
1088   -{
1089   - uint32_t tmp;
1090   - uint32_t low;
1091   - uint32_t high;
1092   -
1093   - DO_ABD(low, T0, T1, int8_t);
1094   - DO_ABD(tmp, T0 >> 8, T1 >> 8, int8_t);
1095   - low |= tmp << 16;
1096   - DO_ABD(high, T0 >> 16, T1 >> 16, int8_t);
1097   - DO_ABD(tmp, T0 >> 24, T1 >> 24, int8_t);
1098   - high |= tmp << 16;
1099   - T0 = low;
1100   - T1 = high;
1101   - FORCE_RET();
1102   -}
1103   -
1104   -NEON_OP(abdl_u32)
1105   -{
1106   - uint32_t low;
1107   - uint32_t high;
1108   -
1109   - DO_ABD(low, T0, T1, uint16_t);
1110   - DO_ABD(high, T0 >> 16, T1 >> 16, uint16_t);
1111   - T0 = low;
1112   - T1 = high;
1113   - FORCE_RET();
1114   -}
1115   -
1116   -NEON_OP(abdl_s32)
1117   -{
1118   - uint32_t low;
1119   - uint32_t high;
1120   -
1121   - DO_ABD(low, T0, T1, int16_t);
1122   - DO_ABD(high, T0 >> 16, T1 >> 16, int16_t);
1123   - T0 = low;
1124   - T1 = high;
1125   - FORCE_RET();
1126   -}
1127   -
1128   -NEON_OP(abdl_u64)
1129   -{
1130   - DO_ABD(T0, T0, T1, uint32_t);
1131   - T1 = 0;
1132   -}
1133   -
1134   -NEON_OP(abdl_s64)
1135   -{
1136   - DO_ABD(T0, T0, T1, int32_t);
1137   - T1 = 0;
1138   -}
1139   -#undef DO_ABD
1140   -
1141   -/* Widening multiple. Named type is the source type. */
1142   -#define DO_MULL(dest, x, y, type1, type2) do { \
1143   - type1 tmp_x = x; \
1144   - type1 tmp_y = y; \
1145   - dest = (type2)((type2)tmp_x * (type2)tmp_y); \
1146   - } while(0)
1147   -
1148   -NEON_OP(mull_u8)
1149   -{
1150   - uint32_t tmp;
1151   - uint32_t low;
1152   - uint32_t high;
1153   -
1154   - DO_MULL(low, T0, T1, uint8_t, uint16_t);
1155   - DO_MULL(tmp, T0 >> 8, T1 >> 8, uint8_t, uint16_t);
1156   - low |= tmp << 16;
1157   - DO_MULL(high, T0 >> 16, T1 >> 16, uint8_t, uint16_t);
1158   - DO_MULL(tmp, T0 >> 24, T1 >> 24, uint8_t, uint16_t);
1159   - high |= tmp << 16;
1160   - T0 = low;
1161   - T1 = high;
1162   - FORCE_RET();
1163   -}
1164   -
1165   -NEON_OP(mull_s8)
1166   -{
1167   - uint32_t tmp;
1168   - uint32_t low;
1169   - uint32_t high;
1170   -
1171   - DO_MULL(low, T0, T1, int8_t, uint16_t);
1172   - DO_MULL(tmp, T0 >> 8, T1 >> 8, int8_t, uint16_t);
1173   - low |= tmp << 16;
1174   - DO_MULL(high, T0 >> 16, T1 >> 16, int8_t, uint16_t);
1175   - DO_MULL(tmp, T0 >> 24, T1 >> 24, int8_t, uint16_t);
1176   - high |= tmp << 16;
1177   - T0 = low;
1178   - T1 = high;
1179   - FORCE_RET();
1180   -}
1181   -
1182   -NEON_OP(mull_u16)
1183   -{
1184   - uint32_t low;
1185   - uint32_t high;
1186   -
1187   - DO_MULL(low, T0, T1, uint16_t, uint32_t);
1188   - DO_MULL(high, T0 >> 16, T1 >> 16, uint16_t, uint32_t);
1189   - T0 = low;
1190   - T1 = high;
1191   - FORCE_RET();
1192   -}
1193   -
1194   -NEON_OP(mull_s16)
1195   -{
1196   - uint32_t low;
1197   - uint32_t high;
1198   -
1199   - DO_MULL(low, T0, T1, int16_t, uint32_t);
1200   - DO_MULL(high, T0 >> 16, T1 >> 16, int16_t, uint32_t);
1201   - T0 = low;
1202   - T1 = high;
1203   - FORCE_RET();
1204   -}
1205   -
1206   -NEON_OP(addl_saturate_s32)
1207   -{
1208   - uint32_t tmp;
1209   - uint32_t res;
1210   -
1211   - tmp = env->vfp.scratch[0];
1212   - res = T0 + tmp;
1213   - if (((res ^ T0) & SIGNBIT) && !((T0 ^ tmp) & SIGNBIT)) {
1214   - env->QF = 1;
1215   - T0 = (T0 >> 31) ^ 0x7fffffff;
1216   - } else {
1217   - T0 = res;
1218   - }
1219   - tmp = env->vfp.scratch[1];
1220   - res = T1 + tmp;
1221   - if (((res ^ T1) & SIGNBIT) && !((T1 ^ tmp) & SIGNBIT)) {
1222   - env->QF = 1;
1223   - T1 = (T1 >> 31) ^ 0x7fffffff;
1224   - } else {
1225   - T1 = res;
1226   - }
1227   - FORCE_RET();
1228   -}
1229   -
1230   -NEON_OP(addl_saturate_s64)
1231   -{
1232   - uint64_t src1;
1233   - uint64_t src2;
1234   - uint64_t res;
1235   -
1236   - src1 = T0 + ((uint64_t)T1 << 32);
1237   - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1238   - res = src1 + src2;
1239   - if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
1240   - env->QF = 1;
1241   - T0 = ~(int64_t)src1 >> 63;
1242   - T1 = T0 ^ 0x80000000;
1243   - } else {
1244   - T0 = res;
1245   - T1 = res >> 32;
1246   - }
1247   - FORCE_RET();
1248   -}
1249   -
1250   -NEON_OP(addl_saturate_u64)
1251   -{
1252   - uint64_t src1;
1253   - uint64_t src2;
1254   - uint64_t res;
1255   -
1256   - src1 = T0 + ((uint64_t)T1 << 32);
1257   - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1258   - res = src1 + src2;
1259   - if (res < src1) {
1260   - env->QF = 1;
1261   - T0 = 0xffffffff;
1262   - T1 = 0xffffffff;
1263   - } else {
1264   - T0 = res;
1265   - T1 = res >> 32;
1266   - }
1267   - FORCE_RET();
1268   -}
1269   -
1270   -NEON_OP(subl_saturate_s64)
1271   -{
1272   - uint64_t src1;
1273   - uint64_t src2;
1274   - uint64_t res;
1275   -
1276   - src1 = T0 + ((uint64_t)T1 << 32);
1277   - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1278   - res = src1 - src2;
1279   - if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
1280   - env->QF = 1;
1281   - T0 = ~(int64_t)src1 >> 63;
1282   - T1 = T0 ^ 0x80000000;
1283   - } else {
1284   - T0 = res;
1285   - T1 = res >> 32;
1286   - }
1287   - FORCE_RET();
1288   -}
1289   -
1290   -NEON_OP(subl_saturate_u64)
1291   -{
1292   - uint64_t src1;
1293   - uint64_t src2;
1294   - uint64_t res;
1295   -
1296   - src1 = T0 + ((uint64_t)T1 << 32);
1297   - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1298   - if (src1 < src2) {
1299   - env->QF = 1;
1300   - T0 = 0;
1301   - T1 = 0;
1302   - } else {
1303   - res = src1 - src2;
1304   - T0 = res;
1305   - T1 = res >> 32;
1306   - }
1307   - FORCE_RET();
1308   -}
1309   -
1310   -NEON_OP(negl_u16)
1311   -{
1312   - uint32_t tmp;
1313   - tmp = T0 >> 16;
1314   - tmp = -tmp;
1315   - T0 = (-T0 & 0xffff) | (tmp << 16);
1316   - tmp = T1 >> 16;
1317   - tmp = -tmp;
1318   - T1 = (-T1 & 0xffff) | (tmp << 16);
1319   - FORCE_RET();
1320   -}
1321   -
1322   -NEON_OP(negl_u32)
1323   -{
1324   - T0 = -T0;
1325   - T1 = -T1;
1326   - FORCE_RET();
1327   -}
1328   -
1329   -NEON_OP(negl_u64)
1330   -{
1331   - uint64_t val;
1332   -
1333   - val = T0 | ((uint64_t)T1 << 32);
1334   - val = -val;
1335   - T0 = val;
1336   - T1 = val >> 32;
1337   - FORCE_RET();
1338   -}
1339   -
1340   -/* Scalar operations. */
1341   -NEON_OP(dup_low16)
1342   -{
1343   - T0 = (T0 & 0xffff) | (T0 << 16);
1344   - FORCE_RET();
1345   -}
1346   -
1347   -NEON_OP(dup_high16)
1348   -{
1349   - T0 = (T0 >> 16) | (T0 & 0xffff0000);
1350   - FORCE_RET();
1351   -}
1352   -
1353   -/* Helper for VEXT */
1354   -NEON_OP(extract)
1355   -{
1356   - int shift = PARAM1;
1357   - T0 = (T0 >> shift) | (T1 << (32 - shift));
1358   - FORCE_RET();
1359   -}
1360   -
1361   -/* Pairwise add long. Named type is source type. */
1362   -NEON_OP(paddl_s8)
1363   -{
1364   - int8_t src1;
1365   - int8_t src2;
1366   - uint16_t result;
1367   - src1 = T0 >> 24;
1368   - src2 = T0 >> 16;
1369   - result = (uint16_t)src1 + src2;
1370   - src1 = T0 >> 8;
1371   - src2 = T0;
1372   - T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
1373   - FORCE_RET();
1374   -}
1375   -
1376   -NEON_OP(paddl_u8)
1377   -{
1378   - uint8_t src1;
1379   - uint8_t src2;
1380   - uint16_t result;
1381   - src1 = T0 >> 24;
1382   - src2 = T0 >> 16;
1383   - result = (uint16_t)src1 + src2;
1384   - src1 = T0 >> 8;
1385   - src2 = T0;
1386   - T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
1387   - FORCE_RET();
1388   -}
1389   -
1390   -NEON_OP(paddl_s16)
1391   -{
1392   - T0 = (uint32_t)(int16_t)T0 + (uint32_t)(int16_t)(T0 >> 16);
1393   - FORCE_RET();
1394   -}
1395   -
1396   -NEON_OP(paddl_u16)
1397   -{
1398   - T0 = (uint32_t)(uint16_t)T0 + (uint32_t)(uint16_t)(T0 >> 16);
1399   - FORCE_RET();
1400   -}
1401   -
1402   -NEON_OP(paddl_s32)
1403   -{
1404   - int64_t tmp;
1405   - tmp = (int64_t)(int32_t)T0 + (int64_t)(int32_t)T1;
1406   - T0 = tmp;
1407   - T1 = tmp >> 32;
1408   - FORCE_RET();
1409   -}
1410   -
1411   -NEON_OP(paddl_u32)
1412   -{
1413   - uint64_t tmp;
1414   - tmp = (uint64_t)T0 + (uint64_t)T1;
1415   - T0 = tmp;
1416   - T1 = tmp >> 32;
1417   - FORCE_RET();
1418   -}
1419   -
1420   -/* Count Leading Sign/Zero Bits. */
1421   -static inline int do_clz8(uint8_t x)
1422   -{
1423   - int n;
1424   - for (n = 8; x; n--)
1425   - x >>= 1;
1426   - return n;
1427   -}
1428   -
1429   -static inline int do_clz16(uint16_t x)
1430   -{
1431   - int n;
1432   - for (n = 16; x; n--)
1433   - x >>= 1;
1434   - return n;
1435   -}
1436   -
1437   -NEON_OP(clz_u8)
1438   -{
1439   - uint32_t result;
1440   - uint32_t tmp;
1441   -
1442   - tmp = T0;
1443   - result = do_clz8(tmp);
1444   - result |= do_clz8(tmp >> 8) << 8;
1445   - result |= do_clz8(tmp >> 16) << 16;
1446   - result |= do_clz8(tmp >> 24) << 24;
1447   - T0 = result;
1448   - FORCE_RET();
1449   -}
1450   -
1451   -NEON_OP(clz_u16)
1452   -{
1453   - uint32_t result;
1454   - uint32_t tmp;
1455   - tmp = T0;
1456   - result = do_clz16(tmp);
1457   - result |= do_clz16(tmp >> 16) << 16;
1458   - T0 = result;
1459   - FORCE_RET();
1460   -}
1461   -
1462   -NEON_OP(cls_s8)
1463   -{
1464   - uint32_t result;
1465   - int8_t tmp;
1466   - tmp = T0;
1467   - result = do_clz8((tmp < 0) ? ~tmp : tmp) - 1;
1468   - tmp = T0 >> 8;
1469   - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 8;
1470   - tmp = T0 >> 16;
1471   - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1472   - tmp = T0 >> 24;
1473   - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 24;
1474   - T0 = result;
1475   - FORCE_RET();
1476   -}
1477   -
1478   -NEON_OP(cls_s16)
1479   -{
1480   - uint32_t result;
1481   - int16_t tmp;
1482   - tmp = T0;
1483   - result = do_clz16((tmp < 0) ? ~tmp : tmp) - 1;
1484   - tmp = T0 >> 16;
1485   - result |= (do_clz16((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1486   - T0 = result;
1487   - FORCE_RET();
1488   -}
1489   -
1490   -NEON_OP(cls_s32)
1491   -{
1492   - int count;
1493   - if ((int32_t)T0 < 0)
1494   - T0 = ~T0;
1495   - for (count = 32; T0 > 0; count--)
1496   - T0 = T0 >> 1;
1497   - T0 = count - 1;
1498   - FORCE_RET();
1499   -}
1500   -
1501   -/* Bit count. */
1502   -NEON_OP(cnt_u8)
1503   -{
1504   - T0 = (T0 & 0x55555555) + ((T0 >> 1) & 0x55555555);
1505   - T0 = (T0 & 0x33333333) + ((T0 >> 2) & 0x33333333);
1506   - T0 = (T0 & 0x0f0f0f0f) + ((T0 >> 4) & 0x0f0f0f0f);
1507   - FORCE_RET();
1508   -}
1509   -
1510   -/* Saturating negation. */
1511   -/* ??? Make these use NEON_VOP1 */
1512   -#define DO_QABS8(x) do { \
1513   - if (x == (int8_t)0x80) { \
1514   - x = 0x7f; \
1515   - env->QF = 1; \
1516   - } else if (x < 0) { \
1517   - x = -x; \
1518   - }} while (0)
1519   -NEON_OP(qabs_s8)
1520   -{
1521   - neon_s8 vec;
1522   - NEON_UNPACK(neon_s8, vec, T0);
1523   - DO_QABS8(vec.v1);
1524   - DO_QABS8(vec.v2);
1525   - DO_QABS8(vec.v3);
1526   - DO_QABS8(vec.v4);
1527   - NEON_PACK(neon_s8, T0, vec);
1528   - FORCE_RET();
1529   -}
1530   -#undef DO_QABS8
1531   -
1532   -#define DO_QNEG8(x) do { \
1533   - if (x == (int8_t)0x80) { \
1534   - x = 0x7f; \
1535   - env->QF = 1; \
1536   - } else { \
1537   - x = -x; \
1538   - }} while (0)
1539   -NEON_OP(qneg_s8)
1540   -{
1541   - neon_s8 vec;
1542   - NEON_UNPACK(neon_s8, vec, T0);
1543   - DO_QNEG8(vec.v1);
1544   - DO_QNEG8(vec.v2);
1545   - DO_QNEG8(vec.v3);
1546   - DO_QNEG8(vec.v4);
1547   - NEON_PACK(neon_s8, T0, vec);
1548   - FORCE_RET();
1549   -}
1550   -#undef DO_QNEG8
1551   -
1552   -#define DO_QABS16(x) do { \
1553   - if (x == (int16_t)0x8000) { \
1554   - x = 0x7fff; \
1555   - env->QF = 1; \
1556   - } else if (x < 0) { \
1557   - x = -x; \
1558   - }} while (0)
1559   -NEON_OP(qabs_s16)
1560   -{
1561   - neon_s16 vec;
1562   - NEON_UNPACK(neon_s16, vec, T0);
1563   - DO_QABS16(vec.v1);
1564   - DO_QABS16(vec.v2);
1565   - NEON_PACK(neon_s16, T0, vec);
1566   - FORCE_RET();
1567   -}
1568   -#undef DO_QABS16
1569   -
1570   -#define DO_QNEG16(x) do { \
1571   - if (x == (int16_t)0x8000) { \
1572   - x = 0x7fff; \
1573   - env->QF = 1; \
1574   - } else { \
1575   - x = -x; \
1576   - }} while (0)
1577   -NEON_OP(qneg_s16)
1578   -{
1579   - neon_s16 vec;
1580   - NEON_UNPACK(neon_s16, vec, T0);
1581   - DO_QNEG16(vec.v1);
1582   - DO_QNEG16(vec.v2);
1583   - NEON_PACK(neon_s16, T0, vec);
1584   - FORCE_RET();
1585   -}
1586   -#undef DO_QNEG16
1587   -
1588   -NEON_OP(qabs_s32)
1589   -{
1590   - if (T0 == 0x80000000) {
1591   - T0 = 0x7fffffff;
1592   - env->QF = 1;
1593   - } else if ((int32_t)T0 < 0) {
1594   - T0 = -T0;
1595   - }
1596   - FORCE_RET();
1597   -}
1598   -
1599   -NEON_OP(qneg_s32)
1600   -{
1601   - if (T0 == 0x80000000) {
1602   - T0 = 0x7fffffff;
1603   - env->QF = 1;
1604   - } else {
1605   - T0 = -T0;
1606   - }
1607   - FORCE_RET();
1608   -}
1609   -
1610   -/* Unary operations */
1611   -#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
1612   -NEON_VOP1(abs_s8, neon_s8, 4)
1613   -NEON_VOP1(abs_s16, neon_s16, 2)
1614   -NEON_OP(abs_s32)
1615   -{
1616   - if ((int32_t)T0 < 0)
1617   - T0 = -T0;
1618   - FORCE_RET();
1619   -}
1620   -#undef NEON_FN
1621   -
1622   -/* Transpose. Argument order is rather strange to avoid special casing
1623   - the translation code.
1624   - On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */
1625   -NEON_OP(trn_u8)
1626   -{
1627   - uint32_t rd;
1628   - uint32_t rm;
1629   - rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
1630   - rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
1631   - T0 = rd;
1632   - T1 = rm;
1633   - FORCE_RET();
1634   -}
1635   -
1636   -NEON_OP(trn_u16)
1637   -{
1638   - uint32_t rd;
1639   - uint32_t rm;
1640   - rd = (T0 << 16) | (T1 & 0xffff);
1641   - rm = (T1 >> 16) | (T0 & 0xffff0000);
1642   - T0 = rd;
1643   - T1 = rm;
1644   - FORCE_RET();
1645   -}
1646   -
1647   -/* Worker routines for zip and unzip. */
1648   -NEON_OP(unzip_u8)
1649   -{
1650   - uint32_t rd;
1651   - uint32_t rm;
1652   - rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
1653   - | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
1654   - rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
1655   - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
1656   - T0 = rd;
1657   - T1 = rm;
1658   - FORCE_RET();
1659   -}
1660   -
1661   -NEON_OP(zip_u8)
1662   -{
1663   - uint32_t rd;
1664   - uint32_t rm;
1665   - rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
1666   - | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
1667   - rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
1668   - | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
1669   - T0 = rd;
1670   - T1 = rm;
1671   - FORCE_RET();
1672   -}
1673   -
1674   -NEON_OP(zip_u16)
1675   -{
1676   - uint32_t tmp;
1677   -
1678   - tmp = (T0 & 0xffff) | (T1 << 16);
1679   - T1 = (T1 & 0xffff0000) | (T0 >> 16);
1680   - T0 = tmp;
1681   - FORCE_RET();
1682   -}
1683   -
1684   -NEON_OP(dup_u8)
1685   -{
1686   - T0 = (T0 >> PARAM1) & 0xff;
1687   - T0 |= T0 << 8;
1688   - T0 |= T0 << 16;
1689   - FORCE_RET();
1690   -}