Commit ad69471ce5e1284e1cacd053bb0fe8d6175a2f9e

Authored by pbrook
1 parent 8f8e3aa4

ARM TCG conversion 14/16.

git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@4151 c046a42c-6fe2-441c-8c8c-71466251a162
Makefile.target
... ... @@ -211,7 +211,7 @@ LIBOBJS+= op_helper.o helper.o
211 211 endif
212 212  
213 213 ifeq ($(TARGET_BASE_ARCH), arm)
214   -LIBOBJS+= op_helper.o helper.o
  214 +LIBOBJS+= op_helper.o helper.o neon_helper.o
215 215 endif
216 216  
217 217 ifeq ($(TARGET_BASE_ARCH), sh4)
... ...
target-arm/helper.c
... ... @@ -256,30 +256,6 @@ void cpu_arm_close(CPUARMState *env)
256 256 free(env);
257 257 }
258 258  
259   -/* Polynomial multiplication is like integer multiplcation except the
260   - partial products are XORed, not added. */
261   -uint32_t helper_neon_mul_p8(uint32_t op1, uint32_t op2)
262   -{
263   - uint32_t mask;
264   - uint32_t result;
265   - result = 0;
266   - while (op1) {
267   - mask = 0;
268   - if (op1 & 1)
269   - mask |= 0xff;
270   - if (op1 & (1 << 8))
271   - mask |= (0xff << 8);
272   - if (op1 & (1 << 16))
273   - mask |= (0xff << 16);
274   - if (op1 & (1 << 24))
275   - mask |= (0xff << 24);
276   - result ^= op2 & mask;
277   - op1 = (op1 >> 1) & 0x7f7f7f7f;
278   - op2 = (op2 << 1) & 0xfefefefe;
279   - }
280   - return result;
281   -}
282   -
283 259 uint32_t cpsr_read(CPUARMState *env)
284 260 {
285 261 int ZF;
... ... @@ -376,6 +352,11 @@ uint32_t HELPER(rbit)(uint32_t x)
376 352 return x;
377 353 }
378 354  
  355 +uint32_t HELPER(abs)(uint32_t x)
  356 +{
  357 + return ((int32_t)x < 0) ? -x : x;
  358 +}
  359 +
379 360 #if defined(CONFIG_USER_ONLY)
380 361  
381 362 void do_interrupt (CPUState *env)
... ...
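The new HELPER(abs) computes a two's-complement absolute value, so INT32_MIN maps to itself rather than trapping, matching the ARM instruction it backs. A standalone sketch of the same logic (illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* mirrors HELPER(abs): negate when the value is negative as int32_t */
    static uint32_t abs32(uint32_t x)
    {
        return ((int32_t)x < 0) ? -x : x;
    }

    int main(void)
    {
        assert(abs32(0xffffffffu) == 1u);          /* -1 -> 1 */
        assert(abs32(0x80000000u) == 0x80000000u); /* INT32_MIN is unchanged */
        return 0;
    }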
target-arm/helpers.h
... ... @@ -84,6 +84,7 @@ DEF_HELPER_1_1(double_saturate, uint32_t, (int32_t))
84 84 DEF_HELPER_1_2(sdiv, int32_t, (int32_t, int32_t))
85 85 DEF_HELPER_1_2(udiv, uint32_t, (uint32_t, uint32_t))
86 86 DEF_HELPER_1_1(rbit, uint32_t, (uint32_t))
  87 +DEF_HELPER_1_1(abs, uint32_t, (uint32_t))
87 88  
88 89 #define PAS_OP(pfx) \
89 90 DEF_HELPER_1_3(pfx ## add8, uint32_t, (uint32_t, uint32_t, uint32_t *)) \
... ... @@ -208,6 +209,10 @@ DEF_HELPER_1_2(rsqrte_f32, float32, (float32, CPUState *))
208 209 DEF_HELPER_1_2(recpe_u32, uint32_t, (uint32_t, CPUState *))
209 210 DEF_HELPER_1_2(rsqrte_u32, uint32_t, (uint32_t, CPUState *))
210 211 DEF_HELPER_1_4(neon_tbl, uint32_t, (uint32_t, uint32_t, uint32_t, uint32_t))
  212 +DEF_HELPER_1_2(neon_add_saturate_u64, uint64_t, (uint64_t, uint64_t))
  213 +DEF_HELPER_1_2(neon_add_saturate_s64, uint64_t, (uint64_t, uint64_t))
  214 +DEF_HELPER_1_2(neon_sub_saturate_u64, uint64_t, (uint64_t, uint64_t))
  215 +DEF_HELPER_1_2(neon_sub_saturate_s64, uint64_t, (uint64_t, uint64_t))
211 216  
212 217 DEF_HELPER_1_2(add_cc, uint32_t, (uint32_t, uint32_t))
213 218 DEF_HELPER_1_2(adc_cc, uint32_t, (uint32_t, uint32_t))
... ... @@ -223,6 +228,209 @@ DEF_HELPER_1_2(shr_cc, uint32_t, (uint32_t, uint32_t))
223 228 DEF_HELPER_1_2(sar_cc, uint32_t, (uint32_t, uint32_t))
224 229 DEF_HELPER_1_2(ror_cc, uint32_t, (uint32_t, uint32_t))
225 230  
  231 +/* neon_helper.c */
  232 +DEF_HELPER_1_3(neon_qadd_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  233 +DEF_HELPER_1_3(neon_qadd_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  234 +DEF_HELPER_1_3(neon_qadd_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  235 +DEF_HELPER_1_3(neon_qadd_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  236 +DEF_HELPER_1_3(neon_qsub_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  237 +DEF_HELPER_1_3(neon_qsub_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  238 +DEF_HELPER_1_3(neon_qsub_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  239 +DEF_HELPER_1_3(neon_qsub_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  240 +
  241 +DEF_HELPER_1_2(neon_hadd_s8, uint32_t, (uint32_t, uint32_t))
  242 +DEF_HELPER_1_2(neon_hadd_u8, uint32_t, (uint32_t, uint32_t))
  243 +DEF_HELPER_1_2(neon_hadd_s16, uint32_t, (uint32_t, uint32_t))
  244 +DEF_HELPER_1_2(neon_hadd_u16, uint32_t, (uint32_t, uint32_t))
  245 +DEF_HELPER_1_2(neon_hadd_s32, int32_t, (int32_t, int32_t))
  246 +DEF_HELPER_1_2(neon_hadd_u32, uint32_t, (uint32_t, uint32_t))
  247 +DEF_HELPER_1_2(neon_rhadd_s8, uint32_t, (uint32_t, uint32_t))
  248 +DEF_HELPER_1_2(neon_rhadd_u8, uint32_t, (uint32_t, uint32_t))
  249 +DEF_HELPER_1_2(neon_rhadd_s16, uint32_t, (uint32_t, uint32_t))
  250 +DEF_HELPER_1_2(neon_rhadd_u16, uint32_t, (uint32_t, uint32_t))
  251 +DEF_HELPER_1_2(neon_rhadd_s32, int32_t, (int32_t, int32_t))
  252 +DEF_HELPER_1_2(neon_rhadd_u32, uint32_t, (uint32_t, uint32_t))
  253 +DEF_HELPER_1_2(neon_hsub_s8, uint32_t, (uint32_t, uint32_t))
  254 +DEF_HELPER_1_2(neon_hsub_u8, uint32_t, (uint32_t, uint32_t))
  255 +DEF_HELPER_1_2(neon_hsub_s16, uint32_t, (uint32_t, uint32_t))
  256 +DEF_HELPER_1_2(neon_hsub_u16, uint32_t, (uint32_t, uint32_t))
  257 +DEF_HELPER_1_2(neon_hsub_s32, int32_t, (int32_t, int32_t))
  258 +DEF_HELPER_1_2(neon_hsub_u32, uint32_t, (uint32_t, uint32_t))
  259 +
  260 +DEF_HELPER_1_2(neon_cgt_u8, uint32_t, (uint32_t, uint32_t))
  261 +DEF_HELPER_1_2(neon_cgt_s8, uint32_t, (uint32_t, uint32_t))
  262 +DEF_HELPER_1_2(neon_cgt_u16, uint32_t, (uint32_t, uint32_t))
  263 +DEF_HELPER_1_2(neon_cgt_s16, uint32_t, (uint32_t, uint32_t))
  264 +DEF_HELPER_1_2(neon_cgt_u32, uint32_t, (uint32_t, uint32_t))
  265 +DEF_HELPER_1_2(neon_cgt_s32, uint32_t, (uint32_t, uint32_t))
  266 +DEF_HELPER_1_2(neon_cge_u8, uint32_t, (uint32_t, uint32_t))
  267 +DEF_HELPER_1_2(neon_cge_s8, uint32_t, (uint32_t, uint32_t))
  268 +DEF_HELPER_1_2(neon_cge_u16, uint32_t, (uint32_t, uint32_t))
  269 +DEF_HELPER_1_2(neon_cge_s16, uint32_t, (uint32_t, uint32_t))
  270 +DEF_HELPER_1_2(neon_cge_u32, uint32_t, (uint32_t, uint32_t))
  271 +DEF_HELPER_1_2(neon_cge_s32, uint32_t, (uint32_t, uint32_t))
  272 +
  273 +DEF_HELPER_1_2(neon_min_u8, uint32_t, (uint32_t, uint32_t))
  274 +DEF_HELPER_1_2(neon_min_s8, uint32_t, (uint32_t, uint32_t))
  275 +DEF_HELPER_1_2(neon_min_u16, uint32_t, (uint32_t, uint32_t))
  276 +DEF_HELPER_1_2(neon_min_s16, uint32_t, (uint32_t, uint32_t))
  277 +DEF_HELPER_1_2(neon_min_u32, uint32_t, (uint32_t, uint32_t))
  278 +DEF_HELPER_1_2(neon_min_s32, uint32_t, (uint32_t, uint32_t))
  279 +DEF_HELPER_1_2(neon_max_u8, uint32_t, (uint32_t, uint32_t))
  280 +DEF_HELPER_1_2(neon_max_s8, uint32_t, (uint32_t, uint32_t))
  281 +DEF_HELPER_1_2(neon_max_u16, uint32_t, (uint32_t, uint32_t))
  282 +DEF_HELPER_1_2(neon_max_s16, uint32_t, (uint32_t, uint32_t))
  283 +DEF_HELPER_1_2(neon_max_u32, uint32_t, (uint32_t, uint32_t))
  284 +DEF_HELPER_1_2(neon_max_s32, uint32_t, (uint32_t, uint32_t))
  285 +DEF_HELPER_1_2(neon_pmin_u8, uint32_t, (uint32_t, uint32_t))
  286 +DEF_HELPER_1_2(neon_pmin_s8, uint32_t, (uint32_t, uint32_t))
  287 +DEF_HELPER_1_2(neon_pmin_u16, uint32_t, (uint32_t, uint32_t))
  288 +DEF_HELPER_1_2(neon_pmin_s16, uint32_t, (uint32_t, uint32_t))
  289 +DEF_HELPER_1_2(neon_pmin_u32, uint32_t, (uint32_t, uint32_t))
  290 +DEF_HELPER_1_2(neon_pmin_s32, uint32_t, (uint32_t, uint32_t))
  291 +DEF_HELPER_1_2(neon_pmax_u8, uint32_t, (uint32_t, uint32_t))
  292 +DEF_HELPER_1_2(neon_pmax_s8, uint32_t, (uint32_t, uint32_t))
  293 +DEF_HELPER_1_2(neon_pmax_u16, uint32_t, (uint32_t, uint32_t))
  294 +DEF_HELPER_1_2(neon_pmax_s16, uint32_t, (uint32_t, uint32_t))
  295 +DEF_HELPER_1_2(neon_pmax_u32, uint32_t, (uint32_t, uint32_t))
  296 +DEF_HELPER_1_2(neon_pmax_s32, uint32_t, (uint32_t, uint32_t))
  297 +
  298 +DEF_HELPER_1_2(neon_abd_u8, uint32_t, (uint32_t, uint32_t))
  299 +DEF_HELPER_1_2(neon_abd_s8, uint32_t, (uint32_t, uint32_t))
  300 +DEF_HELPER_1_2(neon_abd_u16, uint32_t, (uint32_t, uint32_t))
  301 +DEF_HELPER_1_2(neon_abd_s16, uint32_t, (uint32_t, uint32_t))
  302 +DEF_HELPER_1_2(neon_abd_u32, uint32_t, (uint32_t, uint32_t))
  303 +DEF_HELPER_1_2(neon_abd_s32, uint32_t, (uint32_t, uint32_t))
  304 +
  305 +DEF_HELPER_1_2(neon_shl_u8, uint32_t, (uint32_t, uint32_t))
  306 +DEF_HELPER_1_2(neon_shl_s8, uint32_t, (uint32_t, uint32_t))
  307 +DEF_HELPER_1_2(neon_shl_u16, uint32_t, (uint32_t, uint32_t))
  308 +DEF_HELPER_1_2(neon_shl_s16, uint32_t, (uint32_t, uint32_t))
  309 +DEF_HELPER_1_2(neon_shl_u32, uint32_t, (uint32_t, uint32_t))
  310 +DEF_HELPER_1_2(neon_shl_s32, uint32_t, (uint32_t, uint32_t))
  311 +DEF_HELPER_1_2(neon_shl_u64, uint64_t, (uint64_t, uint64_t))
  312 +DEF_HELPER_1_2(neon_shl_s64, uint64_t, (uint64_t, uint64_t))
  313 +DEF_HELPER_1_2(neon_rshl_u8, uint32_t, (uint32_t, uint32_t))
  314 +DEF_HELPER_1_2(neon_rshl_s8, uint32_t, (uint32_t, uint32_t))
  315 +DEF_HELPER_1_2(neon_rshl_u16, uint32_t, (uint32_t, uint32_t))
  316 +DEF_HELPER_1_2(neon_rshl_s16, uint32_t, (uint32_t, uint32_t))
  317 +DEF_HELPER_1_2(neon_rshl_u32, uint32_t, (uint32_t, uint32_t))
  318 +DEF_HELPER_1_2(neon_rshl_s32, uint32_t, (uint32_t, uint32_t))
  319 +DEF_HELPER_1_2(neon_rshl_u64, uint64_t, (uint64_t, uint64_t))
  320 +DEF_HELPER_1_2(neon_rshl_s64, uint64_t, (uint64_t, uint64_t))
  321 +DEF_HELPER_1_3(neon_qshl_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  322 +DEF_HELPER_1_3(neon_qshl_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  323 +DEF_HELPER_1_3(neon_qshl_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  324 +DEF_HELPER_1_3(neon_qshl_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  325 +DEF_HELPER_1_3(neon_qshl_u32, uint32_t, (CPUState *, uint32_t, uint32_t))
  326 +DEF_HELPER_1_3(neon_qshl_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  327 +DEF_HELPER_1_3(neon_qshl_u64, uint64_t, (CPUState *, uint64_t, uint64_t))
  328 +DEF_HELPER_1_3(neon_qshl_s64, uint64_t, (CPUState *, uint64_t, uint64_t))
  329 +DEF_HELPER_1_3(neon_qrshl_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  330 +DEF_HELPER_1_3(neon_qrshl_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  331 +DEF_HELPER_1_3(neon_qrshl_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  332 +DEF_HELPER_1_3(neon_qrshl_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  333 +DEF_HELPER_1_3(neon_qrshl_u32, uint32_t, (CPUState *, uint32_t, uint32_t))
  334 +DEF_HELPER_1_3(neon_qrshl_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  335 +DEF_HELPER_1_3(neon_qrshl_u64, uint64_t, (CPUState *, uint64_t, uint64_t))
  336 +DEF_HELPER_1_3(neon_qrshl_s64, uint64_t, (CPUState *, uint64_t, uint64_t))
  337 +
  338 +DEF_HELPER_1_2(neon_add_u8, uint32_t, (uint32_t, uint32_t))
  339 +DEF_HELPER_1_2(neon_add_u16, uint32_t, (uint32_t, uint32_t))
  340 +DEF_HELPER_1_2(neon_padd_u8, uint32_t, (uint32_t, uint32_t))
  341 +DEF_HELPER_1_2(neon_padd_u16, uint32_t, (uint32_t, uint32_t))
  342 +DEF_HELPER_1_2(neon_sub_u8, uint32_t, (uint32_t, uint32_t))
  343 +DEF_HELPER_1_2(neon_sub_u16, uint32_t, (uint32_t, uint32_t))
  344 +DEF_HELPER_1_2(neon_mul_u8, uint32_t, (uint32_t, uint32_t))
  345 +DEF_HELPER_1_2(neon_mul_u16, uint32_t, (uint32_t, uint32_t))
  346 +DEF_HELPER_1_2(neon_mul_p8, uint32_t, (uint32_t, uint32_t))
  347 +
  348 +DEF_HELPER_1_2(neon_tst_u8, uint32_t, (uint32_t, uint32_t))
  349 +DEF_HELPER_1_2(neon_tst_u16, uint32_t, (uint32_t, uint32_t))
  350 +DEF_HELPER_1_2(neon_tst_u32, uint32_t, (uint32_t, uint32_t))
  351 +DEF_HELPER_1_2(neon_ceq_u8, uint32_t, (uint32_t, uint32_t))
  352 +DEF_HELPER_1_2(neon_ceq_u16, uint32_t, (uint32_t, uint32_t))
  353 +DEF_HELPER_1_2(neon_ceq_u32, uint32_t, (uint32_t, uint32_t))
  354 +
  355 +DEF_HELPER_1_1(neon_abs_s8, uint32_t, (uint32_t))
  356 +DEF_HELPER_1_1(neon_abs_s16, uint32_t, (uint32_t))
  357 +DEF_HELPER_1_1(neon_clz_u8, uint32_t, (uint32_t))
  358 +DEF_HELPER_1_1(neon_clz_u16, uint32_t, (uint32_t))
  359 +DEF_HELPER_1_1(neon_cls_s8, uint32_t, (uint32_t))
  360 +DEF_HELPER_1_1(neon_cls_s16, uint32_t, (uint32_t))
  361 +DEF_HELPER_1_1(neon_cls_s32, uint32_t, (uint32_t))
  362 +DEF_HELPER_1_1(neon_cnt_u8, uint32_t, (uint32_t))
  363 +
  364 +DEF_HELPER_1_3(neon_qdmulh_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  365 +DEF_HELPER_1_3(neon_qrdmulh_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  366 +DEF_HELPER_1_3(neon_qdmulh_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  367 +DEF_HELPER_1_3(neon_qrdmulh_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  368 +
  369 +DEF_HELPER_1_1(neon_narrow_u8, uint32_t, (uint64_t))
  370 +DEF_HELPER_1_1(neon_narrow_u16, uint32_t, (uint64_t))
  371 +DEF_HELPER_1_2(neon_narrow_sat_u8, uint32_t, (CPUState *, uint64_t))
  372 +DEF_HELPER_1_2(neon_narrow_sat_s8, uint32_t, (CPUState *, uint64_t))
  373 +DEF_HELPER_1_2(neon_narrow_sat_u16, uint32_t, (CPUState *, uint64_t))
  374 +DEF_HELPER_1_2(neon_narrow_sat_s16, uint32_t, (CPUState *, uint64_t))
  375 +DEF_HELPER_1_2(neon_narrow_sat_u32, uint32_t, (CPUState *, uint64_t))
  376 +DEF_HELPER_1_2(neon_narrow_sat_s32, uint32_t, (CPUState *, uint64_t))
  377 +DEF_HELPER_1_1(neon_narrow_high_u8, uint32_t, (uint64_t))
  378 +DEF_HELPER_1_1(neon_narrow_high_u16, uint32_t, (uint64_t))
  379 +DEF_HELPER_1_1(neon_narrow_round_high_u8, uint32_t, (uint64_t))
  380 +DEF_HELPER_1_1(neon_narrow_round_high_u16, uint32_t, (uint64_t))
  381 +DEF_HELPER_1_1(neon_widen_u8, uint64_t, (uint32_t))
  382 +DEF_HELPER_1_1(neon_widen_s8, uint64_t, (uint32_t))
  383 +DEF_HELPER_1_1(neon_widen_u16, uint64_t, (uint32_t))
  384 +DEF_HELPER_1_1(neon_widen_s16, uint64_t, (uint32_t))
  385 +
  386 +DEF_HELPER_1_2(neon_addl_u16, uint64_t, (uint64_t, uint64_t))
  387 +DEF_HELPER_1_2(neon_addl_u32, uint64_t, (uint64_t, uint64_t))
  388 +DEF_HELPER_1_2(neon_paddl_u16, uint64_t, (uint64_t, uint64_t))
  389 +DEF_HELPER_1_2(neon_paddl_u32, uint64_t, (uint64_t, uint64_t))
  390 +DEF_HELPER_1_2(neon_subl_u16, uint64_t, (uint64_t, uint64_t))
  391 +DEF_HELPER_1_2(neon_subl_u32, uint64_t, (uint64_t, uint64_t))
  392 +DEF_HELPER_1_3(neon_addl_saturate_s32, uint64_t, (CPUState *, uint64_t, uint64_t))
  393 +DEF_HELPER_1_3(neon_addl_saturate_s64, uint64_t, (CPUState *, uint64_t, uint64_t))
  394 +DEF_HELPER_1_2(neon_abdl_u16, uint64_t, (uint32_t, uint32_t))
  395 +DEF_HELPER_1_2(neon_abdl_s16, uint64_t, (uint32_t, uint32_t))
  396 +DEF_HELPER_1_2(neon_abdl_u32, uint64_t, (uint32_t, uint32_t))
  397 +DEF_HELPER_1_2(neon_abdl_s32, uint64_t, (uint32_t, uint32_t))
  398 +DEF_HELPER_1_2(neon_abdl_u64, uint64_t, (uint32_t, uint32_t))
  399 +DEF_HELPER_1_2(neon_abdl_s64, uint64_t, (uint32_t, uint32_t))
  400 +DEF_HELPER_1_2(neon_mull_u8, uint64_t, (uint32_t, uint32_t))
  401 +DEF_HELPER_1_2(neon_mull_s8, uint64_t, (uint32_t, uint32_t))
  402 +DEF_HELPER_1_2(neon_mull_u16, uint64_t, (uint32_t, uint32_t))
  403 +DEF_HELPER_1_2(neon_mull_s16, uint64_t, (uint32_t, uint32_t))
  404 +
  405 +DEF_HELPER_1_1(neon_negl_u16, uint64_t, (uint64_t))
  406 +DEF_HELPER_1_1(neon_negl_u32, uint64_t, (uint64_t))
  407 +DEF_HELPER_1_1(neon_negl_u64, uint64_t, (uint64_t))
  408 +
  409 +DEF_HELPER_1_2(neon_qabs_s8, uint32_t, (CPUState *, uint32_t))
  410 +DEF_HELPER_1_2(neon_qabs_s16, uint32_t, (CPUState *, uint32_t))
  411 +DEF_HELPER_1_2(neon_qabs_s32, uint32_t, (CPUState *, uint32_t))
  412 +DEF_HELPER_1_2(neon_qneg_s8, uint32_t, (CPUState *, uint32_t))
  413 +DEF_HELPER_1_2(neon_qneg_s16, uint32_t, (CPUState *, uint32_t))
  414 +DEF_HELPER_1_2(neon_qneg_s32, uint32_t, (CPUState *, uint32_t))
  415 +
  416 +DEF_HELPER_0_0(neon_trn_u8, void, (void))
  417 +DEF_HELPER_0_0(neon_trn_u16, void, (void))
  418 +DEF_HELPER_0_0(neon_unzip_u8, void, (void))
  419 +DEF_HELPER_0_0(neon_zip_u8, void, (void))
  420 +DEF_HELPER_0_0(neon_zip_u16, void, (void))
  421 +
  422 +DEF_HELPER_1_2(neon_min_f32, uint32_t, (uint32_t, uint32_t))
  423 +DEF_HELPER_1_2(neon_max_f32, uint32_t, (uint32_t, uint32_t))
  424 +DEF_HELPER_1_2(neon_abd_f32, uint32_t, (uint32_t, uint32_t))
  425 +DEF_HELPER_1_2(neon_add_f32, uint32_t, (uint32_t, uint32_t))
  426 +DEF_HELPER_1_2(neon_sub_f32, uint32_t, (uint32_t, uint32_t))
  427 +DEF_HELPER_1_2(neon_mul_f32, uint32_t, (uint32_t, uint32_t))
  428 +DEF_HELPER_1_2(neon_ceq_f32, uint32_t, (uint32_t, uint32_t))
  429 +DEF_HELPER_1_2(neon_cge_f32, uint32_t, (uint32_t, uint32_t))
  430 +DEF_HELPER_1_2(neon_cgt_f32, uint32_t, (uint32_t, uint32_t))
  431 +DEF_HELPER_1_2(neon_acge_f32, uint32_t, (uint32_t, uint32_t))
  432 +DEF_HELPER_1_2(neon_acgt_f32, uint32_t, (uint32_t, uint32_t))
  433 +
226 434 #undef DEF_HELPER
227 435 #undef DEF_HELPER_0_0
228 436 #undef DEF_HELPER_0_1
... ...
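These DEF_HELPER_m_n lines are a table rather than ordinary declarations: helpers.h is included more than once with DEF_HELPER_m_n redefined by the includer, so the one list can generate prototypes, TCG call glue, and so on. A rough sketch of the prototype pass (the real macro bodies live elsewhere in the tree; this expansion is an assumption for illustration):

    #define DEF_HELPER_1_2(name, ret, args) ret glue(helper_, name) args;
    /* DEF_HELPER_1_2(neon_add_u8, uint32_t, (uint32_t, uint32_t)) then yields: */
    uint32_t helper_neon_add_u8(uint32_t, uint32_t);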
target-arm/neon_helper.c 0 → 100644
  1 +#include <stdlib.h>
  2 +#include <stdio.h>
  3 +
  4 +#include "cpu.h"
  5 +#include "exec-all.h"
  6 +#include "helpers.h"
  7 +
  8 +#define SIGNBIT (uint32_t)0x80000000
  9 +#define SIGNBIT64 ((uint64_t)1 << 63)
  10 +
  11 +#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q
  12 +
  13 +static float_status neon_float_status;
  14 +#define NFS &neon_float_status
  15 +
  16 +/* Helper routines to perform bitwise copies between float and int. */
  17 +static inline float32 vfp_itos(uint32_t i)
  18 +{
  19 + union {
  20 + uint32_t i;
  21 + float32 s;
  22 + } v;
  23 +
  24 + v.i = i;
  25 + return v.s;
  26 +}
  27 +
  28 +static inline uint32_t vfp_stoi(float32 s)
  29 +{
  30 + union {
  31 + uint32_t i;
  32 + float32 s;
  33 + } v;
  34 +
  35 + v.s = s;
  36 + return v.i;
  37 +}
  38 +
  39 +#define NEON_TYPE1(name, type) \
  40 +typedef struct \
  41 +{ \
  42 + type v1; \
  43 +} neon_##name;
  44 +#ifdef WORDS_BIGENDIAN
  45 +#define NEON_TYPE2(name, type) \
  46 +typedef struct \
  47 +{ \
  48 + type v2; \
  49 + type v1; \
  50 +} neon_##name;
  51 +#define NEON_TYPE4(name, type) \
  52 +typedef struct \
  53 +{ \
  54 + type v4; \
  55 + type v3; \
  56 + type v2; \
  57 + type v1; \
  58 +} neon_##name;
  59 +#else
  60 +#define NEON_TYPE2(name, type) \
  61 +typedef struct \
  62 +{ \
  63 + type v1; \
  64 + type v2; \
  65 +} neon_##name;
  66 +#define NEON_TYPE4(name, type) \
  67 +typedef struct \
  68 +{ \
  69 + type v1; \
  70 + type v2; \
  71 + type v3; \
  72 + type v4; \
  73 +} neon_##name;
  74 +#endif
  75 +
  76 +NEON_TYPE4(s8, int8_t)
  77 +NEON_TYPE4(u8, uint8_t)
  78 +NEON_TYPE2(s16, int16_t)
  79 +NEON_TYPE2(u16, uint16_t)
  80 +NEON_TYPE1(s32, int32_t)
  81 +NEON_TYPE1(u32, uint32_t)
  82 +#undef NEON_TYPE4
  83 +#undef NEON_TYPE2
  84 +#undef NEON_TYPE1
  85 +
  86 +/* Copy from a uint32_t to a vector structure type. */
  87 +#define NEON_UNPACK(vtype, dest, val) do { \
  88 + union { \
  89 + vtype v; \
  90 + uint32_t i; \
  91 + } conv_u; \
  92 + conv_u.i = (val); \
  93 + dest = conv_u.v; \
  94 + } while(0)
  95 +
  96 +/* Copy from a vector structure type to a uint32_t. */
  97 +#define NEON_PACK(vtype, dest, val) do { \
  98 + union { \
  99 + vtype v; \
  100 + uint32_t i; \
  101 + } conv_u; \
  102 + conv_u.v = (val); \
  103 + dest = conv_u.i; \
  104 + } while(0)
  105 +
  106 +#define NEON_DO1 \
  107 + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
  108 +#define NEON_DO2 \
  109 + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
  110 + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
  111 +#define NEON_DO4 \
  112 + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
  113 + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
  114 + NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
  115 + NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
  116 +
  117 +#define NEON_VOP_BODY(vtype, n) \
  118 +{ \
  119 + uint32_t res; \
  120 + vtype vsrc1; \
  121 + vtype vsrc2; \
  122 + vtype vdest; \
  123 + NEON_UNPACK(vtype, vsrc1, arg1); \
  124 + NEON_UNPACK(vtype, vsrc2, arg2); \
  125 + NEON_DO##n; \
  126 + NEON_PACK(vtype, res, vdest); \
  127 + return res; \
  128 +}
  129 +
  130 +#define NEON_VOP(name, vtype, n) \
  131 +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
  132 +NEON_VOP_BODY(vtype, n)
  133 +
  134 +#define NEON_VOP_ENV(name, vtype, n) \
  135 +uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \
  136 +NEON_VOP_BODY(vtype, n)
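Taken together, a line such as NEON_VOP(sub_u8, neon_u8, 4) further down expands to one helper that applies the current NEON_FN to each of the four byte lanes. Roughly (assuming HELPER() pastes a helper_ prefix, as elsewhere in this tree):

    uint32_t helper_neon_sub_u8(uint32_t arg1, uint32_t arg2)
    {
        uint32_t res;
        neon_u8 vsrc1, vsrc2, vdest;
        NEON_UNPACK(neon_u8, vsrc1, arg1);
        NEON_UNPACK(neon_u8, vsrc2, arg2);
        vdest.v1 = vsrc1.v1 - vsrc2.v1;   /* NEON_FN, lane 1 */
        vdest.v2 = vsrc1.v2 - vsrc2.v2;   /* lane 2 */
        vdest.v3 = vsrc1.v3 - vsrc2.v3;   /* lane 3 */
        vdest.v4 = vsrc1.v4 - vsrc2.v4;   /* lane 4 */
        NEON_PACK(neon_u8, res, vdest);
        return res;
    }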
  137 +
  138 +/* Pairwise operations. */
  139 +/* For 32-bit elements each segment only contains a single element, so
  140 + the elementwise and pairwise operations are the same. */
  141 +#define NEON_PDO2 \
  142 + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
  143 + NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
  144 +#define NEON_PDO4 \
  145 + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
  146 + NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
  147 + NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
  148 + NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
  149 +
  150 +#define NEON_POP(name, vtype, n) \
  151 +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
  152 +{ \
  153 + uint32_t res; \
  154 + vtype vsrc1; \
  155 + vtype vsrc2; \
  156 + vtype vdest; \
  157 + NEON_UNPACK(vtype, vsrc1, arg1); \
  158 + NEON_UNPACK(vtype, vsrc2, arg2); \
  159 + NEON_PDO##n; \
  160 + NEON_PACK(vtype, res, vdest); \
  161 + return res; \
  162 +}
  163 +
  164 +/* Unary operators. */
  165 +#define NEON_VOP1(name, vtype, n) \
  166 +uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
  167 +{ \
  168 + vtype vsrc1; \
  169 + vtype vdest; \
  170 + NEON_UNPACK(vtype, vsrc1, arg); \
  171 + NEON_DO##n; \
  172 + NEON_PACK(vtype, arg, vdest); \
  173 + return arg; \
  174 +}
  175 +
  176 +
  177 +#define NEON_USAT(dest, src1, src2, type) do { \
  178 + uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
  179 + if (tmp != (type)tmp) { \
  180 + SET_QC(); \
  181 + dest = ~0; \
  182 + } else { \
  183 + dest = tmp; \
  184 + }} while(0)
  185 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
  186 +NEON_VOP_ENV(qadd_u8, neon_u8, 4)
  187 +#undef NEON_FN
  188 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
  189 +NEON_VOP_ENV(qadd_u16, neon_u16, 2)
  190 +#undef NEON_FN
  191 +#undef NEON_USAT
  192 +
  193 +#define NEON_SSAT(dest, src1, src2, type) do { \
  194 + int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
  195 + if (tmp != (type)tmp) { \
  196 + SET_QC(); \
  197 + if (src2 > 0) { \
  198 + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
  199 + } else { \
  200 + tmp = 1 << (sizeof(type) * 8 - 1); \
  201 + } \
  202 + } \
  203 + dest = tmp; \
  204 + } while(0)
  205 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
  206 +NEON_VOP_ENV(qadd_s8, neon_s8, 4)
  207 +#undef NEON_FN
  208 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
  209 +NEON_VOP_ENV(qadd_s16, neon_s16, 2)
  210 +#undef NEON_FN
  211 +#undef NEON_SSAT
  212 +
  213 +#define NEON_USAT(dest, src1, src2, type) do { \
  214 + uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
  215 + if (tmp != (type)tmp) { \
  216 + SET_QC(); \
  217 + dest = 0; \
  218 + } else { \
  219 + dest = tmp; \
  220 + }} while(0)
  221 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
  222 +NEON_VOP_ENV(qsub_u8, neon_u8, 4)
  223 +#undef NEON_FN
  224 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
  225 +NEON_VOP_ENV(qsub_u16, neon_u16, 2)
  226 +#undef NEON_FN
  227 +#undef NEON_USAT
  228 +
  229 +#define NEON_SSAT(dest, src1, src2, type) do { \
  230 + int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
  231 + if (tmp != (type)tmp) { \
  232 + SET_QC(); \
  233 + if (src2 < 0) { \
  234 + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
  235 + } else { \
  236 + tmp = 1 << (sizeof(type) * 8 - 1); \
  237 + } \
  238 + } \
  239 + dest = tmp; \
  240 + } while(0)
  241 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
  242 +NEON_VOP_ENV(qsub_s8, neon_s8, 4)
  243 +#undef NEON_FN
  244 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
  245 +NEON_VOP_ENV(qsub_s16, neon_s16, 2)
  246 +#undef NEON_FN
  247 +#undef NEON_SSAT
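The widened-arithmetic trick above is easiest to see on one lane: 0x7f + 0x01 is 128 as an int32_t, which no longer round-trips through int8_t, so the lane saturates and the sticky QC flag is set. A minimal call sketch, assuming the usual helper_ prefix from the HELPER() macro:

    uint32_t r = helper_neon_qadd_s8(env, 0x0000007fu, 0x00000001u);
    /* lane 0 saturates: r == 0x0000007f, FPSCR.QC set; lanes 1-3 stay 0 */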
  248 +
  249 +#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
  250 +NEON_VOP(hadd_s8, neon_s8, 4)
  251 +NEON_VOP(hadd_u8, neon_u8, 4)
  252 +NEON_VOP(hadd_s16, neon_s16, 2)
  253 +NEON_VOP(hadd_u16, neon_u16, 2)
  254 +#undef NEON_FN
  255 +
  256 +int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
  257 +{
  258 + int32_t dest;
  259 +
  260 + dest = (src1 >> 1) + (src2 >> 1);
  261 + if (src1 & src2 & 1)
  262 + dest++;
  263 + return dest;
  264 +}
  265 +
  266 +uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
  267 +{
  268 + uint32_t dest;
  269 +
  270 + dest = (src1 >> 1) + (src2 >> 1);
  271 + if (src1 & src2 & 1)
  272 + dest++;
  273 + return dest;
  274 +}
  275 +
  276 +#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
  277 +NEON_VOP(rhadd_s8, neon_s8, 4)
  278 +NEON_VOP(rhadd_u8, neon_u8, 4)
  279 +NEON_VOP(rhadd_s16, neon_s16, 2)
  280 +NEON_VOP(rhadd_u16, neon_u16, 2)
  281 +#undef NEON_FN
  282 +
  283 +int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
  284 +{
  285 + int32_t dest;
  286 +
  287 + dest = (src1 >> 1) + (src2 >> 1);
  288 + if ((src1 | src2) & 1)
  289 + dest++;
  290 + return dest;
  291 +}
  292 +
  293 +uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
  294 +{
  295 + uint32_t dest;
  296 +
  297 + dest = (src1 >> 1) + (src2 >> 1);
  298 + if ((src1 | src2) & 1)
  299 + dest++;
  300 + return dest;
  301 +}
  302 +
  303 +#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
  304 +NEON_VOP(hsub_s8, neon_s8, 4)
  305 +NEON_VOP(hsub_u8, neon_u8, 4)
  306 +NEON_VOP(hsub_s16, neon_s16, 2)
  307 +NEON_VOP(hsub_u16, neon_u16, 2)
  308 +#undef NEON_FN
  309 +
  310 +int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
  311 +{
  312 + int32_t dest;
  313 +
  314 + dest = (src1 >> 1) - (src2 >> 1);
  315 + if ((~src1) & src2 & 1)
  316 + dest--;
  317 + return dest;
  318 +}
  319 +
  320 +uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
  321 +{
  322 + uint32_t dest;
  323 +
  324 + dest = (src1 >> 1) - (src2 >> 1);
  325 + if ((~src1) & src2 & 1)
  326 + dest--;
  327 + return dest;
  328 +}
  329 +
  330 +#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
  331 +NEON_VOP(cgt_s8, neon_s8, 4)
  332 +NEON_VOP(cgt_u8, neon_u8, 4)
  333 +NEON_VOP(cgt_s16, neon_s16, 2)
  334 +NEON_VOP(cgt_u16, neon_u16, 2)
  335 +NEON_VOP(cgt_s32, neon_s32, 1)
  336 +NEON_VOP(cgt_u32, neon_u32, 1)
  337 +#undef NEON_FN
  338 +
  339 +#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
  340 +NEON_VOP(cge_s8, neon_s8, 4)
  341 +NEON_VOP(cge_u8, neon_u8, 4)
  342 +NEON_VOP(cge_s16, neon_s16, 2)
  343 +NEON_VOP(cge_u16, neon_u16, 2)
  344 +NEON_VOP(cge_s32, neon_s32, 1)
  345 +NEON_VOP(cge_u32, neon_u32, 1)
  346 +#undef NEON_FN
  347 +
  348 +#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
  349 +NEON_VOP(min_s8, neon_s8, 4)
  350 +NEON_VOP(min_u8, neon_u8, 4)
  351 +NEON_VOP(min_s16, neon_s16, 2)
  352 +NEON_VOP(min_u16, neon_u16, 2)
  353 +NEON_VOP(min_s32, neon_s32, 1)
  354 +NEON_VOP(min_u32, neon_u32, 1)
  355 +NEON_POP(pmin_s8, neon_s8, 4)
  356 +NEON_POP(pmin_u8, neon_u8, 4)
  357 +NEON_POP(pmin_s16, neon_s16, 2)
  358 +NEON_POP(pmin_u16, neon_u16, 2)
  359 +#undef NEON_FN
  360 +
  361 +#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
  362 +NEON_VOP(max_s8, neon_s8, 4)
  363 +NEON_VOP(max_u8, neon_u8, 4)
  364 +NEON_VOP(max_s16, neon_s16, 2)
  365 +NEON_VOP(max_u16, neon_u16, 2)
  366 +NEON_VOP(max_s32, neon_s32, 1)
  367 +NEON_VOP(max_u32, neon_u32, 1)
  368 +NEON_POP(pmax_s8, neon_s8, 4)
  369 +NEON_POP(pmax_u8, neon_u8, 4)
  370 +NEON_POP(pmax_s16, neon_s16, 2)
  371 +NEON_POP(pmax_u16, neon_u16, 2)
  372 +#undef NEON_FN
  373 +
  374 +#define NEON_FN(dest, src1, src2) \
  375 + dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
  376 +NEON_VOP(abd_s8, neon_s8, 4)
  377 +NEON_VOP(abd_u8, neon_u8, 4)
  378 +NEON_VOP(abd_s16, neon_s16, 2)
  379 +NEON_VOP(abd_u16, neon_u16, 2)
  380 +NEON_VOP(abd_s32, neon_s32, 1)
  381 +NEON_VOP(abd_u32, neon_u32, 1)
  382 +#undef NEON_FN
  383 +
  384 +#define NEON_FN(dest, src1, src2) do { \
  385 + int8_t tmp; \
  386 + tmp = (int8_t)src2; \
  387 + if (tmp >= (ssize_t)sizeof(src1) * 8 || tmp <= -(ssize_t)sizeof(src1) * 8) { \
  388 + dest = 0; \
  389 + } else if (tmp < 0) { \
  390 + dest = src1 >> -tmp; \
  391 + } else { \
  392 + dest = src1 << tmp; \
  393 + }} while (0)
  394 +NEON_VOP(shl_u8, neon_u8, 4)
  395 +NEON_VOP(shl_u16, neon_u16, 2)
  396 +NEON_VOP(shl_u32, neon_u32, 1)
  397 +#undef NEON_FN
  398 +
  399 +uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
  400 +{
  401 + int8_t shift = (int8_t)shiftop;
  402 + if (shift >= 64 || shift <= -64) {
  403 + val = 0;
  404 + } else if (shift < 0) {
  405 + val >>= -shift;
  406 + } else {
  407 + val <<= shift;
  408 + }
  409 + return val;
  410 +}
  411 +
  412 +#define NEON_FN(dest, src1, src2) do { \
  413 + int8_t tmp; \
  414 + tmp = (int8_t)src2; \
  415 + if (tmp >= (ssize_t)sizeof(src1) * 8) { \
  416 + dest = 0; \
  417 + } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
  418 + dest = src1 >> (sizeof(src1) * 8 - 1); \
  419 + } else if (tmp < 0) { \
  420 + dest = src1 >> -tmp; \
  421 + } else { \
  422 + dest = src1 << tmp; \
  423 + }} while (0)
  424 +NEON_VOP(shl_s8, neon_s8, 4)
  425 +NEON_VOP(shl_s16, neon_s16, 2)
  426 +NEON_VOP(shl_s32, neon_s32, 1)
  427 +#undef NEON_FN
  428 +
  429 +uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
  430 +{
  431 + int8_t shift = (int8_t)shiftop;
  432 + int64_t val = valop;
  433 + if (shift >= 64) {
  434 + val = 0;
  435 + } else if (shift <= -64) {
  436 + val >>= 63;
  437 + } else if (shift < 0) {
  438 + val >>= -shift;
  439 + } else {
  440 + val <<= shift;
  441 + }
  442 + return val;
  443 +}
  444 +
  445 +#define NEON_FN(dest, src1, src2) do { \
  446 + int8_t tmp; \
  447 + tmp = (int8_t)src2; \
  448 + if (tmp >= (ssize_t)sizeof(src1) * 8) { \
  449 + dest = 0; \
  450 + } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
  451 + dest = src1 >> (sizeof(src1) * 8 - 1); \
  452 + } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
  453 + dest = src1 >> (-tmp - 1); \
  454 + dest++; \
  455 + dest >>= 1; \
  456 + } else if (tmp < 0) { \
  457 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  458 + } else { \
  459 + dest = src1 << tmp; \
  460 + }} while (0)
  461 +NEON_VOP(rshl_s8, neon_s8, 4)
  462 +NEON_VOP(rshl_s16, neon_s16, 2)
  463 +NEON_VOP(rshl_s32, neon_s32, 1)
  464 +#undef NEON_FN
  465 +
  466 +uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
  467 +{
  468 + int8_t shift = (int8_t)shiftop;
  469 + int64_t val = valop;
  470 + if (shift >= 64) {
  471 + val = 0;
  472 + } else if (shift < -64) {
  473 + val >>= 63;
  474 + } else if (shift == -64) {
  475 + val >>= 63;
  476 + val++;
  477 + val >>= 1;
  478 + } else if (shift < 0) {
  479 + val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
  480 + } else {
  481 + val <<= shift;
  482 + }
  483 + return val;
  484 +}
  485 +
  486 +#define NEON_FN(dest, src1, src2) do { \
  487 + int8_t tmp; \
  488 + tmp = (int8_t)src2; \
  489 + if (tmp >= (ssize_t)sizeof(src1) * 8 || tmp < -(ssize_t)sizeof(src1) * 8) { \
  490 + dest = 0; \
  491 + } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
  492 + dest = src1 >> (-tmp - 1); \
  493 + } else if (tmp < 0) { \
  494 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  495 + } else { \
  496 + dest = src1 << tmp; \
  497 + }} while (0)
  498 +NEON_VOP(rshl_u8, neon_u8, 4)
  499 +NEON_VOP(rshl_u16, neon_u16, 2)
  500 +NEON_VOP(rshl_u32, neon_u32, 1)
  501 +#undef NEON_FN
  502 +
  503 +uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
  504 +{
  505 + int8_t shift = (int8_t)shiftop;
  506 + if (shift >= 64 || shift < -64) {
  507 + val = 0;
  508 + } else if (shift == -64) {
  509 + /* Rounding a 1-bit result just preserves that bit. */
  510 + val >>= 63;
  511 + } else if (shift < 0) {
  512 + val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
  514 + } else {
  515 + val <<= shift;
  516 + }
  517 + return val;
  518 +}
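In all the rounding-shift helpers a negative shift count means "shift right, rounding to nearest": half of the discarded value is added before shifting. One concrete case, assuming the usual helper_ prefix:

    /* shift right by 1 with rounding: (7 + 1) >> 1 == 4,
       where the non-rounding shl helper would give 7 >> 1 == 3 */
    uint64_t r = helper_neon_rshl_u64(7, (uint64_t)-1);  /* r == 4 */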
  519 +
  520 +#define NEON_FN(dest, src1, src2) do { \
  521 + int8_t tmp; \
  522 + tmp = (int8_t)src2; \
  523 + if (tmp >= (ssize_t)sizeof(src1) * 8) { \
  524 + if (src1) { \
  525 + SET_QC(); \
  526 + dest = ~0; \
  527 + } else { \
  528 + dest = 0; \
  529 + } \
  530 + } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
  531 + dest = 0; \
  532 + } else if (tmp < 0) { \
  533 + dest = src1 >> -tmp; \
  534 + } else { \
  535 + dest = src1 << tmp; \
  536 + if ((dest >> tmp) != src1) { \
  537 + SET_QC(); \
  538 + dest = ~0; \
  539 + } \
  540 + }} while (0)
  541 +NEON_VOP_ENV(qshl_u8, neon_u8, 4)
  542 +NEON_VOP_ENV(qshl_u16, neon_u16, 2)
  543 +NEON_VOP_ENV(qshl_u32, neon_u32, 1)
  544 +#undef NEON_FN
  545 +
  546 +uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
  547 +{
  548 + int8_t shift = (int8_t)shiftop;
  549 + if (shift >= 64) {
  550 + if (val) {
  551 + val = ~(uint64_t)0;
  552 + SET_QC();
  553 + } else {
  554 + val = 0;
  555 + }
  556 + } else if (shift <= -64) {
  557 + val = 0;
  558 + } else if (shift < 0) {
  559 + val >>= -shift;
  560 + } else {
  561 + uint64_t tmp = val;
  562 + val <<= shift;
  563 + if ((val >> shift) != tmp) {
  564 + SET_QC();
  565 + val = ~(uint64_t)0;
  566 + }
  567 + }
  568 + return val;
  569 +}
  570 +
  571 +#define NEON_FN(dest, src1, src2) do { \
  572 + int8_t tmp; \
  573 + tmp = (int8_t)src2; \
  574 + if (tmp >= (ssize_t)sizeof(src1) * 8) { \
  575 + if (src1) \
  576 + SET_QC(); \
  577 + dest = src1 >> 31; \
  578 + } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
  579 + dest = src1 >> 31; \
  580 + } else if (tmp < 0) { \
  581 + dest = src1 >> -tmp; \
  582 + } else { \
  583 + dest = src1 << tmp; \
  584 + if ((dest >> tmp) != src1) { \
  585 + SET_QC(); \
  586 + dest = src1 >> 31; \
  587 + } \
  588 + }} while (0)
  589 +NEON_VOP_ENV(qshl_s8, neon_s8, 4)
  590 +NEON_VOP_ENV(qshl_s16, neon_s16, 2)
  591 +NEON_VOP_ENV(qshl_s32, neon_s32, 1)
  592 +#undef NEON_FN
  593 +
  594 +uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
  595 +{
  596 + int8_t shift = (int8_t)shiftop;
  597 + int64_t val = valop;
  598 + if (shift >= 64) {
  599 + if (val) {
  600 + SET_QC();
  601 + val = (val >> 63) ^ ~SIGNBIT64;
  602 + }
  603 + } else if (shift <= -64) {
  604 + val >>= 63;
  605 + } else if (shift < 0) {
  606 + val >>= -shift;
  607 + } else {
  608 + int64_t tmp = val;
  609 + val <<= shift;
  610 + if ((val >> shift) != tmp) {
  611 + SET_QC();
  612 + val = (tmp >> 63) ^ ~SIGNBIT64;
  613 + }
  614 + }
  615 + return val;
  616 +}
  617 +
  618 +
  619 +/* FIXME: This is wrong. */
  620 +#define NEON_FN(dest, src1, src2) do { \
  621 + int8_t tmp; \
  622 + tmp = (int8_t)src2; \
  623 + if (tmp < 0) { \
  624 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  625 + } else { \
  626 + dest = src1 << tmp; \
  627 + if ((dest >> tmp) != src1) { \
  628 + SET_QC(); \
  629 + dest = ~0; \
  630 + } \
  631 + }} while (0)
  632 +NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
  633 +NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
  634 +NEON_VOP_ENV(qrshl_u32, neon_u32, 1)
  635 +#undef NEON_FN
  636 +
  637 +uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
  638 +{
  639 + int8_t shift = (int8_t)shiftop;
  640 + if (shift < 0) {
  641 + val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
  642 + } else {
  643 + uint64_t tmp = val;
  644 + val <<= shift;
  645 + if ((val >> shift) != tmp) {
  646 + SET_QC();
  647 + val = ~0;
  648 + }
  649 + }
  650 + return val;
  651 +}
  652 +
  653 +#define NEON_FN(dest, src1, src2) do { \
  654 + int8_t tmp; \
  655 + tmp = (int8_t)src2; \
  656 + if (tmp < 0) { \
  657 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  658 + } else { \
  659 + dest = src1 << tmp; \
  660 + if ((dest >> tmp) != src1) { \
  661 + SET_QC(); \
  662 + dest = src1 >> 31; \
  663 + } \
  664 + }} while (0)
  665 +NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
  666 +NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
  667 +NEON_VOP_ENV(qrshl_s32, neon_s32, 1)
  668 +#undef NEON_FN
  669 +
  670 +uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
  671 +{
  672 + int8_t shift = (int8_t)shiftop;
  673 + int64_t val = valop;
  674 +
  675 + if (shift < 0) {
  676 + val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
  677 + } else {
  678 + int64_t tmp = val;
  679 + val <<= shift;
  680 + if ((val >> shift) != tmp) {
  681 + SET_QC();
  682 + val = (tmp >> 63) ^ ~SIGNBIT64;
  683 + }
  684 + }
  685 + return val;
  686 +}
  687 +
  688 +uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
  689 +{
  690 + uint32_t mask;
  691 + mask = (a ^ b) & 0x80808080u;
  692 + a &= ~0x80808080u;
  693 + b &= ~0x80808080u;
  694 + return (a + b) ^ mask;
  695 +}
  696 +
  697 +uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
  698 +{
  699 + uint32_t mask;
  700 + mask = (a ^ b) & 0x80008000u;
  701 + a &= ~0x80008000u;
  702 + b &= ~0x80008000u;
  703 + return (a + b) ^ mask;
  704 +}
  705 +
  706 +#define NEON_FN(dest, src1, src2) dest = src1 + src2
  707 +NEON_POP(padd_u8, neon_u8, 4)
  708 +NEON_POP(padd_u16, neon_u16, 2)
  709 +#undef NEON_FN
  710 +
  711 +#define NEON_FN(dest, src1, src2) dest = src1 - src2
  712 +NEON_VOP(sub_u8, neon_u8, 4)
  713 +NEON_VOP(sub_u16, neon_u16, 2)
  714 +#undef NEON_FN
  715 +
  716 +#define NEON_FN(dest, src1, src2) dest = src1 * src2
  717 +NEON_VOP(mul_u8, neon_u8, 4)
  718 +NEON_VOP(mul_u16, neon_u16, 2)
  719 +#undef NEON_FN
  720 +
  721 +/* Polynomial multiplication is like integer multiplication except the
  722 + partial products are XORed, not added. */
  723 +uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
  724 +{
  725 + uint32_t mask;
  726 + uint32_t result;
  727 + result = 0;
  728 + while (op1) {
  729 + mask = 0;
  730 + if (op1 & 1)
  731 + mask |= 0xff;
  732 + if (op1 & (1 << 8))
  733 + mask |= (0xff << 8);
  734 + if (op1 & (1 << 16))
  735 + mask |= (0xff << 16);
  736 + if (op1 & (1 << 24))
  737 + mask |= (0xff << 24);
  738 + result ^= op2 & mask;
  739 + op1 = (op1 >> 1) & 0x7f7f7f7f;
  740 + op2 = (op2 << 1) & 0xfefefefe;
  741 + }
  742 + return result;
  743 +}
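One worked lane of the polynomial multiply: 0x03 is x + 1 and 0x05 is x^2 + 1 over GF(2), and (x + 1)(x^2 + 1) = x^3 + x^2 + x + 1, i.e. 0x0f, with no carries between bit positions. Tracing the loop: pass 1 XORs 0x05 into the result, pass 2 XORs 0x0a, giving 0x0f. With the assumed helper_ prefix:

    uint32_t r = helper_neon_mul_p8(0x03, 0x05);  /* r == 0x0f in lane 0 */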
  744 +
  745 +#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
  746 +NEON_VOP(tst_u8, neon_u8, 4)
  747 +NEON_VOP(tst_u16, neon_u16, 2)
  748 +NEON_VOP(tst_u32, neon_u32, 1)
  749 +#undef NEON_FN
  750 +
  751 +#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
  752 +NEON_VOP(ceq_u8, neon_u8, 4)
  753 +NEON_VOP(ceq_u16, neon_u16, 2)
  754 +NEON_VOP(ceq_u32, neon_u32, 1)
  755 +#undef NEON_FN
  756 +
  757 +#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
  758 +NEON_VOP1(abs_s8, neon_s8, 4)
  759 +NEON_VOP1(abs_s16, neon_s16, 2)
  760 +#undef NEON_FN
  761 +
  762 +/* Count Leading Sign/Zero Bits. */
  763 +static inline int do_clz8(uint8_t x)
  764 +{
  765 + int n;
  766 + for (n = 8; x; n--)
  767 + x >>= 1;
  768 + return n;
  769 +}
  770 +
  771 +static inline int do_clz16(uint16_t x)
  772 +{
  773 + int n;
  774 + for (n = 16; x; n--)
  775 + x >>= 1;
  776 + return n;
  777 +}
  778 +
  779 +#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
  780 +NEON_VOP1(clz_u8, neon_u8, 4)
  781 +#undef NEON_FN
  782 +
  783 +#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
  784 +NEON_VOP1(clz_u16, neon_u16, 2)
  785 +#undef NEON_FN
  786 +
  787 +#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
  788 +NEON_VOP1(cls_s8, neon_s8, 4)
  789 +#undef NEON_FN
  790 +
  791 +#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
  792 +NEON_VOP1(cls_s16, neon_s16, 2)
  793 +#undef NEON_FN
  794 +
  795 +uint32_t HELPER(neon_cls_s32)(uint32_t x)
  796 +{
  797 + int count;
  798 + if ((int32_t)x < 0)
  799 + x = ~x;
  800 + for (count = 32; x; count--)
  801 + x = x >> 1;
  802 + return count - 1;
  803 +}
  804 +
  805 +/* Bit count. */
  806 +uint32_t HELPER(neon_cnt_u8)(uint32_t x)
  807 +{
  808 + x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
  809 + x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
  810 + x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
  811 + return x;
  812 +}
  813 +
  814 +#define NEON_QDMULH16(dest, src1, src2, round) do { \
  815 + uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
  816 + if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
  817 + SET_QC(); \
  818 + tmp = (tmp >> 31) ^ ~SIGNBIT; \
  819 + } \
  820 + tmp <<= 1; \
  821 + if (round) { \
  822 + int32_t old = tmp; \
  823 + tmp += 1 << 15; \
  824 + if ((int32_t)tmp < old) { \
  825 + SET_QC(); \
  826 + tmp = SIGNBIT - 1; \
  827 + } \
  828 + } \
  829 + dest = tmp >> 16; \
  830 + } while(0)
  831 +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
  832 +NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
  833 +#undef NEON_FN
  834 +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
  835 +NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
  836 +#undef NEON_FN
  837 +#undef NEON_QDMULH16
  838 +
  839 +#define NEON_QDMULH32(dest, src1, src2, round) do { \
  840 + uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
  841 + if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
  842 + SET_QC(); \
  843 + tmp = (tmp >> 63) ^ ~SIGNBIT64; \
  844 + } else { \
  845 + tmp <<= 1; \
  846 + } \
  847 + if (round) { \
  848 + int64_t old = tmp; \
  849 + tmp += (int64_t)1 << 31; \
  850 + if ((int64_t)tmp < old) { \
  851 + SET_QC(); \
  852 + tmp = SIGNBIT64 - 1; \
  853 + } \
  854 + } \
  855 + dest = tmp >> 32; \
  856 + } while(0)
  857 +#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
  858 +NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
  859 +#undef NEON_FN
  860 +#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
  861 +NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
  862 +#undef NEON_FN
  863 +#undef NEON_QDMULH32
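These implement the Q15/Q31 "doubling multiply returning high half": the product is doubled to drop the redundant sign bit, optionally rounded, and the top half is kept; only (-1) x (-1) can overflow. Two lane-0 examples, with the assumed helper_ prefix:

    /* Q15 0.5 * 0.5 = 0.25: (0x4000 * 0x4000 * 2) >> 16 == 0x2000 */
    uint32_t a = helper_neon_qdmulh_s16(env, 0x4000, 0x4000);  /* a == 0x2000 */
    /* the one saturating case: (-1) * (-1) pegs at 0x7fff and sets QC */
    uint32_t b = helper_neon_qdmulh_s16(env, 0x8000, 0x8000);  /* b == 0x7fff */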
  864 +
  865 +uint32_t HELPER(neon_narrow_u8)(uint64_t x)
  866 +{
  867 + return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
  868 + | ((x >> 24) & 0xff000000u);
  869 +}
  870 +
  871 +uint32_t HELPER(neon_narrow_u16)(uint64_t x)
  872 +{
  873 + return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
  874 +}
  875 +
  876 +uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
  877 +{
  878 + return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
  879 + | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
  880 +}
  881 +
  882 +uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
  883 +{
  884 + return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
  885 +}
  886 +
  887 +uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
  888 +{
  889 + x &= 0xff80ff80ff80ff80ull;
  890 + x += 0x0080008000800080ull;
  891 + return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
  892 + | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
  893 +}
  894 +
  895 +uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
  896 +{
  897 + x &= 0xffff8000ffff8000ull;
  898 + x += 0x0000800000008000ull;
  899 + return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
  900 +}
  901 +
  902 +uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
  903 +{
  904 + uint16_t s;
  905 + uint8_t d;
  906 + uint32_t res = 0;
  907 +#define SAT8(n) \
  908 + s = x >> n; \
  909 + if (s > 0xff) { \
  910 + d = 0xff; \
  911 + SET_QC(); \
  912 + } else { \
  913 + d = s; \
  914 + } \
  915 + res |= (uint32_t)d << (n / 2);
  916 +
  917 + SAT8(0);
  918 + SAT8(16);
  919 + SAT8(32);
  920 + SAT8(48);
  921 +#undef SAT8
  922 + return res;
  923 +}
  924 +
  925 +uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
  926 +{
  927 + int16_t s;
  928 + uint8_t d;
  929 + uint32_t res = 0;
  930 +#define SAT8(n) \
  931 + s = x >> n; \
  932 + if (s != (int8_t)s) { \
  933 + d = (s >> 15) ^ 0x7f; \
  934 + SET_QC(); \
  935 + } else { \
  936 + d = s; \
  937 + } \
  938 + res |= (uint32_t)d << (n / 2);
  939 +
  940 + SAT8(0);
  941 + SAT8(16);
  942 + SAT8(32);
  943 + SAT8(48);
  944 +#undef SAT8
  945 + return res;
  946 +}
  947 +
  948 +uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
  949 +{
  950 + uint32_t high;
  951 + uint32_t low;
  952 + low = x;
  953 + if (low > 0xffff) {
  954 + low = 0xffff;
  955 + SET_QC();
  956 + }
  957 + high = x >> 32;
  958 + if (high > 0xffff) {
  959 + high = 0xffff;
  960 + SET_QC();
  961 + }
  962 + return low | (high << 16);
  963 +}
  964 +
  965 +uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
  966 +{
  967 + int32_t low;
  968 + int32_t high;
  969 + low = x;
  970 + if (low != (int16_t)low) {
  971 + low = (low >> 31) ^ 0x7fff;
  972 + SET_QC();
  973 + }
  974 + high = x >> 32;
  975 + if (high != (int16_t)high) {
  976 + high = (high >> 31) ^ 0x7fff;
  977 + SET_QC();
  978 + }
  979 + return (uint16_t)low | (high << 16);
  980 +}
  981 +
  982 +uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
  983 +{
  984 + if (x > 0xffffffffu) {
  985 + SET_QC();
  986 + return 0xffffffffu;
  987 + }
  988 + return x;
  989 +}
  990 +
  991 +uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x)
  992 +{
  993 + if ((int64_t)x != (int32_t)x) {
  994 + SET_QC();
  995 + return (x >> 63) ^ 0x7fffffff;
  996 + }
  997 + return x;
  998 +}
  999 +
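The saturating narrows clamp each wide lane independently before packing. For example, narrowing 0x0000900000001234 to two int16_t lanes: 0x9000 (36864) does not fit and pegs at 0x7fff, while 0x1234 passes through unchanged (assumed helper_ prefix):

    uint32_t r = helper_neon_narrow_sat_s16(env, 0x0000900000001234ull);
    /* r == 0x7fff1234, FPSCR.QC set */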
  1000 +uint64_t HELPER(neon_widen_u8)(uint32_t x)
  1001 +{
  1002 + uint64_t tmp;
  1003 + uint64_t ret;
  1004 + ret = (uint8_t)x;
  1005 + tmp = (uint8_t)(x >> 8);
  1006 + ret |= tmp << 16;
  1007 + tmp = (uint8_t)(x >> 16);
  1008 + ret |= tmp << 32;
  1009 + tmp = (uint8_t)(x >> 24);
  1010 + ret |= tmp << 48;
  1011 + return ret;
  1012 +}
  1013 +
  1014 +uint64_t HELPER(neon_widen_s8)(uint32_t x)
  1015 +{
  1016 + uint64_t tmp;
  1017 + uint64_t ret;
  1018 + ret = (uint16_t)(int8_t)x;
  1019 + tmp = (uint16_t)(int8_t)(x >> 8);
  1020 + ret |= tmp << 16;
  1021 + tmp = (uint16_t)(int8_t)(x >> 16);
  1022 + ret |= tmp << 32;
  1023 + tmp = (uint16_t)(int8_t)(x >> 24);
  1024 + ret |= tmp << 48;
  1025 + return ret;
  1026 +}
  1027 +
  1028 +uint64_t HELPER(neon_widen_u16)(uint32_t x)
  1029 +{
  1030 + uint64_t high = (uint16_t)(x >> 16);
  1031 + return ((uint16_t)x) | (high << 32);
  1032 +}
  1033 +
  1034 +uint64_t HELPER(neon_widen_s16)(uint32_t x)
  1035 +{
  1036 + uint64_t high = (int16_t)(x >> 16);
  1037 + return ((uint32_t)(int16_t)x) | (high << 32);
  1038 +}
  1039 +
  1040 +uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
  1041 +{
  1042 + uint64_t mask;
  1043 + mask = (a ^ b) & 0x8000800080008000ull;
  1044 + a &= ~0x8000800080008000ull;
  1045 + b &= ~0x8000800080008000ull;
  1046 + return (a + b) ^ mask;
  1047 +}
  1048 +
  1049 +uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
  1050 +{
  1051 + uint64_t mask;
  1052 + mask = (a ^ b) & 0x8000000080000000ull;
  1053 + a &= ~0x8000000080000000ull;
  1054 + b &= ~0x8000000080000000ull;
  1055 + return (a + b) ^ mask;
  1056 +}
  1057 +
  1058 +uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
  1059 +{
  1060 + uint64_t tmp;
  1061 + uint64_t tmp2;
  1062 +
  1063 + tmp = a & 0x0000ffff0000ffffull;
  1064 + tmp += (a >> 16) & 0x0000ffff0000ffffull;
  1065 + tmp2 = b & 0xffff0000ffff0000ull;
  1066 + tmp2 += (b << 16) & 0xffff0000ffff0000ull;
  1067 + return ( tmp & 0xffff)
  1068 + | ((tmp >> 16) & 0xffff0000ull)
  1069 + | ((tmp2 << 16) & 0xffff00000000ull)
  1070 + | ( tmp2 & 0xffff000000000000ull);
  1071 +}
  1072 +
  1073 +uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
  1074 +{
  1075 + uint32_t low = a + (a >> 32);
  1076 + uint32_t high = b + (b >> 32);
  1077 + return low + ((uint64_t)high << 32);
  1078 +}
  1079 +
  1080 +uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
  1081 +{
  1082 + uint64_t mask;
  1083 + mask = (a ^ ~b) & 0x8000800080008000ull;
  1084 + a |= 0x8000800080008000ull;
  1085 + b &= ~0x8000800080008000ull;
  1086 + return (a - b) ^ mask;
  1087 +}
  1088 +
  1089 +uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
  1090 +{
  1091 + uint64_t mask;
  1092 + mask = (a ^ ~b) & 0x8000000080000000ull;
  1093 + a |= 0x8000000080000000ull;
  1094 + b &= ~0x8000000080000000ull;
  1095 + return (a - b) ^ mask;
  1096 +}
  1097 +
  1098 +uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b)
  1099 +{
  1100 + uint32_t x, y;
  1101 + uint32_t low, high;
  1102 +
  1103 + x = a;
  1104 + y = b;
  1105 + low = x + y;
  1106 + if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
  1107 + SET_QC();
  1108 + low = ((int32_t)x >> 31) ^ ~SIGNBIT;
  1109 + }
  1110 + x = a >> 32;
  1111 + y = b >> 32;
  1112 + high = x + y;
  1113 + if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
  1114 + SET_QC();
  1115 + high = ((int32_t)x >> 31) ^ ~SIGNBIT;
  1116 + }
  1117 + return low | ((uint64_t)high << 32);
  1118 +}
  1119 +
  1120 +uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b)
  1121 +{
  1122 + uint64_t result;
  1123 +
  1124 + result = a + b;
  1125 + if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
  1126 + SET_QC();
  1127 + result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
  1128 + }
  1129 + return result;
  1130 +}
  1131 +
  1132 +#define DO_ABD(dest, x, y, type) do { \
  1133 + type tmp_x = x; \
  1134 + type tmp_y = y; \
  1135 + dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
  1136 + } while(0)
  1137 +
  1138 +uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
  1139 +{
  1140 + uint64_t tmp;
  1141 + uint64_t result;
  1142 + DO_ABD(result, a, b, uint8_t);
  1143 + DO_ABD(tmp, a >> 8, b >> 8, uint8_t);
  1144 + result |= tmp << 16;
  1145 + DO_ABD(tmp, a >> 16, b >> 16, uint8_t);
  1146 + result |= tmp << 32;
  1147 + DO_ABD(tmp, a >> 24, b >> 24, uint8_t);
  1148 + result |= tmp << 48;
  1149 + return result;
  1150 +}
  1151 +
  1152 +uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
  1153 +{
  1154 + uint64_t tmp;
  1155 + uint64_t result;
  1156 + DO_ABD(result, a, b, int8_t);
  1157 + DO_ABD(tmp, a >> 8, b >> 8, int8_t);
  1158 + result |= tmp << 16;
  1159 + DO_ABD(tmp, a >> 16, b >> 16, int8_t);
  1160 + result |= tmp << 32;
  1161 + DO_ABD(tmp, a >> 24, b >> 24, int8_t);
  1162 + result |= tmp << 48;
  1163 + return result;
  1164 +}
  1165 +
  1166 +uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
  1167 +{
  1168 + uint64_t tmp;
  1169 + uint64_t result;
  1170 + DO_ABD(result, a, b, uint16_t);
  1171 + DO_ABD(tmp, a >> 16, b >> 16, uint16_t);
  1172 + return result | (tmp << 32);
  1173 +}
  1174 +
  1175 +uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
  1176 +{
  1177 + uint64_t tmp;
  1178 + uint64_t result;
  1179 + DO_ABD(result, a, b, int16_t);
  1180 + DO_ABD(tmp, a >> 16, b >> 16, int16_t);
  1181 + return result | (tmp << 32);
  1182 +}
  1183 +
  1184 +uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
  1185 +{
  1186 + uint64_t result;
  1187 + DO_ABD(result, a, b, uint32_t);
  1188 + return result;
  1189 +}
  1190 +
  1191 +uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
  1192 +{
  1193 + uint64_t result;
  1194 + DO_ABD(result, a, b, int32_t);
  1195 + return result;
  1196 +}
  1197 +#undef DO_ABD
  1198 +
  1199 +/* Widening multiply. Named type is the source type. */
  1200 +#define DO_MULL(dest, x, y, type1, type2) do { \
  1201 + type1 tmp_x = x; \
  1202 + type1 tmp_y = y; \
  1203 + dest = (type2)((type2)tmp_x * (type2)tmp_y); \
  1204 + } while(0)
  1205 +
  1206 +uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
  1207 +{
  1208 + uint64_t tmp;
  1209 + uint64_t result;
  1210 +
  1211 + DO_MULL(result, a, b, uint8_t, uint16_t);
  1212 + DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
  1213 + result |= tmp << 16;
  1214 + DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
  1215 + result |= tmp << 32;
  1216 + DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
  1217 + result |= tmp << 48;
  1218 + return result;
  1219 +}
  1220 +
  1221 +uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
  1222 +{
  1223 + uint64_t tmp;
  1224 + uint64_t result;
  1225 +
  1226 + DO_MULL(result, a, b, int8_t, uint16_t);
  1227 + DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
  1228 + result |= tmp << 16;
  1229 + DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
  1230 + result |= tmp << 32;
  1231 + DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
  1232 + result |= tmp << 48;
  1233 + return result;
  1234 +}
  1235 +
  1236 +uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
  1237 +{
  1238 + uint64_t tmp;
  1239 + uint64_t result;
  1240 +
  1241 + DO_MULL(result, a, b, uint16_t, uint32_t);
  1242 + DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
  1243 + return result | (tmp << 32);
  1244 +}
  1245 +
  1246 +uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
  1247 +{
  1248 + uint64_t tmp;
  1249 + uint64_t result;
  1250 +
  1251 + DO_MULL(result, a, b, int16_t, uint32_t);
  1252 + DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
  1253 + return result | (tmp << 32);
  1254 +}
  1255 +
  1256 +uint64_t HELPER(neon_negl_u16)(uint64_t x)
  1257 +{
  1258 + uint16_t tmp;
  1259 + uint64_t result;
  1260 + result = (uint16_t)-x;
  1261 + tmp = -(x >> 16);
  1262 + result |= (uint64_t)tmp << 16;
  1263 + tmp = -(x >> 32);
  1264 + result |= (uint64_t)tmp << 32;
  1265 + tmp = -(x >> 48);
  1266 + result |= (uint64_t)tmp << 48;
  1267 + return result;
  1268 +}
  1269 +
  1271 +uint64_t HELPER(neon_negl_u32)(uint64_t x)
  1272 +{
  1273 + uint32_t low = -x;
  1274 + uint32_t high = -(x >> 32);
  1275 + return low | ((uint64_t)high << 32);
  1276 +}
  1277 +
  1278 +/* FIXME: There should be a native op for this. */
  1279 +uint64_t HELPER(neon_negl_u64)(uint64_t x)
  1280 +{
  1281 + return -x;
  1282 +}
  1283 +
  1284 +/* Saturating sign manipulation. */
  1285 +/* ??? Make these use NEON_VOP1 */
  1286 +#define DO_QABS8(x) do { \
  1287 + if (x == (int8_t)0x80) { \
  1288 + x = 0x7f; \
  1289 + SET_QC(); \
  1290 + } else if (x < 0) { \
  1291 + x = -x; \
  1292 + }} while (0)
  1293 +uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x)
  1294 +{
  1295 + neon_s8 vec;
  1296 + NEON_UNPACK(neon_s8, vec, x);
  1297 + DO_QABS8(vec.v1);
  1298 + DO_QABS8(vec.v2);
  1299 + DO_QABS8(vec.v3);
  1300 + DO_QABS8(vec.v4);
  1301 + NEON_PACK(neon_s8, x, vec);
  1302 + return x;
  1303 +}
  1304 +#undef DO_QABS8
  1305 +
  1306 +#define DO_QNEG8(x) do { \
  1307 + if (x == (int8_t)0x80) { \
  1308 + x = 0x7f; \
  1309 + SET_QC(); \
  1310 + } else { \
  1311 + x = -x; \
  1312 + }} while (0)
  1313 +uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x)
  1314 +{
  1315 + neon_s8 vec;
  1316 + NEON_UNPACK(neon_s8, vec, x);
  1317 + DO_QNEG8(vec.v1);
  1318 + DO_QNEG8(vec.v2);
  1319 + DO_QNEG8(vec.v3);
  1320 + DO_QNEG8(vec.v4);
  1321 + NEON_PACK(neon_s8, x, vec);
  1322 + return x;
  1323 +}
  1324 +#undef DO_QNEG8
  1325 +
  1326 +#define DO_QABS16(x) do { \
  1327 + if (x == (int16_t)0x8000) { \
  1328 + x = 0x7fff; \
  1329 + SET_QC(); \
  1330 + } else if (x < 0) { \
  1331 + x = -x; \
  1332 + }} while (0)
  1333 +uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x)
  1334 +{
  1335 + neon_s16 vec;
  1336 + NEON_UNPACK(neon_s16, vec, x);
  1337 + DO_QABS16(vec.v1);
  1338 + DO_QABS16(vec.v2);
  1339 + NEON_PACK(neon_s16, x, vec);
  1340 + return x;
  1341 +}
  1342 +#undef DO_QABS16
  1343 +
  1344 +#define DO_QNEG16(x) do { \
  1345 + if (x == (int16_t)0x8000) { \
  1346 + x = 0x7fff; \
  1347 + SET_QC(); \
  1348 + } else { \
  1349 + x = -x; \
  1350 + }} while (0)
  1351 +uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x)
  1352 +{
  1353 + neon_s16 vec;
  1354 + NEON_UNPACK(neon_s16, vec, x);
  1355 + DO_QNEG16(vec.v1);
  1356 + DO_QNEG16(vec.v2);
  1357 + NEON_PACK(neon_s16, x, vec);
  1358 + return x;
  1359 +}
  1360 +#undef DO_QNEG16
  1361 +
  1362 +uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x)
  1363 +{
  1364 + if (x == SIGNBIT) {
  1365 + SET_QC();
  1366 + x = ~SIGNBIT;
  1367 + } else if ((int32_t)x < 0) {
  1368 + x = -x;
  1369 + }
  1370 + return x;
  1371 +}
  1372 +
  1373 +uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x)
  1374 +{
  1375 + if (x == SIGNBIT) {
  1376 + SET_QC();
  1377 + x = ~SIGNBIT;
  1378 + } else {
  1379 + x = -x;
  1380 + }
  1381 + return x;
  1382 +}
  1383 +
  1384 +/* NEON Float helpers. */
  1385 +uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
  1386 +{
  1387 + float32 f0 = vfp_itos(a);
  1388 + float32 f1 = vfp_itos(b);
  1389 + return (float32_compare_quiet(f0, f1, NFS) == -1) ? a : b;
  1390 +}
  1391 +
  1392 +uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
  1393 +{
  1394 + float32 f0 = vfp_itos(a);
  1395 + float32 f1 = vfp_itos(b);
  1396 + return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b;
  1397 +}
  1398 +
  1399 +uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
  1400 +{
  1401 + float32 f0 = vfp_itos(a);
  1402 + float32 f1 = vfp_itos(b);
  1403 + return vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
  1404 + ? float32_sub(f0, f1, NFS)
  1405 + : float32_sub(f1, f0, NFS));
  1406 +}
  1407 +
  1408 +uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
  1409 +{
  1410 + return vfp_stoi(float32_add(vfp_itos(a), vfp_itos(b), NFS));
  1411 +}
  1412 +
  1413 +uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
  1414 +{
  1415 + return vfp_stoi(float32_sub(vfp_itos(a), vfp_itos(b), NFS));
  1416 +}
  1417 +
  1418 +uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
  1419 +{
  1420 + return vfp_stoi(float32_mul(vfp_itos(a), vfp_itos(b), NFS));
  1421 +}
  1422 +
  1423 +/* Floating point comparisons produce an integer result. */
  1424 +#define NEON_VOP_FCMP(name, cmp) \
  1425 +uint32_t HELPER(neon_##name)(uint32_t a, uint32_t b) \
  1426 +{ \
  1427 + if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) cmp 0) \
  1428 + return ~0; \
  1429 + else \
  1430 + return 0; \
  1431 +}
  1432 +
  1433 +NEON_VOP_FCMP(ceq_f32, ==)
  1434 +NEON_VOP_FCMP(cge_f32, >=)
  1435 +NEON_VOP_FCMP(cgt_f32, >)
  1436 +
  1437 +uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
  1438 +{
  1439 + float32 f0 = float32_abs(vfp_itos(a));
  1440 + float32 f1 = float32_abs(vfp_itos(b));
  1441 + return (float32_compare_quiet(f0, f1, NFS) >= 0) ? ~0 : 0;
  1442 +}
  1443 +
  1444 +uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
  1445 +{
  1446 + float32 f0 = float32_abs(vfp_itos(a));
  1447 + float32 f1 = float32_abs(vfp_itos(b));
  1448 + return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0;
  1449 +}
... ...
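
The float helpers above traffic in raw IEEE-754 bit patterns: each uint32_t argument is reinterpreted as a float32 via vfp_itos/vfp_stoi, and the comparison helpers return an all-ones or all-zero mask rather than a bool. A minimal standalone sketch of that behaviour in plain C (not part of the patch; the memcpy-based punning and the f_to_bits/bits_to_f names are illustrative, and native `>` stands in for softfloat's float32_compare_quiet, so NaN handling differs):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t f_to_bits(float f)          /* analogous to vfp_stoi */
{
    uint32_t i;
    memcpy(&i, &f, sizeof(i));              /* bitwise copy, no conversion */
    return i;
}

static float bits_to_f(uint32_t i)          /* analogous to vfp_itos */
{
    float f;
    memcpy(&f, &i, sizeof(f));
    return f;
}

/* Comparisons yield a lane mask, as in NEON_VOP_FCMP above. */
static uint32_t cgt_f32(uint32_t a, uint32_t b)
{
    return (bits_to_f(a) > bits_to_f(b)) ? ~0u : 0u;
}

int main(void)
{
    uint32_t one = f_to_bits(1.0f), two = f_to_bits(2.0f);
    printf("%08x %08x\n", (unsigned)cgt_f32(two, one),   /* ffffffff */
                          (unsigned)cgt_f32(one, two));  /* 00000000 */
    return 0;
}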
target-arm/op.c
... ... @@ -32,7 +32,5 @@
32 32 #include "op_mem.h"
33 33 #endif
34 34  
35   -#include "op_neon.h"
36   -
37 35 /* iwMMXt support */
38 36 #include "op_iwmmxt.c"
... ...
target-arm/op_helper.c
... ... @@ -20,6 +20,9 @@
20 20 #include "exec.h"
21 21 #include "helpers.h"
22 22  
  23 +#define SIGNBIT (uint32_t)0x80000000
  24 +#define SIGNBIT64 ((uint64_t)1 << 63)
  25 +
23 26 void raise_exception(int tt)
24 27 {
25 28 env->exception_index = tt;
... ... @@ -116,7 +119,8 @@ void tlb_fill (target_ulong addr, int is_write, int mmu_idx, void *retaddr)
116 119 }
117 120 #endif
118 121  
119   -#define SIGNBIT (uint32_t)0x80000000
  122 +/* FIXME: Pass an explicit pointer to QF to CPUState, and move saturating
  123 + instructions into helper.c */
120 124 uint32_t HELPER(add_setq)(uint32_t a, uint32_t b)
121 125 {
122 126 uint32_t res = a + b;
... ... @@ -451,3 +455,114 @@ uint32_t HELPER(ror_cc)(uint32_t x, uint32_t i)
451 455 }
452 456 }
453 457  
  458 +uint64_t HELPER(neon_add_saturate_s64)(uint64_t src1, uint64_t src2)
  459 +{
  460 + uint64_t res;
  461 +
  462 + res = src1 + src2;
  463 + if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
  464 + env->QF = 1;
  465 + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
  466 + }
  467 + return res;
  468 +}
  469 +
  470 +uint64_t HELPER(neon_add_saturate_u64)(uint64_t src1, uint64_t src2)
  471 +{
  472 + uint64_t res;
  473 +
  474 + res = src1 + src2;
  475 + if (res < src1) {
  476 + env->QF = 1;
  477 + res = ~(uint64_t)0;
  478 + }
  479 + return res;
  480 +}
  481 +
  482 +uint64_t HELPER(neon_sub_saturate_s64)(uint64_t src1, uint64_t src2)
  483 +{
  484 + uint64_t res;
  485 +
  486 + res = src1 - src2;
  487 + if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
  488 + env->QF = 1;
  489 + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
  490 + }
  491 + return res;
  492 +}
  493 +
  494 +uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2)
  495 +{
  496 + uint64_t res;
  497 +
  498 + if (src1 < src2) {
  499 + env->QF = 1;
  500 + res = 0;
  501 + } else {
  502 + res = src1 - src2;
  503 + }
  504 + return res;
  505 +}
  506 +
  507 +/* These need to return a pair of values, so still use T0/T1. */
  508 +/* Transpose. Argument order is rather strange to avoid special-casing
  509 + the translation code.
  510 + On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm. */
  511 +void HELPER(neon_trn_u8)(void)
  512 +{
  513 + uint32_t rd;
  514 + uint32_t rm;
  515 + rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
  516 + rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
  517 + T0 = rd;
  518 + T1 = rm;
  519 + FORCE_RET();
  520 +}
  521 +
  522 +void HELPER(neon_trn_u16)(void)
  523 +{
  524 + uint32_t rd;
  525 + uint32_t rm;
  526 + rd = (T0 << 16) | (T1 & 0xffff);
  527 + rm = (T1 >> 16) | (T0 & 0xffff0000);
  528 + T0 = rd;
  529 + T1 = rm;
  530 + FORCE_RET();
  531 +}
  532 +
  533 +/* Worker routines for zip and unzip. */
  534 +void HELPER(neon_unzip_u8)(void)
  535 +{
  536 + uint32_t rd;
  537 + uint32_t rm;
  538 + rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
  539 + | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
  540 + rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
  541 + | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
  542 + T0 = rd;
  543 + T1 = rm;
  544 + FORCE_RET();
  545 +}
  546 +
  547 +void HELPER(neon_zip_u8)(void)
  548 +{
  549 + uint32_t rd;
  550 + uint32_t rm;
  551 + rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
  552 + | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
  553 + rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
  554 + | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
  555 + T0 = rd;
  556 + T1 = rm;
  557 + FORCE_RET();
  558 +}
  559 +
  560 +void HELPER(neon_zip_u16)(void)
  561 +{
  562 + uint32_t tmp;
  563 +
  564 + tmp = (T0 & 0xffff) | (T1 << 16);
  565 + T1 = (T1 & 0xffff0000) | (T0 >> 16);
  566 + T0 = tmp;
  567 + FORCE_RET();
  568 +}
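
The signed saturating helpers above detect overflow with a sign-bit identity: an addition overflowed iff the operands agree in sign but the result does not. A standalone sketch (plain C, not part of the patch; qadd_s64 is an illustrative name) with the env->QF side effect dropped:

#include <stdint.h>
#include <stdio.h>

#define SIGNBIT64 ((uint64_t)1 << 63)

static uint64_t qadd_s64(uint64_t a, uint64_t b)
{
    uint64_t res = a + b;
    /* Overflow iff a and b agree in sign but res does not. */
    if (((res ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64))
        res = ((int64_t)a >> 63) ^ ~SIGNBIT64;  /* INT64_MAX or INT64_MIN */
    return res;
}

int main(void)
{
    printf("%lld\n", (long long)qadd_s64(INT64_MAX, 1));    /* saturates: 9223372036854775807 */
    printf("%lld\n", (long long)qadd_s64((uint64_t)-5, 3)); /* no overflow: -2 */
    return 0;
}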
... ...
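Likewise, the value-level effect of neon_trn_u8 can be checked in isolation. A sketch (plain C, not part of the patch; pointer parameters replace the T0/T1 globals), using little-endian lane numbering:

#include <stdint.h>
#include <stdio.h>

static void trn_u8(uint32_t *rm, uint32_t *rd)  /* on entry T0 = rm, T1 = rd */
{
    uint32_t new_rd, new_rm;
    new_rd = ((*rm & 0x00ff00ff) << 8) | (*rd & 0x00ff00ff);
    new_rm = ((*rd & 0xff00ff00) >> 8) | (*rm & 0xff00ff00);
    *rd = new_rd;
    *rm = new_rm;
}

int main(void)
{
    uint32_t rm = 0x44332211, rd = 0x88776655;
    trn_u8(&rm, &rd);
    /* Odd bytes of rd swap with even bytes of rm, per VTRN.8:
       rd' = {rd0, rm0, rd2, rm2}, rm' = {rd1, rm1, rd3, rm3}. */
    printf("rd=%08x rm=%08x\n", (unsigned)rd, (unsigned)rm); /* rd=33771155 rm=44882266 */
    return 0;
}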
target-arm/op_neon.h deleted 100644 → 0
1   -/*
2   - * ARM NEON vector operations.
3   - *
4   - * Copyright (c) 2007 CodeSourcery.
5   - * Written by Paul Brook
6   - *
7   - * This code is licenced under the GPL.
8   - */
9   -/* Note that for NEON an "l" prefix means it is a wide operation, unlike
10   - scalar arm ops where it means a word size operation. */
11   -
12   -#define SIGNBIT (uint32_t)0x80000000
13   -/* ??? NEON ops should probably have their own float status. */
14   -#define NFS &env->vfp.fp_status
15   -#define NEON_OP(name) void OPPROTO op_neon_##name (void)
16   -
17   -/* Helper routines to perform bitwise copies between float and int. */
18   -static inline float32 vfp_itos(uint32_t i)
19   -{
20   - union {
21   - uint32_t i;
22   - float32 s;
23   - } v;
24   -
25   - v.i = i;
26   - return v.s;
27   -}
28   -
29   -static inline uint32_t vfp_stoi(float32 s)
30   -{
31   - union {
32   - uint32_t i;
33   - float32 s;
34   - } v;
35   -
36   - v.s = s;
37   - return v.i;
38   -}
39   -
40   -NEON_OP(getreg_T0)
41   -{
42   - T0 = *(uint32_t *)((char *) env + PARAM1);
43   -}
44   -
45   -NEON_OP(getreg_T1)
46   -{
47   - T1 = *(uint32_t *)((char *) env + PARAM1);
48   -}
49   -
50   -NEON_OP(setreg_T0)
51   -{
52   - *(uint32_t *)((char *) env + PARAM1) = T0;
53   -}
54   -
55   -NEON_OP(setreg_T1)
56   -{
57   - *(uint32_t *)((char *) env + PARAM1) = T1;
58   -}
59   -
60   -#define NEON_TYPE1(name, type) \
61   -typedef struct \
62   -{ \
63   - type v1; \
64   -} neon_##name;
65   -#ifdef WORDS_BIGENDIAN
66   -#define NEON_TYPE2(name, type) \
67   -typedef struct \
68   -{ \
69   - type v2; \
70   - type v1; \
71   -} neon_##name;
72   -#define NEON_TYPE4(name, type) \
73   -typedef struct \
74   -{ \
75   - type v4; \
76   - type v3; \
77   - type v2; \
78   - type v1; \
79   -} neon_##name;
80   -#else
81   -#define NEON_TYPE2(name, type) \
82   -typedef struct \
83   -{ \
84   - type v1; \
85   - type v2; \
86   -} neon_##name;
87   -#define NEON_TYPE4(name, type) \
88   -typedef struct \
89   -{ \
90   - type v1; \
91   - type v2; \
92   - type v3; \
93   - type v4; \
94   -} neon_##name;
95   -#endif
96   -
97   -NEON_TYPE4(s8, int8_t)
98   -NEON_TYPE4(u8, uint8_t)
99   -NEON_TYPE2(s16, int16_t)
100   -NEON_TYPE2(u16, uint16_t)
101   -NEON_TYPE1(s32, int32_t)
102   -NEON_TYPE1(u32, uint32_t)
103   -#undef NEON_TYPE4
104   -#undef NEON_TYPE2
105   -#undef NEON_TYPE1
106   -
107   -/* Copy from a uint32_t to a vector structure type. */
108   -#define NEON_UNPACK(vtype, dest, val) do { \
109   - union { \
110   - vtype v; \
111   - uint32_t i; \
112   - } conv_u; \
113   - conv_u.i = (val); \
114   - dest = conv_u.v; \
115   - } while(0)
116   -
117   -/* Copy from a vector structure type to a uint32_t. */
118   -#define NEON_PACK(vtype, dest, val) do { \
119   - union { \
120   - vtype v; \
121   - uint32_t i; \
122   - } conv_u; \
123   - conv_u.v = (val); \
124   - dest = conv_u.i; \
125   - } while(0)
126   -
127   -#define NEON_DO1 \
128   - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
129   -#define NEON_DO2 \
130   - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
131   - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
132   -#define NEON_DO4 \
133   - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
134   - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
135   - NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
136   - NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
137   -
138   -#define NEON_VOP(name, vtype, n) \
139   -NEON_OP(name) \
140   -{ \
141   - vtype vsrc1; \
142   - vtype vsrc2; \
143   - vtype vdest; \
144   - NEON_UNPACK(vtype, vsrc1, T0); \
145   - NEON_UNPACK(vtype, vsrc2, T1); \
146   - NEON_DO##n; \
147   - NEON_PACK(vtype, T0, vdest); \
148   - FORCE_RET(); \
149   -}
150   -
151   -#define NEON_VOP1(name, vtype, n) \
152   -NEON_OP(name) \
153   -{ \
154   - vtype vsrc1; \
155   - vtype vdest; \
156   - NEON_UNPACK(vtype, vsrc1, T0); \
157   - NEON_DO##n; \
158   - NEON_PACK(vtype, T0, vdest); \
159   - FORCE_RET(); \
160   -}
161   -
162   -/* Pairwise operations. */
163   -/* For 32-bit elements each segment only contains a single element, so
164   - the elementwise and pairwise operations are the same. */
165   -#define NEON_PDO2 \
166   - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
167   - NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
168   -#define NEON_PDO4 \
169   - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
170   - NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
171   - NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
172   - NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
173   -
174   -#define NEON_POP(name, vtype, n) \
175   -NEON_OP(name) \
176   -{ \
177   - vtype vsrc1; \
178   - vtype vsrc2; \
179   - vtype vdest; \
180   - NEON_UNPACK(vtype, vsrc1, T0); \
181   - NEON_UNPACK(vtype, vsrc2, T1); \
182   - NEON_PDO##n; \
183   - NEON_PACK(vtype, T0, vdest); \
184   - FORCE_RET(); \
185   -}
186   -
187   -#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
188   -NEON_VOP(hadd_s8, neon_s8, 4)
189   -NEON_VOP(hadd_u8, neon_u8, 4)
190   -NEON_VOP(hadd_s16, neon_s16, 2)
191   -NEON_VOP(hadd_u16, neon_u16, 2)
192   -#undef NEON_FN
193   -
194   -NEON_OP(hadd_s32)
195   -{
196   - int32_t src1 = T0;
197   - int32_t src2 = T1;
198   - int32_t dest;
199   -
200   - dest = (src1 >> 1) + (src2 >> 1);
201   - if (src1 & src2 & 1)
202   - dest++;
203   - T0 = dest;
204   - FORCE_RET();
205   -}
206   -
207   -NEON_OP(hadd_u32)
208   -{
209   - uint32_t src1 = T0;
210   - uint32_t src2 = T1;
211   - uint32_t dest;
212   -
213   - dest = (src1 >> 1) + (src2 >> 1);
214   - if (src1 & src2 & 1)
215   - dest++;
216   - T0 = dest;
217   - FORCE_RET();
218   -}
219   -
220   -#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
221   -NEON_VOP(rhadd_s8, neon_s8, 4)
222   -NEON_VOP(rhadd_u8, neon_u8, 4)
223   -NEON_VOP(rhadd_s16, neon_s16, 2)
224   -NEON_VOP(rhadd_u16, neon_u16, 2)
225   -#undef NEON_FN
226   -
227   -NEON_OP(rhadd_s32)
228   -{
229   - int32_t src1 = T0;
230   - int32_t src2 = T1;
231   - int32_t dest;
232   -
233   - dest = (src1 >> 1) + (src2 >> 1);
234   - if ((src1 | src2) & 1)
235   - dest++;
236   - T0 = dest;
237   - FORCE_RET();
238   -}
239   -
240   -NEON_OP(rhadd_u32)
241   -{
242   - uint32_t src1 = T0;
243   - uint32_t src2 = T1;
244   - uint32_t dest;
245   -
246   - dest = (src1 >> 1) + (src2 >> 1);
247   - if ((src1 | src2) & 1)
248   - dest++;
249   - T0 = dest;
250   - FORCE_RET();
251   -}
252   -
253   -#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
254   -NEON_VOP(hsub_s8, neon_s8, 4)
255   -NEON_VOP(hsub_u8, neon_u8, 4)
256   -NEON_VOP(hsub_s16, neon_s16, 2)
257   -NEON_VOP(hsub_u16, neon_u16, 2)
258   -#undef NEON_FN
259   -
260   -NEON_OP(hsub_s32)
261   -{
262   - int32_t src1 = T0;
263   - int32_t src2 = T1;
264   - int32_t dest;
265   -
266   - dest = (src1 >> 1) - (src2 >> 1);
267   - if ((~src1) & src2 & 1)
268   - dest--;
269   - T0 = dest;
270   - FORCE_RET();
271   -}
272   -
273   -NEON_OP(hsub_u32)
274   -{
275   - uint32_t src1 = T0;
276   - uint32_t src2 = T1;
277   - uint32_t dest;
278   -
279   - dest = (src1 >> 1) - (src2 >> 1);
280   - if ((~src1) & src2 & 1)
281   - dest--;
282   - T0 = dest;
283   - FORCE_RET();
284   -}
285   -
286   -#define NEON_USAT(dest, src1, src2, type) do { \
287   - uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
288   - if (tmp != (type)tmp) { \
289   - env->QF = 1; \
290   - dest = ~0; \
291   - } else { \
292   - dest = tmp; \
293   - }} while(0)
294   -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
295   -NEON_VOP(qadd_u8, neon_u8, 4)
296   -#undef NEON_FN
297   -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
298   -NEON_VOP(qadd_u16, neon_u16, 2)
299   -#undef NEON_FN
300   -#undef NEON_USAT
301   -
302   -#define NEON_SSAT(dest, src1, src2, type) do { \
303   - int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
304   - if (tmp != (type)tmp) { \
305   - env->QF = 1; \
306   - if (src2 > 0) { \
307   - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
308   - } else { \
309   - tmp = 1 << (sizeof(type) * 8 - 1); \
310   - } \
311   - } \
312   - dest = tmp; \
313   - } while(0)
314   -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
315   -NEON_VOP(qadd_s8, neon_s8, 4)
316   -#undef NEON_FN
317   -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
318   -NEON_VOP(qadd_s16, neon_s16, 2)
319   -#undef NEON_FN
320   -#undef NEON_SSAT
321   -
322   -#define NEON_USAT(dest, src1, src2, type) do { \
323   - uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
324   - if (tmp != (type)tmp) { \
325   - env->QF = 1; \
326   - dest = 0; \
327   - } else { \
328   - dest = tmp; \
329   - }} while(0)
330   -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
331   -NEON_VOP(qsub_u8, neon_u8, 4)
332   -#undef NEON_FN
333   -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
334   -NEON_VOP(qsub_u16, neon_u16, 2)
335   -#undef NEON_FN
336   -#undef NEON_USAT
337   -
338   -#define NEON_SSAT(dest, src1, src2, type) do { \
339   - int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
340   - if (tmp != (type)tmp) { \
341   - env->QF = 1; \
342   - if (src2 < 0) { \
343   - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
344   - } else { \
345   - tmp = 1 << (sizeof(type) * 8 - 1); \
346   - } \
347   - } \
348   - dest = tmp; \
349   - } while(0)
350   -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
351   -NEON_VOP(qsub_s8, neon_s8, 4)
352   -#undef NEON_FN
353   -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
354   -NEON_VOP(qsub_s16, neon_s16, 2)
355   -#undef NEON_FN
356   -#undef NEON_SSAT
357   -
358   -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
359   -NEON_VOP(cgt_s8, neon_s8, 4)
360   -NEON_VOP(cgt_u8, neon_u8, 4)
361   -NEON_VOP(cgt_s16, neon_s16, 2)
362   -NEON_VOP(cgt_u16, neon_u16, 2)
363   -NEON_VOP(cgt_s32, neon_s32, 1)
364   -NEON_VOP(cgt_u32, neon_u32, 1)
365   -#undef NEON_FN
366   -
367   -#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
368   -NEON_VOP(cge_s8, neon_s8, 4)
369   -NEON_VOP(cge_u8, neon_u8, 4)
370   -NEON_VOP(cge_s16, neon_s16, 2)
371   -NEON_VOP(cge_u16, neon_u16, 2)
372   -NEON_VOP(cge_s32, neon_s32, 1)
373   -NEON_VOP(cge_u32, neon_u32, 1)
374   -#undef NEON_FN
375   -
376   -#define NEON_FN(dest, src1, src2) do { \
377   - int8_t tmp; \
378   - tmp = (int8_t)src2; \
379   - if (tmp < 0) { \
380   - dest = src1 >> -tmp; \
381   - } else { \
382   - dest = src1 << tmp; \
383   - }} while (0)
384   -NEON_VOP(shl_s8, neon_s8, 4)
385   -NEON_VOP(shl_u8, neon_u8, 4)
386   -NEON_VOP(shl_s16, neon_s16, 2)
387   -NEON_VOP(shl_u16, neon_u16, 2)
388   -NEON_VOP(shl_s32, neon_s32, 1)
389   -NEON_VOP(shl_u32, neon_u32, 1)
390   -#undef NEON_FN
391   -
392   -NEON_OP(shl_u64)
393   -{
394   - int8_t shift = env->vfp.scratch[0];
395   - uint64_t val = T0 | ((uint64_t)T1 << 32);
396   - if (shift < 0) {
397   - val >>= -shift;
398   - } else {
399   - val <<= shift;
400   - }
401   - T0 = val;
402   - T1 = val >> 32;
403   - FORCE_RET();
404   -}
405   -
406   -NEON_OP(shl_s64)
407   -{
408   - int8_t shift = env->vfp.scratch[0];
409   - int64_t val = T0 | ((uint64_t)T1 << 32);
410   - if (shift < 0) {
411   - val >>= -shift;
412   - } else {
413   - val <<= shift;
414   - }
415   - T0 = val;
416   - T1 = val >> 32;
417   - FORCE_RET();
418   -}
419   -
420   -#define NEON_FN(dest, src1, src2) do { \
421   - int8_t tmp; \
422   - tmp = (int8_t)src1; \
423   - if (tmp < 0) { \
424   - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
425   - } else { \
426   - dest = src2 << tmp; \
427   - }} while (0)
428   -
429   -NEON_VOP(rshl_s8, neon_s8, 4)
430   -NEON_VOP(rshl_u8, neon_u8, 4)
431   -NEON_VOP(rshl_s16, neon_s16, 2)
432   -NEON_VOP(rshl_u16, neon_u16, 2)
433   -NEON_VOP(rshl_s32, neon_s32, 1)
434   -NEON_VOP(rshl_u32, neon_u32, 1)
435   -#undef NEON_FN
436   -
437   -NEON_OP(rshl_u64)
438   -{
439   - int8_t shift = env->vfp.scratch[0];
440   - uint64_t val = T0 | ((uint64_t)T1 << 32);
441   - if (shift < 0) {
442   - val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
443   - val >>= -shift;
444   - } else {
445   - val <<= shift;
446   - }
447   - T0 = val;
448   - T1 = val >> 32;
449   - FORCE_RET();
450   -}
451   -
452   -NEON_OP(rshl_s64)
453   -{
454   - int8_t shift = env->vfp.scratch[0];
455   - int64_t val = T0 | ((uint64_t)T1 << 32);
456   - if (shift < 0) {
457   - val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
458   - } else {
459   - val <<= shift;
460   - }
461   - T0 = val;
462   - T1 = val >> 32;
463   - FORCE_RET();
464   -}
465   -
466   -#define NEON_FN(dest, src1, src2) do { \
467   - int8_t tmp; \
468   - tmp = (int8_t)src1; \
469   - if (tmp < 0) { \
470   - dest = src2 >> -tmp; \
471   - } else { \
472   - dest = src2 << tmp; \
473   - if ((dest >> tmp) != src2) { \
474   - env->QF = 1; \
475   - dest = ~0; \
476   - } \
477   - }} while (0)
478   -NEON_VOP(qshl_s8, neon_s8, 4)
479   -NEON_VOP(qshl_s16, neon_s16, 2)
480   -NEON_VOP(qshl_s32, neon_s32, 1)
481   -#undef NEON_FN
482   -
483   -NEON_OP(qshl_s64)
484   -{
485   - int8_t shift = env->vfp.scratch[0];
486   - int64_t val = T0 | ((uint64_t)T1 << 32);
487   - if (shift < 0) {
488   - val >>= -shift;
489   - } else {
490   - int64_t tmp = val;
491   - val <<= shift;
492   - if ((val >> shift) != tmp) {
493   - env->QF = 1;
494   - val = (tmp >> 63) ^ 0x7fffffffffffffffULL;
495   - }
496   - }
497   - T0 = val;
498   - T1 = val >> 32;
499   - FORCE_RET();
500   -}
501   -
502   -#define NEON_FN(dest, src1, src2) do { \
503   - int8_t tmp; \
504   - tmp = (int8_t)src1; \
505   - if (tmp < 0) { \
506   - dest = src2 >> -tmp; \
507   - } else { \
508   - dest = src2 << tmp; \
509   - if ((dest >> tmp) != src2) { \
510   - env->QF = 1; \
511   - dest = src2 >> 31; \
512   - } \
513   - }} while (0)
514   -NEON_VOP(qshl_u8, neon_u8, 4)
515   -NEON_VOP(qshl_u16, neon_u16, 2)
516   -NEON_VOP(qshl_u32, neon_u32, 1)
517   -#undef NEON_FN
518   -
519   -NEON_OP(qshl_u64)
520   -{
521   - int8_t shift = env->vfp.scratch[0];
522   - uint64_t val = T0 | ((uint64_t)T1 << 32);
523   - if (shift < 0) {
524   - val >>= -shift;
525   - } else {
526   - uint64_t tmp = val;
527   - val <<= shift;
528   - if ((val >> shift) != tmp) {
529   - env->QF = 1;
530   - val = ~(uint64_t)0;
531   - }
532   - }
533   - T0 = val;
534   - T1 = val >> 32;
535   - FORCE_RET();
536   -}
537   -
538   -#define NEON_FN(dest, src1, src2) do { \
539   - int8_t tmp; \
540   - tmp = (int8_t)src1; \
541   - if (tmp < 0) { \
542   - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
543   - } else { \
544   - dest = src2 << tmp; \
545   - if ((dest >> tmp) != src2) { \
546   - dest = ~0; \
547   - } \
548   - }} while (0)
549   -NEON_VOP(qrshl_s8, neon_s8, 4)
550   -NEON_VOP(qrshl_s16, neon_s16, 2)
551   -NEON_VOP(qrshl_s32, neon_s32, 1)
552   -#undef NEON_FN
553   -
554   -#define NEON_FN(dest, src1, src2) do { \
555   - int8_t tmp; \
556   - tmp = (int8_t)src1; \
557   - if (tmp < 0) { \
558   - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
559   - } else { \
560   - dest = src2 << tmp; \
561   - if ((dest >> tmp) != src2) { \
562   - env->QF = 1; \
563   - dest = src2 >> 31; \
564   - } \
565   - }} while (0)
566   -NEON_VOP(qrshl_u8, neon_u8, 4)
567   -NEON_VOP(qrshl_u16, neon_u16, 2)
568   -NEON_VOP(qrshl_u32, neon_u32, 1)
569   -#undef NEON_FN
570   -
571   -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
572   -NEON_VOP(max_s8, neon_s8, 4)
573   -NEON_VOP(max_u8, neon_u8, 4)
574   -NEON_VOP(max_s16, neon_s16, 2)
575   -NEON_VOP(max_u16, neon_u16, 2)
576   -NEON_VOP(max_s32, neon_s32, 1)
577   -NEON_VOP(max_u32, neon_u32, 1)
578   -NEON_POP(pmax_s8, neon_s8, 4)
579   -NEON_POP(pmax_u8, neon_u8, 4)
580   -NEON_POP(pmax_s16, neon_s16, 2)
581   -NEON_POP(pmax_u16, neon_u16, 2)
582   -#undef NEON_FN
583   -
584   -NEON_OP(max_f32)
585   -{
586   - float32 f0 = vfp_itos(T0);
587   - float32 f1 = vfp_itos(T1);
588   - T0 = (float32_compare_quiet(f0, f1, NFS) == 1) ? T0 : T1;
589   - FORCE_RET();
590   -}
591   -
592   -#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
593   -NEON_VOP(min_s8, neon_s8, 4)
594   -NEON_VOP(min_u8, neon_u8, 4)
595   -NEON_VOP(min_s16, neon_s16, 2)
596   -NEON_VOP(min_u16, neon_u16, 2)
597   -NEON_VOP(min_s32, neon_s32, 1)
598   -NEON_VOP(min_u32, neon_u32, 1)
599   -NEON_POP(pmin_s8, neon_s8, 4)
600   -NEON_POP(pmin_u8, neon_u8, 4)
601   -NEON_POP(pmin_s16, neon_s16, 2)
602   -NEON_POP(pmin_u16, neon_u16, 2)
603   -#undef NEON_FN
604   -
605   -NEON_OP(min_f32)
606   -{
607   - float32 f0 = vfp_itos(T0);
608   - float32 f1 = vfp_itos(T1);
609   - T0 = (float32_compare_quiet(f0, f1, NFS) == -1) ? T0 : T1;
610   - FORCE_RET();
611   -}
612   -
613   -#define NEON_FN(dest, src1, src2) \
614   - dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
615   -NEON_VOP(abd_s8, neon_s8, 4)
616   -NEON_VOP(abd_u8, neon_u8, 4)
617   -NEON_VOP(abd_s16, neon_s16, 2)
618   -NEON_VOP(abd_u16, neon_u16, 2)
619   -NEON_VOP(abd_s32, neon_s32, 1)
620   -NEON_VOP(abd_u32, neon_u32, 1)
621   -#undef NEON_FN
622   -
623   -NEON_OP(abd_f32)
624   -{
625   - float32 f0 = vfp_itos(T0);
626   - float32 f1 = vfp_itos(T1);
627   - T0 = vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
628   - ? float32_sub(f0, f1, NFS)
629   - : float32_sub(f1, f0, NFS));
630   - FORCE_RET();
631   -}
632   -
633   -#define NEON_FN(dest, src1, src2) dest = src1 + src2
634   -NEON_VOP(add_u8, neon_u8, 4)
635   -NEON_VOP(add_u16, neon_u16, 2)
636   -NEON_POP(padd_u8, neon_u8, 4)
637   -NEON_POP(padd_u16, neon_u16, 2)
638   -#undef NEON_FN
639   -
640   -NEON_OP(add_f32)
641   -{
642   - T0 = vfp_stoi(float32_add(vfp_itos(T0), vfp_itos(T1), NFS));
643   - FORCE_RET();
644   -}
645   -
646   -#define NEON_FN(dest, src1, src2) dest = src1 - src2
647   -NEON_VOP(sub_u8, neon_u8, 4)
648   -NEON_VOP(sub_u16, neon_u16, 2)
649   -#undef NEON_FN
650   -
651   -NEON_OP(sub_f32)
652   -{
653   - T0 = vfp_stoi(float32_sub(vfp_itos(T0), vfp_itos(T1), NFS));
654   - FORCE_RET();
655   -}
656   -
657   -#define NEON_FN(dest, src1, src2) dest = src2 - src1
658   -NEON_VOP(rsb_u8, neon_u8, 4)
659   -NEON_VOP(rsb_u16, neon_u16, 2)
660   -#undef NEON_FN
661   -
662   -NEON_OP(rsb_f32)
663   -{
664   - T0 = vfp_stoi(float32_sub(vfp_itos(T1), vfp_itos(T0), NFS));
665   - FORCE_RET();
666   -}
667   -
668   -#define NEON_FN(dest, src1, src2) dest = src1 * src2
669   -NEON_VOP(mul_u8, neon_u8, 4)
670   -NEON_VOP(mul_u16, neon_u16, 2)
671   -#undef NEON_FN
672   -
673   -NEON_OP(mul_f32)
674   -{
675   - T0 = vfp_stoi(float32_mul(vfp_itos(T0), vfp_itos(T1), NFS));
676   - FORCE_RET();
677   -}
678   -
679   -NEON_OP(mul_p8)
680   -{
681   - T0 = helper_neon_mul_p8(T0, T1);
682   -}
683   -
684   -#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
685   -NEON_VOP(tst_u8, neon_u8, 4)
686   -NEON_VOP(tst_u16, neon_u16, 2)
687   -NEON_VOP(tst_u32, neon_u32, 1)
688   -#undef NEON_FN
689   -
690   -#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
691   -NEON_VOP(ceq_u8, neon_u8, 4)
692   -NEON_VOP(ceq_u16, neon_u16, 2)
693   -NEON_VOP(ceq_u32, neon_u32, 1)
694   -#undef NEON_FN
695   -
696   -#define NEON_QDMULH16(dest, src1, src2, round) do { \
697   - uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
698   - if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
699   - env->QF = 1; \
700   - tmp = (tmp >> 31) ^ ~SIGNBIT; \
701   - } \
702   - tmp <<= 1; \
703   - if (round) { \
704   - int32_t old = tmp; \
705   - tmp += 1 << 15; \
706   - if ((int32_t)tmp < old) { \
707   - env->QF = 1; \
708   - tmp = SIGNBIT - 1; \
709   - } \
710   - } \
711   - dest = tmp >> 16; \
712   - } while(0)
713   -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
714   -NEON_VOP(qdmulh_s16, neon_s16, 2)
715   -#undef NEON_FN
716   -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
717   -NEON_VOP(qrdmulh_s16, neon_s16, 2)
718   -#undef NEON_FN
719   -#undef NEON_QDMULH16
720   -
721   -#define SIGNBIT64 ((uint64_t)1 << 63)
722   -#define NEON_QDMULH32(dest, src1, src2, round) do { \
723   - uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
724   - if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
725   - env->QF = 1; \
726   - tmp = (tmp >> 63) ^ ~SIGNBIT64; \
727   - } else { \
728   - tmp <<= 1; \
729   - } \
730   - if (round) { \
731   - int64_t old = tmp; \
732   - tmp += (int64_t)1 << 31; \
733   - if ((int64_t)tmp < old) { \
734   - env->QF = 1; \
735   - tmp = SIGNBIT64 - 1; \
736   - } \
737   - } \
738   - dest = tmp >> 32; \
739   - } while(0)
740   -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
741   -NEON_VOP(qdmulh_s32, neon_s32, 1)
742   -#undef NEON_FN
743   -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
744   -NEON_VOP(qrdmulh_s32, neon_s32, 1)
745   -#undef NEON_FN
746   -#undef NEON_QDMULH32
747   -
748   -/* Floating point comparisons produce an integer result. */
749   -#define NEON_VOP_FCMP(name, cmp) \
750   -NEON_OP(name) \
751   -{ \
752   - if (float32_compare_quiet(vfp_itos(T0), vfp_itos(T1), NFS) cmp 0) \
753   - T0 = -1; \
754   - else \
755   - T0 = 0; \
756   - FORCE_RET(); \
757   -}
758   -
759   -NEON_VOP_FCMP(ceq_f32, ==)
760   -NEON_VOP_FCMP(cge_f32, >=)
761   -NEON_VOP_FCMP(cgt_f32, >)
762   -
763   -NEON_OP(acge_f32)
764   -{
765   - float32 f0 = float32_abs(vfp_itos(T0));
766   - float32 f1 = float32_abs(vfp_itos(T1));
767   - T0 = (float32_compare_quiet(f0, f1,NFS) >= 0) ? -1 : 0;
768   - FORCE_RET();
769   -}
770   -
771   -NEON_OP(acgt_f32)
772   -{
773   - float32 f0 = float32_abs(vfp_itos(T0));
774   - float32 f1 = float32_abs(vfp_itos(T1));
775   - T0 = (float32_compare_quiet(f0, f1, NFS) > 0) ? -1 : 0;
776   - FORCE_RET();
777   -}
778   -
779   -/* Narrowing instructions. The named type is the destination type. */
780   -NEON_OP(narrow_u8)
781   -{
782   - T0 = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
783   - | ((T1 << 16) & 0xff0000) | (T1 << 24);
784   - FORCE_RET();
785   -}
786   -
787   -NEON_OP(narrow_sat_u8)
788   -{
789   - neon_u16 src;
790   - neon_u8 dest;
791   -#define SAT8(d, s) \
792   - if (s > 0xff) { \
793   - d = 0xff; \
794   - env->QF = 1; \
795   - } else { \
796   - d = s; \
797   - }
798   -
799   - NEON_UNPACK(neon_u16, src, T0);
800   - SAT8(dest.v1, src.v1);
801   - SAT8(dest.v2, src.v2);
802   - NEON_UNPACK(neon_u16, src, T1);
803   - SAT8(dest.v3, src.v1);
804   - SAT8(dest.v4, src.v2);
805   - NEON_PACK(neon_u8, T0, dest);
806   - FORCE_RET();
807   -#undef SAT8
808   -}
809   -
810   -NEON_OP(narrow_sat_s8)
811   -{
812   - neon_s16 src;
813   - neon_s8 dest;
814   -#define SAT8(d, s) \
815   - if (s != (uint8_t)s) { \
816   - d = (s >> 15) ^ 0x7f; \
817   - env->QF = 1; \
818   - } else { \
819   - d = s; \
820   - }
821   -
822   - NEON_UNPACK(neon_s16, src, T0);
823   - SAT8(dest.v1, src.v1);
824   - SAT8(dest.v2, src.v2);
825   - NEON_UNPACK(neon_s16, src, T1);
826   - SAT8(dest.v3, src.v1);
827   - SAT8(dest.v4, src.v2);
828   - NEON_PACK(neon_s8, T0, dest);
829   - FORCE_RET();
830   -#undef SAT8
831   -}
832   -
833   -NEON_OP(narrow_u16)
834   -{
835   - T0 = (T0 & 0xffff) | (T1 << 16);
836   -}
837   -
838   -NEON_OP(narrow_sat_u16)
839   -{
840   - if (T0 > 0xffff) {
841   - T0 = 0xffff;
842   - env->QF = 1;
843   - }
844   - if (T1 > 0xffff) {
845   - T1 = 0xffff;
846   - env->QF = 1;
847   - }
848   - T0 |= T1 << 16;
849   - FORCE_RET();
850   -}
851   -
852   -NEON_OP(narrow_sat_s16)
853   -{
854   - if ((int32_t)T0 != (int16_t)T0) {
855   - T0 = ((int32_t)T0 >> 31) ^ 0x7fff;
856   - env->QF = 1;
857   - }
858   - if ((int32_t)T1 != (int16_t) T1) {
859   - T1 = ((int32_t)T1 >> 31) ^ 0x7fff;
860   - env->QF = 1;
861   - }
862   - T0 = (uint16_t)T0 | (T1 << 16);
863   - FORCE_RET();
864   -}
865   -
866   -NEON_OP(narrow_sat_u32)
867   -{
868   - if (T1) {
869   - T0 = 0xffffffffu;
870   - env->QF = 1;
871   - }
872   - FORCE_RET();
873   -}
874   -
875   -NEON_OP(narrow_sat_s32)
876   -{
877   - int32_t sign = (int32_t)T1 >> 31;
878   -
879   - if ((int32_t)T1 != sign) {
880   - T0 = sign ^ 0x7fffffff;
881   - env->QF = 1;
882   - }
883   - FORCE_RET();
884   -}
885   -
886   -/* Narrowing instructions. Named type is the narrow type. */
887   -NEON_OP(narrow_high_u8)
888   -{
889   - T0 = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
890   - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
891   - FORCE_RET();
892   -}
893   -
894   -NEON_OP(narrow_high_u16)
895   -{
896   - T0 = (T0 >> 16) | (T1 & 0xffff0000);
897   - FORCE_RET();
898   -}
899   -
900   -NEON_OP(narrow_high_round_u8)
901   -{
902   - T0 = (((T0 + 0x80) >> 8) & 0xff) | (((T0 + 0x800000) >> 16) & 0xff00)
903   - | (((T1 + 0x80) << 8) & 0xff0000) | ((T1 + 0x800000) & 0xff000000);
904   - FORCE_RET();
905   -}
906   -
907   -NEON_OP(narrow_high_round_u16)
908   -{
909   - T0 = ((T0 + 0x8000) >> 16) | ((T1 + 0x8000) & 0xffff0000);
910   - FORCE_RET();
911   -}
912   -
913   -NEON_OP(narrow_high_round_u32)
914   -{
915   - if (T0 >= 0x80000000u)
916   - T0 = T1 + 1;
917   - else
918   - T0 = T1;
919   - FORCE_RET();
920   -}
921   -
922   -/* Widening instructions. Named type is source type. */
923   -NEON_OP(widen_s8)
924   -{
925   - uint32_t src;
926   -
927   - src = T0;
928   - T0 = (uint16_t)(int8_t)src | ((int8_t)(src >> 8) << 16);
929   - T1 = (uint16_t)(int8_t)(src >> 16) | ((int8_t)(src >> 24) << 16);
930   -}
931   -
932   -NEON_OP(widen_u8)
933   -{
934   - T1 = ((T0 >> 8) & 0xff0000) | ((T0 >> 16) & 0xff);
935   - T0 = ((T0 << 8) & 0xff0000) | (T0 & 0xff);
936   -}
937   -
938   -NEON_OP(widen_s16)
939   -{
940   - int32_t src;
941   -
942   - src = T0;
943   - T0 = (int16_t)src;
944   - T1 = src >> 16;
945   -}
946   -
947   -NEON_OP(widen_u16)
948   -{
949   - T1 = T0 >> 16;
950   - T0 &= 0xffff;
951   -}
952   -
953   -NEON_OP(widen_s32)
954   -{
955   - T1 = (int32_t)T0 >> 31;
956   - FORCE_RET();
957   -}
958   -
959   -NEON_OP(widen_high_u8)
960   -{
961   - T1 = (T0 & 0xff000000) | ((T0 >> 8) & 0xff00);
962   - T0 = ((T0 << 16) & 0xff000000) | ((T0 << 8) & 0xff00);
963   -}
964   -
965   -NEON_OP(widen_high_u16)
966   -{
967   - T1 = T0 & 0xffff0000;
968   - T0 <<= 16;
969   -}
970   -
971   -/* Long operations. The type is the wide type. */
972   -NEON_OP(shll_u16)
973   -{
974   - int shift = PARAM1;
975   - uint32_t mask;
976   -
977   - mask = 0xffff >> (16 - shift);
978   - mask |= mask << 16;
979   - mask = ~mask;
980   -
981   - T0 = (T0 << shift) & mask;
982   - T1 = (T1 << shift) & mask;
983   - FORCE_RET();
984   -}
985   -
986   -NEON_OP(shll_u64)
987   -{
988   - int shift = PARAM1;
989   -
990   - T1 <<= shift;
991   - T1 |= T0 >> (32 - shift);
992   - T0 <<= shift;
993   - FORCE_RET();
994   -}
995   -
996   -NEON_OP(addl_u16)
997   -{
998   - uint32_t tmp;
999   - uint32_t high;
1000   -
1001   - tmp = env->vfp.scratch[0];
1002   - high = (T0 >> 16) + (tmp >> 16);
1003   - T0 = (uint16_t)(T0 + tmp);
1004   - T0 |= (high << 16);
1005   - tmp = env->vfp.scratch[1];
1006   - high = (T1 >> 16) + (tmp >> 16);
1007   - T1 = (uint16_t)(T1 + tmp);
1008   - T1 |= (high << 16);
1009   - FORCE_RET();
1010   -}
1011   -
1012   -NEON_OP(addl_u32)
1013   -{
1014   - T0 += env->vfp.scratch[0];
1015   - T1 += env->vfp.scratch[1];
1016   - FORCE_RET();
1017   -}
1018   -
1019   -NEON_OP(addl_u64)
1020   -{
1021   - uint64_t tmp;
1022   - tmp = T0 | ((uint64_t)T1 << 32);
1023   - tmp += env->vfp.scratch[0];
1024   - tmp += (uint64_t)env->vfp.scratch[1] << 32;
1025   - T0 = tmp;
1026   - T1 = tmp >> 32;
1027   - FORCE_RET();
1028   -}
1029   -
1030   -NEON_OP(subl_u16)
1031   -{
1032   - uint32_t tmp;
1033   - uint32_t high;
1034   -
1035   - tmp = env->vfp.scratch[0];
1036   - high = (T0 >> 16) - (tmp >> 16);
1037   - T0 = (uint16_t)(T0 - tmp);
1038   - T0 |= (high << 16);
1039   - tmp = env->vfp.scratch[1];
1040   - high = (T1 >> 16) - (tmp >> 16);
1041   - T1 = (uint16_t)(T1 - tmp);
1042   - T1 |= (high << 16);
1043   - FORCE_RET();
1044   -}
1045   -
1046   -NEON_OP(subl_u32)
1047   -{
1048   - T0 -= env->vfp.scratch[0];
1049   - T1 -= env->vfp.scratch[1];
1050   - FORCE_RET();
1051   -}
1052   -
1053   -NEON_OP(subl_u64)
1054   -{
1055   - uint64_t tmp;
1056   - tmp = T0 | ((uint64_t)T1 << 32);
1057   - tmp -= env->vfp.scratch[0];
1058   - tmp -= (uint64_t)env->vfp.scratch[1] << 32;
1059   - T0 = tmp;
1060   - T1 = tmp >> 32;
1061   - FORCE_RET();
1062   -}
1063   -
1064   -#define DO_ABD(dest, x, y, type) do { \
1065   - type tmp_x = x; \
1066   - type tmp_y = y; \
1067   - dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
1068   - } while(0)
1069   -
1070   -NEON_OP(abdl_u16)
1071   -{
1072   - uint32_t tmp;
1073   - uint32_t low;
1074   - uint32_t high;
1075   -
1076   - DO_ABD(low, T0, T1, uint8_t);
1077   - DO_ABD(tmp, T0 >> 8, T1 >> 8, uint8_t);
1078   - low |= tmp << 16;
1079   - DO_ABD(high, T0 >> 16, T1 >> 16, uint8_t);
1080   - DO_ABD(tmp, T0 >> 24, T1 >> 24, uint8_t);
1081   - high |= tmp << 16;
1082   - T0 = low;
1083   - T1 = high;
1084   - FORCE_RET();
1085   -}
1086   -
1087   -NEON_OP(abdl_s16)
1088   -{
1089   - uint32_t tmp;
1090   - uint32_t low;
1091   - uint32_t high;
1092   -
1093   - DO_ABD(low, T0, T1, int8_t);
1094   - DO_ABD(tmp, T0 >> 8, T1 >> 8, int8_t);
1095   - low |= tmp << 16;
1096   - DO_ABD(high, T0 >> 16, T1 >> 16, int8_t);
1097   - DO_ABD(tmp, T0 >> 24, T1 >> 24, int8_t);
1098   - high |= tmp << 16;
1099   - T0 = low;
1100   - T1 = high;
1101   - FORCE_RET();
1102   -}
1103   -
1104   -NEON_OP(abdl_u32)
1105   -{
1106   - uint32_t low;
1107   - uint32_t high;
1108   -
1109   - DO_ABD(low, T0, T1, uint16_t);
1110   - DO_ABD(high, T0 >> 16, T1 >> 16, uint16_t);
1111   - T0 = low;
1112   - T1 = high;
1113   - FORCE_RET();
1114   -}
1115   -
1116   -NEON_OP(abdl_s32)
1117   -{
1118   - uint32_t low;
1119   - uint32_t high;
1120   -
1121   - DO_ABD(low, T0, T1, int16_t);
1122   - DO_ABD(high, T0 >> 16, T1 >> 16, int16_t);
1123   - T0 = low;
1124   - T1 = high;
1125   - FORCE_RET();
1126   -}
1127   -
1128   -NEON_OP(abdl_u64)
1129   -{
1130   - DO_ABD(T0, T0, T1, uint32_t);
1131   - T1 = 0;
1132   -}
1133   -
1134   -NEON_OP(abdl_s64)
1135   -{
1136   - DO_ABD(T0, T0, T1, int32_t);
1137   - T1 = 0;
1138   -}
1139   -#undef DO_ABD
1140   -
1141   -/* Widening multiply. Named type is the source type. */
1142   -#define DO_MULL(dest, x, y, type1, type2) do { \
1143   - type1 tmp_x = x; \
1144   - type1 tmp_y = y; \
1145   - dest = (type2)((type2)tmp_x * (type2)tmp_y); \
1146   - } while(0)
1147   -
1148   -NEON_OP(mull_u8)
1149   -{
1150   - uint32_t tmp;
1151   - uint32_t low;
1152   - uint32_t high;
1153   -
1154   - DO_MULL(low, T0, T1, uint8_t, uint16_t);
1155   - DO_MULL(tmp, T0 >> 8, T1 >> 8, uint8_t, uint16_t);
1156   - low |= tmp << 16;
1157   - DO_MULL(high, T0 >> 16, T1 >> 16, uint8_t, uint16_t);
1158   - DO_MULL(tmp, T0 >> 24, T1 >> 24, uint8_t, uint16_t);
1159   - high |= tmp << 16;
1160   - T0 = low;
1161   - T1 = high;
1162   - FORCE_RET();
1163   -}
1164   -
1165   -NEON_OP(mull_s8)
1166   -{
1167   - uint32_t tmp;
1168   - uint32_t low;
1169   - uint32_t high;
1170   -
1171   - DO_MULL(low, T0, T1, int8_t, uint16_t);
1172   - DO_MULL(tmp, T0 >> 8, T1 >> 8, int8_t, uint16_t);
1173   - low |= tmp << 16;
1174   - DO_MULL(high, T0 >> 16, T1 >> 16, int8_t, uint16_t);
1175   - DO_MULL(tmp, T0 >> 24, T1 >> 24, int8_t, uint16_t);
1176   - high |= tmp << 16;
1177   - T0 = low;
1178   - T1 = high;
1179   - FORCE_RET();
1180   -}
1181   -
1182   -NEON_OP(mull_u16)
1183   -{
1184   - uint32_t low;
1185   - uint32_t high;
1186   -
1187   - DO_MULL(low, T0, T1, uint16_t, uint32_t);
1188   - DO_MULL(high, T0 >> 16, T1 >> 16, uint16_t, uint32_t);
1189   - T0 = low;
1190   - T1 = high;
1191   - FORCE_RET();
1192   -}
1193   -
1194   -NEON_OP(mull_s16)
1195   -{
1196   - uint32_t low;
1197   - uint32_t high;
1198   -
1199   - DO_MULL(low, T0, T1, int16_t, uint32_t);
1200   - DO_MULL(high, T0 >> 16, T1 >> 16, int16_t, uint32_t);
1201   - T0 = low;
1202   - T1 = high;
1203   - FORCE_RET();
1204   -}
1205   -
1206   -NEON_OP(addl_saturate_s32)
1207   -{
1208   - uint32_t tmp;
1209   - uint32_t res;
1210   -
1211   - tmp = env->vfp.scratch[0];
1212   - res = T0 + tmp;
1213   - if (((res ^ T0) & SIGNBIT) && !((T0 ^ tmp) & SIGNBIT)) {
1214   - env->QF = 1;
1215   - T0 = (T0 >> 31) ^ 0x7fffffff;
1216   - } else {
1217   - T0 = res;
1218   - }
1219   - tmp = env->vfp.scratch[1];
1220   - res = T1 + tmp;
1221   - if (((res ^ T1) & SIGNBIT) && !((T1 ^ tmp) & SIGNBIT)) {
1222   - env->QF = 1;
1223   - T1 = (T1 >> 31) ^ 0x7fffffff;
1224   - } else {
1225   - T1 = res;
1226   - }
1227   - FORCE_RET();
1228   -}
1229   -
1230   -NEON_OP(addl_saturate_s64)
1231   -{
1232   - uint64_t src1;
1233   - uint64_t src2;
1234   - uint64_t res;
1235   -
1236   - src1 = T0 + ((uint64_t)T1 << 32);
1237   - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1238   - res = src1 + src2;
1239   - if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
1240   - env->QF = 1;
1241   - T0 = ~(int64_t)src1 >> 63;
1242   - T1 = T0 ^ 0x80000000;
1243   - } else {
1244   - T0 = res;
1245   - T1 = res >> 32;
1246   - }
1247   - FORCE_RET();
1248   -}
1249   -
1250   -NEON_OP(addl_saturate_u64)
1251   -{
1252   - uint64_t src1;
1253   - uint64_t src2;
1254   - uint64_t res;
1255   -
1256   - src1 = T0 + ((uint64_t)T1 << 32);
1257   - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1258   - res = src1 + src2;
1259   - if (res < src1) {
1260   - env->QF = 1;
1261   - T0 = 0xffffffff;
1262   - T1 = 0xffffffff;
1263   - } else {
1264   - T0 = res;
1265   - T1 = res >> 32;
1266   - }
1267   - FORCE_RET();
1268   -}
1269   -
1270   -NEON_OP(subl_saturate_s64)
1271   -{
1272   - uint64_t src1;
1273   - uint64_t src2;
1274   - uint64_t res;
1275   -
1276   - src1 = T0 + ((uint64_t)T1 << 32);
1277   - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1278   - res = src1 - src2;
1279   - if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
1280   - env->QF = 1;
1281   - T0 = ~(int64_t)src1 >> 63;
1282   - T1 = T0 ^ 0x80000000;
1283   - } else {
1284   - T0 = res;
1285   - T1 = res >> 32;
1286   - }
1287   - FORCE_RET();
1288   -}
1289   -
1290   -NEON_OP(subl_saturate_u64)
1291   -{
1292   - uint64_t src1;
1293   - uint64_t src2;
1294   - uint64_t res;
1295   -
1296   - src1 = T0 + ((uint64_t)T1 << 32);
1297   - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1298   - if (src1 < src2) {
1299   - env->QF = 1;
1300   - T0 = 0;
1301   - T1 = 0;
1302   - } else {
1303   - res = src1 - src2;
1304   - T0 = res;
1305   - T1 = res >> 32;
1306   - }
1307   - FORCE_RET();
1308   -}
1309   -
1310   -NEON_OP(negl_u16)
1311   -{
1312   - uint32_t tmp;
1313   - tmp = T0 >> 16;
1314   - tmp = -tmp;
1315   - T0 = (-T0 & 0xffff) | (tmp << 16);
1316   - tmp = T1 >> 16;
1317   - tmp = -tmp;
1318   - T1 = (-T1 & 0xffff) | (tmp << 16);
1319   - FORCE_RET();
1320   -}
1321   -
1322   -NEON_OP(negl_u32)
1323   -{
1324   - T0 = -T0;
1325   - T1 = -T1;
1326   - FORCE_RET();
1327   -}
1328   -
1329   -NEON_OP(negl_u64)
1330   -{
1331   - uint64_t val;
1332   -
1333   - val = T0 | ((uint64_t)T1 << 32);
1334   - val = -val;
1335   - T0 = val;
1336   - T1 = val >> 32;
1337   - FORCE_RET();
1338   -}
1339   -
1340   -/* Scalar operations. */
1341   -NEON_OP(dup_low16)
1342   -{
1343   - T0 = (T0 & 0xffff) | (T0 << 16);
1344   - FORCE_RET();
1345   -}
1346   -
1347   -NEON_OP(dup_high16)
1348   -{
1349   - T0 = (T0 >> 16) | (T0 & 0xffff0000);
1350   - FORCE_RET();
1351   -}
1352   -
1353   -/* Helper for VEXT */
1354   -NEON_OP(extract)
1355   -{
1356   - int shift = PARAM1;
1357   - T0 = (T0 >> shift) | (T1 << (32 - shift));
1358   - FORCE_RET();
1359   -}
1360   -
1361   -/* Pairwise add long. Named type is source type. */
1362   -NEON_OP(paddl_s8)
1363   -{
1364   - int8_t src1;
1365   - int8_t src2;
1366   - uint16_t result;
1367   - src1 = T0 >> 24;
1368   - src2 = T0 >> 16;
1369   - result = (uint16_t)src1 + src2;
1370   - src1 = T0 >> 8;
1371   - src2 = T0;
1372   - T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
1373   - FORCE_RET();
1374   -}
1375   -
1376   -NEON_OP(paddl_u8)
1377   -{
1378   - uint8_t src1;
1379   - uint8_t src2;
1380   - uint16_t result;
1381   - src1 = T0 >> 24;
1382   - src2 = T0 >> 16;
1383   - result = (uint16_t)src1 + src2;
1384   - src1 = T0 >> 8;
1385   - src2 = T0;
1386   - T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
1387   - FORCE_RET();
1388   -}
1389   -
1390   -NEON_OP(paddl_s16)
1391   -{
1392   - T0 = (uint32_t)(int16_t)T0 + (uint32_t)(int16_t)(T0 >> 16);
1393   - FORCE_RET();
1394   -}
1395   -
1396   -NEON_OP(paddl_u16)
1397   -{
1398   - T0 = (uint32_t)(uint16_t)T0 + (uint32_t)(uint16_t)(T0 >> 16);
1399   - FORCE_RET();
1400   -}
1401   -
1402   -NEON_OP(paddl_s32)
1403   -{
1404   - int64_t tmp;
1405   - tmp = (int64_t)(int32_t)T0 + (int64_t)(int32_t)T1;
1406   - T0 = tmp;
1407   - T1 = tmp >> 32;
1408   - FORCE_RET();
1409   -}
1410   -
1411   -NEON_OP(paddl_u32)
1412   -{
1413   - uint64_t tmp;
1414   - tmp = (uint64_t)T0 + (uint64_t)T1;
1415   - T0 = tmp;
1416   - T1 = tmp >> 32;
1417   - FORCE_RET();
1418   -}
1419   -
1420   -/* Count Leading Sign/Zero Bits. */
1421   -static inline int do_clz8(uint8_t x)
1422   -{
1423   - int n;
1424   - for (n = 8; x; n--)
1425   - x >>= 1;
1426   - return n;
1427   -}
1428   -
1429   -static inline int do_clz16(uint16_t x)
1430   -{
1431   - int n;
1432   - for (n = 16; x; n--)
1433   - x >>= 1;
1434   - return n;
1435   -}
1436   -
1437   -NEON_OP(clz_u8)
1438   -{
1439   - uint32_t result;
1440   - uint32_t tmp;
1441   -
1442   - tmp = T0;
1443   - result = do_clz8(tmp);
1444   - result |= do_clz8(tmp >> 8) << 8;
1445   - result |= do_clz8(tmp >> 16) << 16;
1446   - result |= do_clz8(tmp >> 24) << 24;
1447   - T0 = result;
1448   - FORCE_RET();
1449   -}
1450   -
1451   -NEON_OP(clz_u16)
1452   -{
1453   - uint32_t result;
1454   - uint32_t tmp;
1455   - tmp = T0;
1456   - result = do_clz16(tmp);
1457   - result |= do_clz16(tmp >> 16) << 16;
1458   - T0 = result;
1459   - FORCE_RET();
1460   -}
1461   -
1462   -NEON_OP(cls_s8)
1463   -{
1464   - uint32_t result;
1465   - int8_t tmp;
1466   - tmp = T0;
1467   - result = do_clz8((tmp < 0) ? ~tmp : tmp) - 1;
1468   - tmp = T0 >> 8;
1469   - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 8;
1470   - tmp = T0 >> 16;
1471   - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1472   - tmp = T0 >> 24;
1473   - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 24;
1474   - T0 = result;
1475   - FORCE_RET();
1476   -}
1477   -
1478   -NEON_OP(cls_s16)
1479   -{
1480   - uint32_t result;
1481   - int16_t tmp;
1482   - tmp = T0;
1483   - result = do_clz16((tmp < 0) ? ~tmp : tmp) - 1;
1484   - tmp = T0 >> 16;
1485   - result |= (do_clz16((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1486   - T0 = result;
1487   - FORCE_RET();
1488   -}
1489   -
1490   -NEON_OP(cls_s32)
1491   -{
1492   - int count;
1493   - if ((int32_t)T0 < 0)
1494   - T0 = ~T0;
1495   - for (count = 32; T0 > 0; count--)
1496   - T0 = T0 >> 1;
1497   - T0 = count - 1;
1498   - FORCE_RET();
1499   -}
1500   -
1501   -/* Bit count. */
1502   -NEON_OP(cnt_u8)
1503   -{
1504   - T0 = (T0 & 0x55555555) + ((T0 >> 1) & 0x55555555);
1505   - T0 = (T0 & 0x33333333) + ((T0 >> 2) & 0x33333333);
1506   - T0 = (T0 & 0x0f0f0f0f) + ((T0 >> 4) & 0x0f0f0f0f);
1507   - FORCE_RET();
1508   -}
1509   -
1510   -/* Saturating negation. */
1511   -/* ??? Make these use NEON_VOP1 */
1512   -#define DO_QABS8(x) do { \
1513   - if (x == (int8_t)0x80) { \
1514   - x = 0x7f; \
1515   - env->QF = 1; \
1516   - } else if (x < 0) { \
1517   - x = -x; \
1518   - }} while (0)
1519   -NEON_OP(qabs_s8)
1520   -{
1521   - neon_s8 vec;
1522   - NEON_UNPACK(neon_s8, vec, T0);
1523   - DO_QABS8(vec.v1);
1524   - DO_QABS8(vec.v2);
1525   - DO_QABS8(vec.v3);
1526   - DO_QABS8(vec.v4);
1527   - NEON_PACK(neon_s8, T0, vec);
1528   - FORCE_RET();
1529   -}
1530   -#undef DO_QABS8
1531   -
1532   -#define DO_QNEG8(x) do { \
1533   - if (x == (int8_t)0x80) { \
1534   - x = 0x7f; \
1535   - env->QF = 1; \
1536   - } else { \
1537   - x = -x; \
1538   - }} while (0)
1539   -NEON_OP(qneg_s8)
1540   -{
1541   - neon_s8 vec;
1542   - NEON_UNPACK(neon_s8, vec, T0);
1543   - DO_QNEG8(vec.v1);
1544   - DO_QNEG8(vec.v2);
1545   - DO_QNEG8(vec.v3);
1546   - DO_QNEG8(vec.v4);
1547   - NEON_PACK(neon_s8, T0, vec);
1548   - FORCE_RET();
1549   -}
1550   -#undef DO_QNEG8
1551   -
1552   -#define DO_QABS16(x) do { \
1553   - if (x == (int16_t)0x8000) { \
1554   - x = 0x7fff; \
1555   - env->QF = 1; \
1556   - } else if (x < 0) { \
1557   - x = -x; \
1558   - }} while (0)
1559   -NEON_OP(qabs_s16)
1560   -{
1561   - neon_s16 vec;
1562   - NEON_UNPACK(neon_s16, vec, T0);
1563   - DO_QABS16(vec.v1);
1564   - DO_QABS16(vec.v2);
1565   - NEON_PACK(neon_s16, T0, vec);
1566   - FORCE_RET();
1567   -}
1568   -#undef DO_QABS16
1569   -
1570   -#define DO_QNEG16(x) do { \
1571   - if (x == (int16_t)0x8000) { \
1572   - x = 0x7fff; \
1573   - env->QF = 1; \
1574   - } else { \
1575   - x = -x; \
1576   - }} while (0)
1577   -NEON_OP(qneg_s16)
1578   -{
1579   - neon_s16 vec;
1580   - NEON_UNPACK(neon_s16, vec, T0);
1581   - DO_QNEG16(vec.v1);
1582   - DO_QNEG16(vec.v2);
1583   - NEON_PACK(neon_s16, T0, vec);
1584   - FORCE_RET();
1585   -}
1586   -#undef DO_QNEG16
1587   -
1588   -NEON_OP(qabs_s32)
1589   -{
1590   - if (T0 == 0x80000000) {
1591   - T0 = 0x7fffffff;
1592   - env->QF = 1;
1593   - } else if ((int32_t)T0 < 0) {
1594   - T0 = -T0;
1595   - }
1596   - FORCE_RET();
1597   -}
1598   -
1599   -NEON_OP(qneg_s32)
1600   -{
1601   - if (T0 == 0x80000000) {
1602   - T0 = 0x7fffffff;
1603   - env->QF = 1;
1604   - } else {
1605   - T0 = -T0;
1606   - }
1607   - FORCE_RET();
1608   -}
1609   -
1610   -/* Unary operations */
1611   -#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
1612   -NEON_VOP1(abs_s8, neon_s8, 4)
1613   -NEON_VOP1(abs_s16, neon_s16, 2)
1614   -NEON_OP(abs_s32)
1615   -{
1616   - if ((int32_t)T0 < 0)
1617   - T0 = -T0;
1618   - FORCE_RET();
1619   -}
1620   -#undef NEON_FN
1621   -
1622   -/* Transpose. Argument order is rather strange to avoid special casing
1623   - the translation code.
1624   - On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */
1625   -NEON_OP(trn_u8)
1626   -{
1627   - uint32_t rd;
1628   - uint32_t rm;
1629   - rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
1630   - rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
1631   - T0 = rd;
1632   - T1 = rm;
1633   - FORCE_RET();
1634   -}
1635   -
1636   -NEON_OP(trn_u16)
1637   -{
1638   - uint32_t rd;
1639   - uint32_t rm;
1640   - rd = (T0 << 16) | (T1 & 0xffff);
1641   - rm = (T1 >> 16) | (T0 & 0xffff0000);
1642   - T0 = rd;
1643   - T1 = rm;
1644   - FORCE_RET();
1645   -}
1646   -
1647   -/* Worker routines for zip and unzip. */
1648   -NEON_OP(unzip_u8)
1649   -{
1650   - uint32_t rd;
1651   - uint32_t rm;
1652   - rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
1653   - | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
1654   - rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
1655   - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
1656   - T0 = rd;
1657   - T1 = rm;
1658   - FORCE_RET();
1659   -}
1660   -
1661   -NEON_OP(zip_u8)
1662   -{
1663   - uint32_t rd;
1664   - uint32_t rm;
1665   - rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
1666   - | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
1667   - rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
1668   - | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
1669   - T0 = rd;
1670   - T1 = rm;
1671   - FORCE_RET();
1672   -}
1673   -
1674   -NEON_OP(zip_u16)
1675   -{
1676   - uint32_t tmp;
1677   -
1678   - tmp = (T0 & 0xffff) | (T1 << 16);
1679   - T1 = (T1 & 0xffff0000) | (T0 >> 16);
1680   - T0 = tmp;
1681   - FORCE_RET();
1682   -}
1683   -
1684   -NEON_OP(dup_u8)
1685   -{
1686   - T0 = (T0 >> PARAM1) & 0xff;
1687   - T0 |= T0 << 8;
1688   - T0 |= T0 << 16;
1689   - FORCE_RET();
1690   -}
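
The deleted header's NEON_UNPACK/NEON_PACK machinery survives in the same form in neon_helper.c: a union lets several narrow lanes be carried in one uint32_t and processed per lane with no inter-lane carries. A standalone sketch (plain C, not part of the patch; hadd_u8 mirrors the halving-add NEON_FN, with field order as on a little-endian host):

#include <stdint.h>
#include <stdio.h>

typedef struct { uint8_t v1, v2, v3, v4; } neon_u8;

static uint32_t hadd_u8(uint32_t a, uint32_t b)
{
    union { neon_u8 v; uint32_t i; } ua, ub, ud;
    ua.i = a;                             /* NEON_UNPACK */
    ub.i = b;
    ud.v.v1 = (ua.v.v1 + ub.v.v1) >> 1;   /* NEON_FN applied per lane */
    ud.v.v2 = (ua.v.v2 + ub.v.v2) >> 1;
    ud.v.v3 = (ua.v.v3 + ub.v.v3) >> 1;
    ud.v.v4 = (ua.v.v4 + ub.v.v4) >> 1;
    return ud.i;                          /* NEON_PACK */
}

int main(void)
{
    /* The 0xff+0x01 lane halves to 0x80 without carrying into its
       neighbours, because each lane is computed in isolation. */
    printf("%08x\n", (unsigned)hadd_u8(0xff204060, 0x01204060)); /* 80204060 */
    return 0;
}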
target-arm/translate.c
... ... @@ -77,6 +77,9 @@ extern FILE *logfile;
77 77 extern int loglevel;
78 78  
79 79 static TCGv cpu_env;
  80 +/* We reuse the same 64-bit temporaries for efficiency. */
  81 +static TCGv cpu_V0, cpu_V1;
  82 +
80 83 /* FIXME: These should be removed. */
81 84 static TCGv cpu_T[2];
82 85 static TCGv cpu_F0s, cpu_F1s, cpu_F0d, cpu_F1d;
... ... @@ -469,6 +472,9 @@ static inline void gen_op_bicl_T0_T1(void)
469 472 }
470 473  
471 474 /* FIXME: Implement this natively. */
  475 +#define tcg_gen_abs_i32(t0, t1) gen_helper_abs(t0, t1)
  476 +
  477 +/* FIXME: Implement this natively. */
472 478 static void tcg_gen_rori_i32(TCGv t0, TCGv t1, int i)
473 479 {
474 480 TCGv tmp;
... ... @@ -1166,8 +1172,13 @@ neon_reg_offset (int reg, int n)
1166 1172 return vfp_reg_offset(0, sreg);
1167 1173 }
1168 1174  
1169   -#define NEON_GET_REG(T, reg, n) gen_op_neon_getreg_##T(neon_reg_offset(reg, n))
1170   -#define NEON_SET_REG(T, reg, n) gen_op_neon_setreg_##T(neon_reg_offset(reg, n))
  1175 +/* FIXME: Remove these. */
  1176 +#define neon_T0 cpu_T[0]
  1177 +#define neon_T1 cpu_T[1]
  1178 +#define NEON_GET_REG(T, reg, n) \
  1179 + tcg_gen_ld_i32(neon_##T, cpu_env, neon_reg_offset(reg, n))
  1180 +#define NEON_SET_REG(T, reg, n) \
  1181 + tcg_gen_st_i32(neon_##T, cpu_env, neon_reg_offset(reg, n))
1171 1182  
1172 1183 static TCGv neon_load_reg(int reg, int pass)
1173 1184 {
... ... @@ -1182,6 +1193,16 @@ static void neon_store_reg(int reg, int pass, TCGv var)
1182 1193 dead_tmp(var);
1183 1194 }
1184 1195  
  1196 +static inline void neon_load_reg64(TCGv var, int reg)
  1197 +{
  1198 + tcg_gen_ld_i64(var, cpu_env, vfp_reg_offset(1, reg));
  1199 +}
  1200 +
  1201 +static inline void neon_store_reg64(TCGv var, int reg)
  1202 +{
  1203 + tcg_gen_st_i64(var, cpu_env, vfp_reg_offset(1, reg));
  1204 +}
  1205 +
1185 1206 #define tcg_gen_ld_f32 tcg_gen_ld_i32
1186 1207 #define tcg_gen_ld_f64 tcg_gen_ld_i64
1187 1208 #define tcg_gen_st_f32 tcg_gen_st_i32
... ... @@ -2418,6 +2439,37 @@ vfp_enabled(CPUState * env)
2418 2439 return ((env->vfp.xregs[ARM_VFP_FPEXC] & (1 << 30)) != 0);
2419 2440 }
2420 2441  
  2442 +static void gen_neon_dup_u8(TCGv var, int shift)
  2443 +{
  2444 + TCGv tmp = new_tmp();
  2445 + if (shift)
  2446 + tcg_gen_shri_i32(var, var, shift);
  2447 + tcg_gen_andi_i32(var, var, 0xff);
  2448 + tcg_gen_shli_i32(tmp, var, 8);
  2449 + tcg_gen_or_i32(var, var, tmp);
  2450 + tcg_gen_shli_i32(tmp, var, 16);
  2451 + tcg_gen_or_i32(var, var, tmp);
  2452 + dead_tmp(tmp);
  2453 +}
  2454 +
  2455 +static void gen_neon_dup_low16(TCGv var)
  2456 +{
  2457 + TCGv tmp = new_tmp();
  2458 + tcg_gen_andi_i32(var, var, 0xffff);
  2459 + tcg_gen_shli_i32(tmp, var, 16);
  2460 + tcg_gen_or_i32(var, var, tmp);
  2461 + dead_tmp(tmp);
  2462 +}
  2463 +
  2464 +static void gen_neon_dup_high16(TCGv var)
  2465 +{
  2466 + TCGv tmp = new_tmp();
  2467 + tcg_gen_andi_i32(var, var, 0xffff0000);
  2468 + tcg_gen_shri_i32(tmp, var, 16);
  2469 + tcg_gen_or_i32(var, var, tmp);
  2470 + dead_tmp(tmp);
  2471 +}
  2472 +
2421 2473 /* Disassemble a VFP instruction. Returns nonzero if an error occured
2422 2474 (ie. an undefined instruction). */
2423 2475 static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn)
... ... @@ -2425,6 +2477,7 @@ static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn)
2425 2477 uint32_t rd, rn, rm, op, i, n, offset, delta_d, delta_m, bank_mask;
2426 2478 int dp, veclen;
2427 2479 TCGv tmp;
  2480 + TCGv tmp2;
2428 2481  
2429 2482 if (!arm_feature(env, ARM_FEATURE_VFP))
2430 2483 return 1;
... ... @@ -2468,66 +2521,66 @@ static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn)
2468 2521 }
2469 2522 if (insn & ARM_CP_RW_BIT) {
2470 2523 /* vfp->arm */
  2524 + tmp = neon_load_reg(rn, pass);
2471 2525 switch (size) {
2472 2526 case 0:
2473   - NEON_GET_REG(T1, rn, pass);
2474 2527 if (offset)
2475   - gen_op_shrl_T1_im(offset);
  2528 + tcg_gen_shri_i32(tmp, tmp, offset);
2476 2529 if (insn & (1 << 23))
2477   - gen_uxtb(cpu_T[1]);
  2530 + gen_uxtb(tmp);
2478 2531 else
2479   - gen_sxtb(cpu_T[1]);
  2532 + gen_sxtb(tmp);
2480 2533 break;
2481 2534 case 1:
2482   - NEON_GET_REG(T1, rn, pass);
2483 2535 if (insn & (1 << 23)) {
2484 2536 if (offset) {
2485   - gen_op_shrl_T1_im(16);
  2537 + tcg_gen_shri_i32(tmp, tmp, 16);
2486 2538 } else {
2487   - gen_uxth(cpu_T[1]);
  2539 + gen_uxth(tmp);
2488 2540 }
2489 2541 } else {
2490 2542 if (offset) {
2491   - gen_op_sarl_T1_im(16);
  2543 + tcg_gen_sari_i32(tmp, tmp, 16);
2492 2544 } else {
2493   - gen_sxth(cpu_T[1]);
  2545 + gen_sxth(tmp);
2494 2546 }
2495 2547 }
2496 2548 break;
2497 2549 case 2:
2498   - NEON_GET_REG(T1, rn, pass);
2499 2550 break;
2500 2551 }
2501   - gen_movl_reg_T1(s, rd);
  2552 + store_reg(s, rd, tmp);
2502 2553 } else {
2503 2554 /* arm->vfp */
2504   - gen_movl_T0_reg(s, rd);
  2555 + tmp = load_reg(s, rd);
2505 2556 if (insn & (1 << 23)) {
2506 2557 /* VDUP */
2507 2558 if (size == 0) {
2508   - gen_op_neon_dup_u8(0);
  2559 + gen_neon_dup_u8(tmp, 0);
2509 2560 } else if (size == 1) {
2510   - gen_op_neon_dup_low16();
  2561 + gen_neon_dup_low16(tmp);
2511 2562 }
2512   - NEON_SET_REG(T0, rn, 0);
2513   - NEON_SET_REG(T0, rn, 1);
  2563 + tmp2 = new_tmp();
  2564 + tcg_gen_mov_i32(tmp2, tmp);
  2565 + neon_store_reg(rn, 0, tmp2);
  2566 + neon_store_reg(rn, 1, tmp);
2514 2567 } else {
2515 2568 /* VMOV */
2516 2569 switch (size) {
2517 2570 case 0:
2518   - tmp = neon_load_reg(rn, pass);
2519   - gen_bfi(tmp, tmp, cpu_T[0], offset, 0xff);
2520   - neon_store_reg(rn, pass, tmp);
  2571 + tmp2 = neon_load_reg(rn, pass);
  2572 + gen_bfi(tmp, tmp2, tmp, offset, 0xff);
  2573 + dead_tmp(tmp2);
2521 2574 break;
2522 2575 case 1:
2523   - tmp = neon_load_reg(rn, pass);
2524   - gen_bfi(tmp, tmp, cpu_T[0], offset, 0xffff);
2525   - neon_store_reg(rn, pass, tmp);
  2576 + tmp2 = neon_load_reg(rn, pass);
  2577 + gen_bfi(tmp, tmp2, tmp, offset, 0xffff);
  2578 + dead_tmp(tmp2);
2526 2579 break;
2527 2580 case 2:
2528   - NEON_SET_REG(T0, rn, pass);
2529 2581 break;
2530 2582 }
  2583 + neon_store_reg(rn, pass, tmp);
2531 2584 }
2532 2585 }
2533 2586 } else { /* !dp */
... ... @@ -3210,179 +3263,90 @@ static void gen_nop_hint(DisasContext *s, int val)
3210 3263 }
3211 3264 }
3212 3265  
3213   -/* Neon shift by constant. The actual ops are the same as used for variable
3214   - shifts. [OP][U][SIZE] */
3215   -static GenOpFunc *gen_neon_shift_im[8][2][4] = {
3216   - { /* 0 */ /* VSHR */
3217   - {
3218   - gen_op_neon_shl_u8,
3219   - gen_op_neon_shl_u16,
3220   - gen_op_neon_shl_u32,
3221   - gen_op_neon_shl_u64
3222   - }, {
3223   - gen_op_neon_shl_s8,
3224   - gen_op_neon_shl_s16,
3225   - gen_op_neon_shl_s32,
3226   - gen_op_neon_shl_s64
3227   - }
3228   - }, { /* 1 */ /* VSRA */
3229   - {
3230   - gen_op_neon_shl_u8,
3231   - gen_op_neon_shl_u16,
3232   - gen_op_neon_shl_u32,
3233   - gen_op_neon_shl_u64
3234   - }, {
3235   - gen_op_neon_shl_s8,
3236   - gen_op_neon_shl_s16,
3237   - gen_op_neon_shl_s32,
3238   - gen_op_neon_shl_s64
3239   - }
3240   - }, { /* 2 */ /* VRSHR */
3241   - {
3242   - gen_op_neon_rshl_u8,
3243   - gen_op_neon_rshl_u16,
3244   - gen_op_neon_rshl_u32,
3245   - gen_op_neon_rshl_u64
3246   - }, {
3247   - gen_op_neon_rshl_s8,
3248   - gen_op_neon_rshl_s16,
3249   - gen_op_neon_rshl_s32,
3250   - gen_op_neon_rshl_s64
3251   - }
3252   - }, { /* 3 */ /* VRSRA */
3253   - {
3254   - gen_op_neon_rshl_u8,
3255   - gen_op_neon_rshl_u16,
3256   - gen_op_neon_rshl_u32,
3257   - gen_op_neon_rshl_u64
3258   - }, {
3259   - gen_op_neon_rshl_s8,
3260   - gen_op_neon_rshl_s16,
3261   - gen_op_neon_rshl_s32,
3262   - gen_op_neon_rshl_s64
3263   - }
3264   - }, { /* 4 */
3265   - {
3266   - NULL, NULL, NULL, NULL
3267   - }, { /* VSRI */
3268   - gen_op_neon_shl_u8,
3269   - gen_op_neon_shl_u16,
3270   - gen_op_neon_shl_u32,
3271   - gen_op_neon_shl_u64,
3272   - }
3273   - }, { /* 5 */
3274   - { /* VSHL */
3275   - gen_op_neon_shl_u8,
3276   - gen_op_neon_shl_u16,
3277   - gen_op_neon_shl_u32,
3278   - gen_op_neon_shl_u64,
3279   - }, { /* VSLI */
3280   - gen_op_neon_shl_u8,
3281   - gen_op_neon_shl_u16,
3282   - gen_op_neon_shl_u32,
3283   - gen_op_neon_shl_u64,
3284   - }
3285   - }, { /* 6 */ /* VQSHL */
3286   - {
3287   - gen_op_neon_qshl_u8,
3288   - gen_op_neon_qshl_u16,
3289   - gen_op_neon_qshl_u32,
3290   - gen_op_neon_qshl_u64
3291   - }, {
3292   - gen_op_neon_qshl_s8,
3293   - gen_op_neon_qshl_s16,
3294   - gen_op_neon_qshl_s32,
3295   - gen_op_neon_qshl_s64
3296   - }
3297   - }, { /* 7 */ /* VQSHLU */
3298   - {
3299   - gen_op_neon_qshl_u8,
3300   - gen_op_neon_qshl_u16,
3301   - gen_op_neon_qshl_u32,
3302   - gen_op_neon_qshl_u64
3303   - }, {
3304   - gen_op_neon_qshl_u8,
3305   - gen_op_neon_qshl_u16,
3306   - gen_op_neon_qshl_u32,
3307   - gen_op_neon_qshl_u64
3308   - }
3309   - }
3310   -};
  3266 +/* These macros help make the code more readable when migrating from the
  3267 + old dyngen helpers. They should probably be removed when
  3268 + T0/T1 are removed. */
  3269 +#define CPU_T001 cpu_T[0], cpu_T[0], cpu_T[1]
  3270 +#define CPU_T0E01 cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]
3311 3271  
3312   -/* [R][U][size - 1] */
3313   -static GenOpFunc *gen_neon_shift_im_narrow[2][2][3] = {
3314   - {
3315   - {
3316   - gen_op_neon_shl_u16,
3317   - gen_op_neon_shl_u32,
3318   - gen_op_neon_shl_u64
3319   - }, {
3320   - gen_op_neon_shl_s16,
3321   - gen_op_neon_shl_s32,
3322   - gen_op_neon_shl_s64
3323   - }
3324   - }, {
3325   - {
3326   - gen_op_neon_rshl_u16,
3327   - gen_op_neon_rshl_u32,
3328   - gen_op_neon_rshl_u64
3329   - }, {
3330   - gen_op_neon_rshl_s16,
3331   - gen_op_neon_rshl_s32,
3332   - gen_op_neon_rshl_s64
3333   - }
3334   - }
3335   -};
3336   -
3337   -static inline void
3338   -gen_op_neon_narrow_u32 ()
3339   -{
3340   - /* No-op. */
3341   -}
3342   -
3343   -static GenOpFunc *gen_neon_narrow[3] = {
3344   - gen_op_neon_narrow_u8,
3345   - gen_op_neon_narrow_u16,
3346   - gen_op_neon_narrow_u32
3347   -};
3348   -
3349   -static GenOpFunc *gen_neon_narrow_satu[3] = {
3350   - gen_op_neon_narrow_sat_u8,
3351   - gen_op_neon_narrow_sat_u16,
3352   - gen_op_neon_narrow_sat_u32
3353   -};
3354   -
3355   -static GenOpFunc *gen_neon_narrow_sats[3] = {
3356   - gen_op_neon_narrow_sat_s8,
3357   - gen_op_neon_narrow_sat_s16,
3358   - gen_op_neon_narrow_sat_s32
3359   -};
  3272 +#define CPU_V001 cpu_V0, cpu_V0, cpu_V1
3360 3273  
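For reference, calls written with these shorthands expand as below; CPU_T001 is the usual dest/src pattern on the 32-bit temporaries and CPU_V001 the same pattern on the 64-bit ones:

    gen_helper_neon_add_u8(CPU_T001);
    /* expands to */
    gen_helper_neon_add_u8(cpu_T[0], cpu_T[0], cpu_T[1]);

    tcg_gen_add_i64(CPU_V001);
    /* expands to */
    tcg_gen_add_i64(cpu_V0, cpu_V0, cpu_V1);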
3361 3274 static inline int gen_neon_add(int size)
3362 3275 {
3363 3276 switch (size) {
3364   - case 0: gen_op_neon_add_u8(); break;
3365   - case 1: gen_op_neon_add_u16(); break;
  3277 + case 0: gen_helper_neon_add_u8(CPU_T001); break;
  3278 + case 1: gen_helper_neon_add_u16(CPU_T001); break;
3366 3279 case 2: gen_op_addl_T0_T1(); break;
3367 3280 default: return 1;
3368 3281 }
3369 3282 return 0;
3370 3283 }
3371 3284  
3372   -/* 32-bit pairwise ops end up the same as the elementsise versions. */
3373   -#define gen_op_neon_pmax_s32 gen_op_neon_max_s32
3374   -#define gen_op_neon_pmax_u32 gen_op_neon_max_u32
3375   -#define gen_op_neon_pmin_s32 gen_op_neon_min_s32
3376   -#define gen_op_neon_pmin_u32 gen_op_neon_min_u32
  3285 +static inline void gen_neon_rsb(int size)
  3286 +{
  3287 + switch (size) {
  3288 + case 0: gen_helper_neon_sub_u8(cpu_T[0], cpu_T[1], cpu_T[0]); break;
  3289 + case 1: gen_helper_neon_sub_u16(cpu_T[0], cpu_T[1], cpu_T[0]); break;
  3290 + case 2: gen_op_rsbl_T0_T1(); break;
  3291 + default: return;
  3292 + }
  3293 +}
  3294 +
  3295 +/* 32-bit pairwise ops end up the same as the elementwise versions. */
  3296 +#define gen_helper_neon_pmax_s32 gen_helper_neon_max_s32
  3297 +#define gen_helper_neon_pmax_u32 gen_helper_neon_max_u32
  3298 +#define gen_helper_neon_pmin_s32 gen_helper_neon_min_s32
  3299 +#define gen_helper_neon_pmin_u32 gen_helper_neon_min_u32
  3300 +
  3301 +/* FIXME: This is wrong. They set the wrong overflow bit. */
  3302 +#define gen_helper_neon_qadd_s32(a, e, b, c) gen_helper_add_saturate(a, b, c)
  3303 +#define gen_helper_neon_qadd_u32(a, e, b, c) gen_helper_add_usaturate(a, b, c)
  3304 +#define gen_helper_neon_qsub_s32(a, e, b, c) gen_helper_sub_saturate(a, b, c)
  3305 +#define gen_helper_neon_qsub_u32(a, e, b, c) gen_helper_sub_usaturate(a, b, c)
  3306 +
  3307 +#define GEN_NEON_INTEGER_OP_ENV(name) do { \
  3308 + switch ((size << 1) | u) { \
  3309 + case 0: \
  3310 + gen_helper_neon_##name##_s8(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]); \
  3311 + break; \
  3312 + case 1: \
  3313 + gen_helper_neon_##name##_u8(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]); \
  3314 + break; \
  3315 + case 2: \
  3316 + gen_helper_neon_##name##_s16(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]); \
  3317 + break; \
  3318 + case 3: \
  3319 + gen_helper_neon_##name##_u16(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]); \
  3320 + break; \
  3321 + case 4: \
  3322 + gen_helper_neon_##name##_s32(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]); \
  3323 + break; \
  3324 + case 5: \
  3325 + gen_helper_neon_##name##_u32(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]); \
  3326 + break; \
  3327 + default: return 1; \
  3328 + }} while (0)
3377 3329  
3378 3330 #define GEN_NEON_INTEGER_OP(name) do { \
3379 3331 switch ((size << 1) | u) { \
3380   - case 0: gen_op_neon_##name##_s8(); break; \
3381   - case 1: gen_op_neon_##name##_u8(); break; \
3382   - case 2: gen_op_neon_##name##_s16(); break; \
3383   - case 3: gen_op_neon_##name##_u16(); break; \
3384   - case 4: gen_op_neon_##name##_s32(); break; \
3385   - case 5: gen_op_neon_##name##_u32(); break; \
  3332 + case 0: \
  3333 + gen_helper_neon_##name##_s8(cpu_T[0], cpu_T[0], cpu_T[1]); \
  3334 + break; \
  3335 + case 1: \
  3336 + gen_helper_neon_##name##_u8(cpu_T[0], cpu_T[0], cpu_T[1]); \
  3337 + break; \
  3338 + case 2: \
  3339 + gen_helper_neon_##name##_s16(cpu_T[0], cpu_T[0], cpu_T[1]); \
  3340 + break; \
  3341 + case 3: \
  3342 + gen_helper_neon_##name##_u16(cpu_T[0], cpu_T[0], cpu_T[1]); \
  3343 + break; \
  3344 + case 4: \
  3345 + gen_helper_neon_##name##_s32(cpu_T[0], cpu_T[0], cpu_T[1]); \
  3346 + break; \
  3347 + case 5: \
  3348 + gen_helper_neon_##name##_u32(cpu_T[0], cpu_T[0], cpu_T[1]); \
  3349 + break; \
3386 3350 default: return 1; \
3387 3351 }} while (0)
3388 3352  
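As a concrete instance of the macro above, GEN_NEON_INTEGER_OP(hadd) with size == 1 and u == 0 selects case 2 and emits

    gen_helper_neon_hadd_s16(cpu_T[0], cpu_T[0], cpu_T[1]);

while the _ENV variant differs only in threading cpu_env through, for helpers that must update the saturation flag.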
... ... @@ -3392,7 +3356,7 @@ gen_neon_movl_scratch_T0(int scratch)
3392 3356 uint32_t offset;
3393 3357  
3394 3358 offset = offsetof(CPUARMState, vfp.scratch[scratch]);
3395   - gen_op_neon_setreg_T0(offset);
  3359 + tcg_gen_st_i32(cpu_T[0], cpu_env, offset);
3396 3360 }
3397 3361  
3398 3362 static inline void
... ... @@ -3401,7 +3365,7 @@ gen_neon_movl_scratch_T1(int scratch)
3401 3365 uint32_t offset;
3402 3366  
3403 3367 offset = offsetof(CPUARMState, vfp.scratch[scratch]);
3404   - gen_op_neon_setreg_T1(offset);
  3368 + tcg_gen_st_i32(cpu_T[1], cpu_env, offset);
3405 3369 }
3406 3370  
3407 3371 static inline void
... ... @@ -3410,7 +3374,7 @@ gen_neon_movl_T0_scratch(int scratch)
3410 3374 uint32_t offset;
3411 3375  
3412 3376 offset = offsetof(CPUARMState, vfp.scratch[scratch]);
3413   - gen_op_neon_getreg_T0(offset);
  3377 + tcg_gen_ld_i32(cpu_T[0], cpu_env, offset);
3414 3378 }
3415 3379  
3416 3380 static inline void
... ... @@ -3419,12 +3383,7 @@ gen_neon_movl_T1_scratch(int scratch)
3419 3383 uint32_t offset;
3420 3384  
3421 3385 offset = offsetof(CPUARMState, vfp.scratch[scratch]);
3422   - gen_op_neon_getreg_T1(offset);
3423   -}
3424   -
3425   -static inline void gen_op_neon_widen_u32(void)
3426   -{
3427   - gen_op_movl_T1_im(0);
  3386 + tcg_gen_ld_i32(cpu_T[1], cpu_env, offset);
3428 3387 }
3429 3388  
3430 3389 static inline void gen_neon_get_scalar(int size, int reg)
... ... @@ -3434,9 +3393,9 @@ static inline void gen_neon_get_scalar(int size, int reg)
3434 3393 } else {
3435 3394 NEON_GET_REG(T0, reg >> 2, (reg >> 1) & 1);
3436 3395 if (reg & 1)
3437   - gen_op_neon_dup_low16();
  3396 + gen_neon_dup_low16(cpu_T[0]);
3438 3397 else
3439   - gen_op_neon_dup_high16();
  3398 + gen_neon_dup_high16(cpu_T[0]);
3440 3399 }
3441 3400 }
3442 3401  
... ... @@ -3448,8 +3407,8 @@ static void gen_neon_unzip(int reg, int q, int tmp, int size)
3448 3407 NEON_GET_REG(T0, reg, n);
3449 3408 NEON_GET_REG(T1, reg, n + 1);
3450 3409 switch (size) {
3451   - case 0: gen_op_neon_unzip_u8(); break;
3452   - case 1: gen_op_neon_zip_u16(); break; /* zip and unzip are the same. */
  3410 + case 0: gen_helper_neon_unzip_u8(); break;
  3411 + case 1: gen_helper_neon_zip_u16(); break; /* zip and unzip are the same. */
3453 3412 case 2: /* no-op */; break;
3454 3413 default: abort();
3455 3414 }
... ... @@ -3522,13 +3481,9 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
3522 3481 if (size == 2) {
3523 3482 if (load) {
3524 3483 tmp = gen_ld32(cpu_T[1], IS_USER(s));
3525   - tcg_gen_mov_i32(cpu_T[0], tmp);
3526   - dead_tmp(tmp);
3527   - NEON_SET_REG(T0, rd, pass);
  3484 + neon_store_reg(rd, pass, tmp);
3528 3485 } else {
3529   - NEON_GET_REG(T0, rd, pass);
3530   - tmp = new_tmp();
3531   - tcg_gen_mov_i32(tmp, cpu_T[0]);
  3486 + tmp = neon_load_reg(rd, pass);
3532 3487 gen_st32(tmp, cpu_T[1], IS_USER(s));
3533 3488 }
3534 3489 gen_op_addl_T1_im(stride);
... ... @@ -3596,27 +3551,23 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
3596 3551 switch (size) {
3597 3552 case 0:
3598 3553 tmp = gen_ld8u(cpu_T[1], IS_USER(s));
3599   - tcg_gen_mov_i32(cpu_T[0], tmp);
3600   - dead_tmp(tmp);
3601   - gen_op_neon_dup_u8(0);
  3554 + gen_neon_dup_u8(tmp, 0);
3602 3555 break;
3603 3556 case 1:
3604 3557 tmp = gen_ld16u(cpu_T[1], IS_USER(s));
3605   - tcg_gen_mov_i32(cpu_T[0], tmp);
3606   - dead_tmp(tmp);
3607   - gen_op_neon_dup_low16();
  3558 + gen_neon_dup_low16(tmp);
3608 3559 break;
3609 3560 case 2:
3610 3561 tmp = gen_ld32(cpu_T[1], IS_USER(s));
3611   - tcg_gen_mov_i32(cpu_T[0], tmp);
3612   - dead_tmp(tmp);
3613 3562 break;
3614 3563 case 3:
3615 3564 return 1;
3616 3565 }
3617 3566 gen_op_addl_T1_im(1 << size);
3618   - NEON_SET_REG(T0, rd, 0);
3619   - NEON_SET_REG(T0, rd, 1);
  3567 + tmp2 = new_tmp();
  3568 + tcg_gen_mov_i32(tmp2, tmp);
  3569 + neon_store_reg(rd, 0, tmp2);
  3570 + neon_store_reg(rd, 1, tmp);
3620 3571 rd += stride;
3621 3572 }
3622 3573 stride = (1 << size) * nregs;
... ... @@ -3707,12 +3658,158 @@ static void gen_neon_bsl(TCGv dest, TCGv t, TCGv f, TCGv c)
3707 3658 tcg_gen_or_i32(dest, t, f);
3708 3659 }
3709 3660  
  3661 +static inline void gen_neon_narrow(int size, TCGv dest, TCGv src)
  3662 +{
  3663 + switch (size) {
  3664 + case 0: gen_helper_neon_narrow_u8(dest, src); break;
  3665 + case 1: gen_helper_neon_narrow_u16(dest, src); break;
  3666 + case 2: tcg_gen_trunc_i64_i32(dest, src); break;
  3667 + default: abort();
  3668 + }
  3669 +}
  3670 +
  3671 +static inline void gen_neon_narrow_sats(int size, TCGv dest, TCGv src)
  3672 +{
  3673 + switch (size) {
  3674 + case 0: gen_helper_neon_narrow_sat_s8(dest, cpu_env, src); break;
  3675 + case 1: gen_helper_neon_narrow_sat_s16(dest, cpu_env, src); break;
  3676 + case 2: gen_helper_neon_narrow_sat_s32(dest, cpu_env, src); break;
  3677 + default: abort();
  3678 + }
  3679 +}
  3680 +
  3681 +static inline void gen_neon_narrow_satu(int size, TCGv dest, TCGv src)
  3682 +{
  3683 + switch (size) {
  3684 + case 0: gen_helper_neon_narrow_sat_u8(dest, cpu_env, src); break;
  3685 + case 1: gen_helper_neon_narrow_sat_u16(dest, cpu_env, src); break;
  3686 + case 2: gen_helper_neon_narrow_sat_u32(dest, cpu_env, src); break;
  3687 + default: abort();
  3688 + }
  3689 +}
  3690 +
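The saturating narrow helpers clamp each wide lane to the destination type's range before packing. A one-lane scalar sketch of the unsigned 16-to-8-bit case (illustrative only; the real helpers in neon_helper.c operate on packed lanes and also latch the QC flag via cpu_env):

    static inline uint8_t narrow_sat_u8_lane(uint16_t x)
    {
        return x > 0xff ? 0xff : (uint8_t)x; /* clamp rather than truncate */
    }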
  3691 +static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift,
  3692 + int q, int u)
  3693 +{
  3694 + if (q) {
  3695 + if (u) {
  3696 + switch (size) {
  3697 + case 1: gen_helper_neon_rshl_u16(var, var, shift); break;
  3698 + case 2: gen_helper_neon_rshl_u32(var, var, shift); break;
  3699 + default: abort();
  3700 + }
  3701 + } else {
  3702 + switch (size) {
  3703 + case 1: gen_helper_neon_rshl_s16(var, var, shift); break;
  3704 + case 2: gen_helper_neon_rshl_s32(var, var, shift); break;
  3705 + default: abort();
  3706 + }
  3707 + }
  3708 + } else {
  3709 + if (u) {
  3710 + switch (size) {
  3711 + case 1: gen_helper_neon_shl_u16(var, var, shift); break;
  3712 + case 2: gen_helper_neon_shl_u32(var, var, shift); break;
  3713 + default: abort();
  3714 + }
  3715 + } else {
  3716 + switch (size) {
  3717 + case 1: gen_helper_neon_shl_s16(var, var, shift); break;
  3718 + case 2: gen_helper_neon_shl_s32(var, var, shift); break;
  3719 + default: abort();
  3720 + }
  3721 + }
  3722 + }
  3723 +}
  3724 +
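Here q selects the rounding forms. A one-lane sketch of a rounding right shift by n > 0, assuming the usual NEON definition (not the helper itself):

    static inline int16_t rshr_s16_lane(int16_t x, int n)
    {
        /* Add half of the discarded range before shifting. */
        return (int16_t)(((int32_t)x + (1 << (n - 1))) >> n);
    }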
  3725 +static inline void gen_neon_widen(TCGv dest, TCGv src, int size, int u)
  3726 +{
  3727 + if (u) {
  3728 + switch (size) {
  3729 + case 0: gen_helper_neon_widen_u8(dest, src); break;
  3730 + case 1: gen_helper_neon_widen_u16(dest, src); break;
  3731 + case 2: tcg_gen_extu_i32_i64(dest, src); break;
  3732 + default: abort();
  3733 + }
  3734 + } else {
  3735 + switch (size) {
  3736 + case 0: gen_helper_neon_widen_s8(dest, src); break;
  3737 + case 1: gen_helper_neon_widen_s16(dest, src); break;
  3738 + case 2: tcg_gen_ext_i32_i64(dest, src); break;
  3739 + default: abort();
  3740 + }
  3741 + }
  3742 + dead_tmp(src);
  3743 +}
  3744 +
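Widening is plain per-lane extension, with u choosing zero over sign extension; e.g. one signed 8-to-16-bit lane as a scalar sketch:

    int16_t wide = (int16_t)(int8_t)lane; /* sign-extend one lane */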
  3745 +static inline void gen_neon_addl(int size)
  3746 +{
  3747 + switch (size) {
  3748 + case 0: gen_helper_neon_addl_u16(CPU_V001); break;
  3749 + case 1: gen_helper_neon_addl_u32(CPU_V001); break;
  3750 + case 2: tcg_gen_add_i64(CPU_V001); break;
  3751 + default: abort();
  3752 + }
  3753 +}
  3754 +
  3755 +static inline void gen_neon_subl(int size)
  3756 +{
  3757 + switch (size) {
  3758 + case 0: gen_helper_neon_subl_u16(CPU_V001); break;
  3759 + case 1: gen_helper_neon_subl_u32(CPU_V001); break;
  3760 + case 2: tcg_gen_sub_i64(CPU_V001); break;
  3761 + default: abort();
  3762 + }
  3763 +}
  3764 +
  3765 +static inline void gen_neon_negl(TCGv var, int size)
  3766 +{
  3767 + switch (size) {
  3768 + case 0: gen_helper_neon_negl_u16(var, var); break;
  3769 + case 1: gen_helper_neon_negl_u32(var, var); break;
  3770 + case 2: gen_helper_neon_negl_u64(var, var); break;
  3771 + default: abort();
  3772 + }
  3773 +}
  3774 +
  3775 +static inline void gen_neon_addl_saturate(TCGv op0, TCGv op1, int size)
  3776 +{
  3777 + switch (size) {
  3778 + case 1: gen_helper_neon_addl_saturate_s32(op0, cpu_env, op0, op1); break;
  3779 + case 2: gen_helper_neon_addl_saturate_s64(op0, cpu_env, op0, op1); break;
  3780 + default: abort();
  3781 + }
  3782 +}
  3783 +
  3784 +static inline void gen_neon_mull(TCGv dest, TCGv a, TCGv b, int size, int u)
  3785 +{
  3786 + TCGv tmp;
  3787 +
  3788 + switch ((size << 1) | u) {
  3789 + case 0: gen_helper_neon_mull_s8(dest, a, b); break;
  3790 + case 1: gen_helper_neon_mull_u8(dest, a, b); break;
  3791 + case 2: gen_helper_neon_mull_s16(dest, a, b); break;
  3792 + case 3: gen_helper_neon_mull_u16(dest, a, b); break;
  3793 + case 4:
  3794 + tmp = gen_muls_i64_i32(a, b);
  3795 + tcg_gen_mov_i64(dest, tmp);
  3796 + break;
  3797 + case 5:
  3798 + tmp = gen_mulu_i64_i32(a, b);
  3799 + tcg_gen_mov_i64(dest, tmp);
  3800 + break;
  3801 + default: abort();
  3802 + }
  3803 + if (size < 2) {
  3804 + dead_tmp(b);
  3805 + dead_tmp(a);
  3806 + }
  3807 +}
  3808 +
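gen_neon_mull produces double-width lanes: sizes 0 and 1 go through dedicated helpers, size 2 falls back to the generic 32x32->64 multiply. One signed 16-bit lane as a scalar sketch (the product cannot overflow 32 bits):

    int32_t prod = (int32_t)(int16_t)a * (int16_t)b;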
3710 3809 /* Translate a NEON data processing instruction. Return nonzero if the
3711 3810 instruction is invalid.
3712   - In general we process vectors in 32-bit chunks. This means we can reuse
3713   - some of the scalar ops, and hopefully the code generated for 32-bit
3714   - hosts won't be too awful. The downside is that the few 64-bit operations
3715   - (mainly shifts) get complicated. */
  3811 + We process data in a mixture of 32-bit and 64-bit chunks.
  3812 + Mostly we use 32-bit chunks so we can use normal scalar instructions. */
3716 3813  
3717 3814 static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
3718 3815 {
... ... @@ -3742,41 +3839,70 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
3742 3839 if ((insn & (1 << 23)) == 0) {
3743 3840 /* Three register same length. */
3744 3841 op = ((insn >> 7) & 0x1e) | ((insn >> 4) & 1);
3745   - if (size == 3 && (op == 1 || op == 5 || op == 16)) {
  3842 + if (size == 3 && (op == 1 || op == 5 || op == 8 || op == 9
  3843 + || op == 10 || op == 11 || op == 16)) {
  3844 + /* 64-bit element instructions. */
3746 3845 for (pass = 0; pass < (q ? 2 : 1); pass++) {
3747   - NEON_GET_REG(T0, rm, pass * 2);
3748   - NEON_GET_REG(T1, rm, pass * 2 + 1);
3749   - gen_neon_movl_scratch_T0(0);
3750   - gen_neon_movl_scratch_T1(1);
3751   - NEON_GET_REG(T0, rn, pass * 2);
3752   - NEON_GET_REG(T1, rn, pass * 2 + 1);
  3846 + neon_load_reg64(cpu_V0, rn + pass);
  3847 + neon_load_reg64(cpu_V1, rm + pass);
3753 3848 switch (op) {
3754 3849 case 1: /* VQADD */
3755 3850 if (u) {
3756   - gen_op_neon_addl_saturate_u64();
  3851 + gen_helper_neon_add_saturate_u64(CPU_V001);
3757 3852 } else {
3758   - gen_op_neon_addl_saturate_s64();
  3853 + gen_helper_neon_add_saturate_s64(CPU_V001);
3759 3854 }
3760 3855 break;
3761 3856 case 5: /* VQSUB */
3762 3857 if (u) {
3763   - gen_op_neon_subl_saturate_u64();
  3858 + gen_helper_neon_sub_saturate_u64(CPU_V001);
3764 3859 } else {
3765   - gen_op_neon_subl_saturate_s64();
  3860 + gen_helper_neon_sub_saturate_s64(CPU_V001);
  3861 + }
  3862 + break;
  3863 + case 8: /* VSHL */
  3864 + if (u) {
  3865 + gen_helper_neon_shl_u64(cpu_V0, cpu_V1, cpu_V0);
  3866 + } else {
  3867 + gen_helper_neon_shl_s64(cpu_V0, cpu_V1, cpu_V0);
  3868 + }
  3869 + break;
  3870 + case 9: /* VQSHL */
  3871 + if (u) {
  3872 + gen_helper_neon_qshl_u64(cpu_V0, cpu_env,
  3873 + cpu_V1, cpu_V0);
  3874 + } else {
  3875 + gen_helper_neon_qshl_s64(cpu_V0, cpu_env,
  3876 + cpu_V1, cpu_V0);
  3877 + }
  3878 + break;
  3879 + case 10: /* VRSHL */
  3880 + if (u) {
  3881 + gen_helper_neon_rshl_u64(cpu_V0, cpu_V1, cpu_V0);
  3882 + } else {
  3883 + gen_helper_neon_rshl_s64(cpu_V0, cpu_V1, cpu_V0);
  3884 + }
  3885 + break;
  3886 + case 11: /* VQRSHL */
  3887 + if (u) {
  3888 + gen_helper_neon_qrshl_u64(cpu_V0, cpu_env,
  3889 + cpu_V1, cpu_V0);
  3890 + } else {
  3891 + gen_helper_neon_qrshl_s64(cpu_V0, cpu_env,
  3892 + cpu_V1, cpu_V0);
3766 3893 }
3767 3894 break;
3768 3895 case 16:
3769 3896 if (u) {
3770   - gen_op_neon_subl_u64();
  3897 + tcg_gen_sub_i64(CPU_V001);
3771 3898 } else {
3772   - gen_op_neon_addl_u64();
  3899 + tcg_gen_add_i64(CPU_V001);
3773 3900 }
3774 3901 break;
3775 3902 default:
3776 3903 abort();
3777 3904 }
3778   - NEON_SET_REG(T0, rd, pass * 2);
3779   - NEON_SET_REG(T1, rd, pass * 2 + 1);
  3905 + neon_store_reg64(cpu_V0, rd + pass);
3780 3906 }
3781 3907 return 0;
3782 3908 }
... ... @@ -3784,13 +3910,13 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
3784 3910 case 8: /* VSHL */
3785 3911 case 9: /* VQSHL */
3786 3912 case 10: /* VRSHL */
3787   - case 11: /* VQSHL */
3788   - /* Shift operations have Rn and Rm reversed. */
  3913 + case 11: /* VQRSHL */
3789 3914 {
3790   - int tmp;
3791   - tmp = rn;
  3915 + int rtmp;
  3916 + /* Shift instruction operands are reversed. */
  3917 + rtmp = rn;
3792 3918 rn = rm;
3793   - rm = tmp;
  3919 + rm = rtmp;
3794 3920 pairwise = 0;
3795 3921 }
3796 3922 break;
... ... @@ -3834,19 +3960,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
3834 3960 GEN_NEON_INTEGER_OP(hadd);
3835 3961 break;
3836 3962 case 1: /* VQADD */
3837   - switch (size << 1| u) {
3838   - case 0: gen_op_neon_qadd_s8(); break;
3839   - case 1: gen_op_neon_qadd_u8(); break;
3840   - case 2: gen_op_neon_qadd_s16(); break;
3841   - case 3: gen_op_neon_qadd_u16(); break;
3842   - case 4:
3843   - gen_helper_add_saturate(cpu_T[0], cpu_T[0], cpu_T[1]);
3844   - break;
3845   - case 5:
3846   - gen_helper_add_usaturate(cpu_T[0], cpu_T[0], cpu_T[1]);
3847   - break;
3848   - default: abort();
3849   - }
  3963 + GEN_NEON_INTEGER_OP_ENV(qadd);
3850 3964 break;
3851 3965 case 2: /* VRHADD */
3852 3966 GEN_NEON_INTEGER_OP(rhadd);
... ... @@ -3890,19 +4004,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
3890 4004 GEN_NEON_INTEGER_OP(hsub);
3891 4005 break;
3892 4006 case 5: /* VQSUB */
3893   - switch ((size << 1) | u) {
3894   - case 0: gen_op_neon_qsub_s8(); break;
3895   - case 1: gen_op_neon_qsub_u8(); break;
3896   - case 2: gen_op_neon_qsub_s16(); break;
3897   - case 3: gen_op_neon_qsub_u16(); break;
3898   - case 4:
3899   - gen_helper_sub_saturate(cpu_T[0], cpu_T[0], cpu_T[1]);
3900   - break;
3901   - case 5:
3902   - gen_helper_sub_usaturate(cpu_T[0], cpu_T[0], cpu_T[1]);
3903   - break;
3904   - default: abort();
3905   - }
  4007 + GEN_NEON_INTEGER_OP_ENV(qsub);
3906 4008 break;
3907 4009 case 6: /* VCGT */
3908 4010 GEN_NEON_INTEGER_OP(cgt);
... ... @@ -3911,76 +4013,16 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
3911 4013 GEN_NEON_INTEGER_OP(cge);
3912 4014 break;
3913 4015 case 8: /* VSHL */
3914   - switch ((size << 1) | u) {
3915   - case 0: gen_op_neon_shl_s8(); break;
3916   - case 1: gen_op_neon_shl_u8(); break;
3917   - case 2: gen_op_neon_shl_s16(); break;
3918   - case 3: gen_op_neon_shl_u16(); break;
3919   - case 4: gen_op_neon_shl_s32(); break;
3920   - case 5: gen_op_neon_shl_u32(); break;
3921   -#if 0
3922   - /* ??? Implementing these is tricky because the vector ops work
3923   - on 32-bit pieces. */
3924   - case 6: gen_op_neon_shl_s64(); break;
3925   - case 7: gen_op_neon_shl_u64(); break;
3926   -#else
3927   - case 6: case 7: cpu_abort(env, "VSHL.64 not implemented");
3928   -#endif
3929   - }
  4016 + GEN_NEON_INTEGER_OP(shl);
3930 4017 break;
3931 4018 case 9: /* VQSHL */
3932   - switch ((size << 1) | u) {
3933   - case 0: gen_op_neon_qshl_s8(); break;
3934   - case 1: gen_op_neon_qshl_u8(); break;
3935   - case 2: gen_op_neon_qshl_s16(); break;
3936   - case 3: gen_op_neon_qshl_u16(); break;
3937   - case 4: gen_op_neon_qshl_s32(); break;
3938   - case 5: gen_op_neon_qshl_u32(); break;
3939   -#if 0
3940   - /* ??? Implementing these is tricky because the vector ops work
3941   - on 32-bit pieces. */
3942   - case 6: gen_op_neon_qshl_s64(); break;
3943   - case 7: gen_op_neon_qshl_u64(); break;
3944   -#else
3945   - case 6: case 7: cpu_abort(env, "VQSHL.64 not implemented");
3946   -#endif
3947   - }
  4019 + GEN_NEON_INTEGER_OP_ENV(qshl);
3948 4020 break;
3949 4021 case 10: /* VRSHL */
3950   - switch ((size << 1) | u) {
3951   - case 0: gen_op_neon_rshl_s8(); break;
3952   - case 1: gen_op_neon_rshl_u8(); break;
3953   - case 2: gen_op_neon_rshl_s16(); break;
3954   - case 3: gen_op_neon_rshl_u16(); break;
3955   - case 4: gen_op_neon_rshl_s32(); break;
3956   - case 5: gen_op_neon_rshl_u32(); break;
3957   -#if 0
3958   - /* ??? Implementing these is tricky because the vector ops work
3959   - on 32-bit pieces. */
3960   - case 6: gen_op_neon_rshl_s64(); break;
3961   - case 7: gen_op_neon_rshl_u64(); break;
3962   -#else
3963   - case 6: case 7: cpu_abort(env, "VRSHL.64 not implemented");
3964   -#endif
3965   - }
  4022 + GEN_NEON_INTEGER_OP(rshl);
3966 4023 break;
3967 4024 case 11: /* VQRSHL */
3968   - switch ((size << 1) | u) {
3969   - case 0: gen_op_neon_qrshl_s8(); break;
3970   - case 1: gen_op_neon_qrshl_u8(); break;
3971   - case 2: gen_op_neon_qrshl_s16(); break;
3972   - case 3: gen_op_neon_qrshl_u16(); break;
3973   - case 4: gen_op_neon_qrshl_s32(); break;
3974   - case 5: gen_op_neon_qrshl_u32(); break;
3975   -#if 0
3976   - /* ??? Implementing these is tricky because the vector ops work
3977   - on 32-bit pieces. */
3978   - case 6: gen_op_neon_qrshl_s64(); break;
3979   - case 7: gen_op_neon_qrshl_u64(); break;
3980   -#else
3981   - case 6: case 7: cpu_abort(env, "VQRSHL.64 not implemented");
3982   -#endif
3983   - }
  4025 + GEN_NEON_INTEGER_OP_ENV(qrshl);
3984 4026 break;
3985 4027 case 12: /* VMAX */
3986 4028 GEN_NEON_INTEGER_OP(max);
... ... @@ -4002,8 +4044,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4002 4044 return 1;
4003 4045 } else { /* VSUB */
4004 4046 switch (size) {
4005   - case 0: gen_op_neon_sub_u8(); break;
4006   - case 1: gen_op_neon_sub_u16(); break;
  4047 + case 0: gen_helper_neon_sub_u8(CPU_T001); break;
  4048 + case 1: gen_helper_neon_sub_u16(CPU_T001); break;
4007 4049 case 2: gen_op_subl_T0_T1(); break;
4008 4050 default: return 1;
4009 4051 }
... ... @@ -4012,46 +4054,41 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4012 4054 case 17:
4013 4055 if (!u) { /* VTST */
4014 4056 switch (size) {
4015   - case 0: gen_op_neon_tst_u8(); break;
4016   - case 1: gen_op_neon_tst_u16(); break;
4017   - case 2: gen_op_neon_tst_u32(); break;
  4057 + case 0: gen_helper_neon_tst_u8(CPU_T001); break;
  4058 + case 1: gen_helper_neon_tst_u16(CPU_T001); break;
  4059 + case 2: gen_helper_neon_tst_u32(CPU_T001); break;
4018 4060 default: return 1;
4019 4061 }
4020 4062 } else { /* VCEQ */
4021 4063 switch (size) {
4022   - case 0: gen_op_neon_ceq_u8(); break;
4023   - case 1: gen_op_neon_ceq_u16(); break;
4024   - case 2: gen_op_neon_ceq_u32(); break;
  4064 + case 0: gen_helper_neon_ceq_u8(CPU_T001); break;
  4065 + case 1: gen_helper_neon_ceq_u16(CPU_T001); break;
  4066 + case 2: gen_helper_neon_ceq_u32(CPU_T001); break;
4025 4067 default: return 1;
4026 4068 }
4027 4069 }
4028 4070 break;
4029 4071 case 18: /* Multiply. */
4030 4072 switch (size) {
4031   - case 0: gen_op_neon_mul_u8(); break;
4032   - case 1: gen_op_neon_mul_u16(); break;
  4073 + case 0: gen_helper_neon_mul_u8(CPU_T001); break;
  4074 + case 1: gen_helper_neon_mul_u16(CPU_T001); break;
4033 4075 case 2: gen_op_mul_T0_T1(); break;
4034 4076 default: return 1;
4035 4077 }
4036 4078 NEON_GET_REG(T1, rd, pass);
4037 4079 if (u) { /* VMLS */
4038   - switch (size) {
4039   - case 0: gen_op_neon_rsb_u8(); break;
4040   - case 1: gen_op_neon_rsb_u16(); break;
4041   - case 2: gen_op_rsbl_T0_T1(); break;
4042   - default: return 1;
4043   - }
  4080 + gen_neon_rsb(size);
4044 4081 } else { /* VMLA */
4045 4082 gen_neon_add(size);
4046 4083 }
4047 4084 break;
4048 4085 case 19: /* VMUL */
4049 4086 if (u) { /* polynomial */
4050   - gen_op_neon_mul_p8();
  4087 + gen_helper_neon_mul_p8(CPU_T001);
4051 4088 } else { /* Integer */
4052 4089 switch (size) {
4053   - case 0: gen_op_neon_mul_u8(); break;
4054   - case 1: gen_op_neon_mul_u16(); break;
  4090 + case 0: gen_helper_neon_mul_u8(CPU_T001); break;
  4091 + case 1: gen_helper_neon_mul_u16(CPU_T001); break;
4055 4092 case 2: gen_op_mul_T0_T1(); break;
4056 4093 default: return 1;
4057 4094 }
... ... @@ -4066,14 +4103,14 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4066 4103 case 22: /* Multiply high. */
4067 4104 if (!u) { /* VQDMULH */
4068 4105 switch (size) {
4069   - case 1: gen_op_neon_qdmulh_s16(); break;
4070   - case 2: gen_op_neon_qdmulh_s32(); break;
  4106 + case 1: gen_helper_neon_qdmulh_s16(CPU_T0E01); break;
  4107 + case 2: gen_helper_neon_qdmulh_s32(CPU_T0E01); break;
4071 4108 default: return 1;
4072 4109 }
4073 4110 } else { /* VQRDMULH */
4074 4111 switch (size) {
4075   - case 1: gen_op_neon_qrdmulh_s16(); break;
4076   - case 2: gen_op_neon_qrdmulh_s32(); break;
  4112 + case 1: gen_helper_neon_qrdmulh_s16(CPU_T0E01); break;
  4113 + case 2: gen_helper_neon_qrdmulh_s32(CPU_T0E01); break;
4077 4114 default: return 1;
4078 4115 }
4079 4116 }
... ... @@ -4082,8 +4119,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4082 4119 if (u)
4083 4120 return 1;
4084 4121 switch (size) {
4085   - case 0: gen_op_neon_padd_u8(); break;
4086   - case 1: gen_op_neon_padd_u16(); break;
  4122 + case 0: gen_helper_neon_padd_u8(CPU_T001); break;
  4123 + case 1: gen_helper_neon_padd_u16(CPU_T001); break;
4087 4124 case 2: gen_op_addl_T0_T1(); break;
4088 4125 default: return 1;
4089 4126 }
... ... @@ -4091,55 +4128,55 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4091 4128 case 26: /* Floating point arithmetic. */
4092 4129 switch ((u << 2) | size) {
4093 4130 case 0: /* VADD */
4094   - gen_op_neon_add_f32();
  4131 + gen_helper_neon_add_f32(CPU_T001);
4095 4132 break;
4096 4133 case 2: /* VSUB */
4097   - gen_op_neon_sub_f32();
  4134 + gen_helper_neon_sub_f32(CPU_T001);
4098 4135 break;
4099 4136 case 4: /* VPADD */
4100   - gen_op_neon_add_f32();
  4137 + gen_helper_neon_add_f32(CPU_T001);
4101 4138 break;
4102 4139 case 6: /* VABD */
4103   - gen_op_neon_abd_f32();
  4140 + gen_helper_neon_abd_f32(CPU_T001);
4104 4141 break;
4105 4142 default:
4106 4143 return 1;
4107 4144 }
4108 4145 break;
4109 4146 case 27: /* Float multiply. */
4110   - gen_op_neon_mul_f32();
  4147 + gen_helper_neon_mul_f32(CPU_T001);
4111 4148 if (!u) {
4112 4149 NEON_GET_REG(T1, rd, pass);
4113 4150 if (size == 0) {
4114   - gen_op_neon_add_f32();
  4151 + gen_helper_neon_add_f32(CPU_T001);
4115 4152 } else {
4116   - gen_op_neon_rsb_f32();
  4153 + gen_helper_neon_sub_f32(cpu_T[0], cpu_T[1], cpu_T[0]);
4117 4154 }
4118 4155 }
4119 4156 break;
4120 4157 case 28: /* Float compare. */
4121 4158 if (!u) {
4122   - gen_op_neon_ceq_f32();
  4159 + gen_helper_neon_ceq_f32(CPU_T001);
4123 4160 } else {
4124 4161 if (size == 0)
4125   - gen_op_neon_cge_f32();
  4162 + gen_helper_neon_cge_f32(CPU_T001);
4126 4163 else
4127   - gen_op_neon_cgt_f32();
  4164 + gen_helper_neon_cgt_f32(CPU_T001);
4128 4165 }
4129 4166 break;
4130 4167 case 29: /* Float compare absolute. */
4131 4168 if (!u)
4132 4169 return 1;
4133 4170 if (size == 0)
4134   - gen_op_neon_acge_f32();
  4171 + gen_helper_neon_acge_f32(CPU_T001);
4135 4172 else
4136   - gen_op_neon_acgt_f32();
  4173 + gen_helper_neon_acgt_f32(CPU_T001);
4137 4174 break;
4138 4175 case 30: /* Float min/max. */
4139 4176 if (size == 0)
4140   - gen_op_neon_max_f32();
  4177 + gen_helper_neon_max_f32(CPU_T001);
4141 4178 else
4142   - gen_op_neon_min_f32();
  4179 + gen_helper_neon_min_f32(CPU_T001);
4143 4180 break;
4144 4181 case 31:
4145 4182 if (size == 0)
... ... @@ -4166,6 +4203,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4166 4203 NEON_SET_REG(T0, rd, pass);
4167 4204 }
4168 4205 }
  4206 + /* End of three register same length operations. */
4169 4207 } else if (insn & (1 << 4)) {
4170 4208 if ((insn & 0x00380080) != 0) {
4171 4209 /* Two registers and shift. */
... ... @@ -4212,181 +4250,221 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4212 4250 }
4213 4251  
4214 4252 for (pass = 0; pass < count; pass++) {
4215   - if (size < 3) {
4216   - /* Operands in T0 and T1. */
4217   - gen_op_movl_T1_im(imm);
4218   - NEON_GET_REG(T0, rm, pass);
4219   - } else {
4220   - /* Operands in {T0, T1} and env->vfp.scratch. */
4221   - gen_op_movl_T0_im(imm);
4222   - gen_neon_movl_scratch_T0(0);
4223   - gen_op_movl_T0_im((int32_t)imm >> 31);
4224   - gen_neon_movl_scratch_T0(1);
4225   - NEON_GET_REG(T0, rm, pass * 2);
4226   - NEON_GET_REG(T1, rm, pass * 2 + 1);
4227   - }
4228   -
4229   - if (gen_neon_shift_im[op][u][size] == NULL)
4230   - return 1;
4231   - gen_neon_shift_im[op][u][size]();
4232   -
4233   - if (op == 1 || op == 3) {
4234   - /* Accumulate. */
4235   - if (size == 3) {
4236   - gen_neon_movl_scratch_T0(0);
4237   - gen_neon_movl_scratch_T1(1);
4238   - NEON_GET_REG(T0, rd, pass * 2);
4239   - NEON_GET_REG(T1, rd, pass * 2 + 1);
4240   - gen_op_neon_addl_u64();
4241   - } else {
4242   - NEON_GET_REG(T1, rd, pass);
4243   - gen_neon_add(size);
4244   - }
4245   - } else if (op == 4 || (op == 5 && u)) {
4246   - /* Insert */
4247   - if (size == 3) {
4248   - cpu_abort(env, "VS[LR]I.64 not implemented");
4249   - }
4250   - switch (size) {
4251   - case 0:
4252   - if (op == 4)
4253   - imm = 0xff >> -shift;
  4253 + if (size == 3) {
  4254 + neon_load_reg64(cpu_V0, rm + pass);
  4255 + tcg_gen_movi_i64(cpu_V1, imm);
  4256 + switch (op) {
  4257 + case 0: /* VSHR */
  4258 + case 1: /* VSRA */
  4259 + if (u)
  4260 + gen_helper_neon_shl_u64(cpu_V0, cpu_V0, cpu_V1);
4254 4261 else
4255   - imm = (uint8_t)(0xff << shift);
4256   - imm |= imm << 8;
4257   - imm |= imm << 16;
  4262 + gen_helper_neon_shl_s64(cpu_V0, cpu_V0, cpu_V1);
4258 4263 break;
4259   - case 1:
4260   - if (op == 4)
4261   - imm = 0xffff >> -shift;
  4264 + case 2: /* VRSHR */
  4265 + case 3: /* VRSRA */
  4266 + if (u)
  4267 + gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, cpu_V1);
4262 4268 else
4263   - imm = (uint16_t)(0xffff << shift);
4264   - imm |= imm << 16;
  4269 + gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, cpu_V1);
4265 4270 break;
4266   - case 2:
4267   - if (op == 4)
4268   - imm = 0xffffffffu >> -shift;
  4271 + case 4: /* VSRI */
  4272 + if (!u)
  4273 + return 1;
  4274 + gen_helper_neon_shl_u64(cpu_V0, cpu_V0, cpu_V1);
  4275 + break;
  4276 + case 5: /* VSHL, VSLI */
  4277 + gen_helper_neon_shl_u64(cpu_V0, cpu_V0, cpu_V1);
  4278 + break;
  4279 + case 6: /* VQSHL */
  4280 + if (u)
  4281 + gen_helper_neon_qshl_u64(cpu_V0, cpu_env, cpu_V0, cpu_V1);
4269 4282 else
4270   - imm = 0xffffffffu << shift;
  4283 + gen_helper_neon_qshl_s64(cpu_V0, cpu_env, cpu_V0, cpu_V1);
  4284 + break;
  4285 + case 7: /* VQSHLU */
  4286 + gen_helper_neon_qshl_u64(cpu_V0, cpu_env, cpu_V0, cpu_V1);
4271 4287 break;
4272   - default:
4273   - abort();
4274 4288 }
4275   - tmp = neon_load_reg(rd, pass);
4276   - tcg_gen_andi_i32(cpu_T[0], cpu_T[0], imm);
4277   - tcg_gen_andi_i32(tmp, tmp, ~imm);
4278   - tcg_gen_or_i32(cpu_T[0], cpu_T[0], tmp);
4279   - }
4280   - if (size == 3) {
4281   - NEON_SET_REG(T0, rd, pass * 2);
4282   - NEON_SET_REG(T1, rd, pass * 2 + 1);
4283   - } else {
  4289 + if (op == 1 || op == 3) {
  4290 + /* Accumulate. */
  4291 + neon_load_reg64(cpu_V1, rd + pass);
  4292 + tcg_gen_add_i64(cpu_V0, cpu_V0, cpu_V1);
  4293 + } else if (op == 4 || (op == 5 && u)) {
  4294 + /* Insert */
  4295 + cpu_abort(env, "VS[LR]I.64 not implemented");
  4296 + }
  4297 + neon_store_reg64(cpu_V0, rd + pass);
  4298 + } else { /* size < 3 */
  4299 + /* Operands in T0 and T1. */
  4300 + gen_op_movl_T1_im(imm);
  4301 + NEON_GET_REG(T0, rm, pass);
  4302 + switch (op) {
  4303 + case 0: /* VSHR */
  4304 + case 1: /* VSRA */
  4305 + GEN_NEON_INTEGER_OP(shl);
  4306 + break;
  4307 + case 2: /* VRSHR */
  4308 + case 3: /* VRSRA */
  4309 + GEN_NEON_INTEGER_OP(rshl);
  4310 + break;
  4311 + case 4: /* VSRI */
  4312 + if (!u)
  4313 + return 1;
  4314 + GEN_NEON_INTEGER_OP(shl);
  4315 + break;
  4316 + case 5: /* VSHL, VSLI */
  4317 + switch (size) {
  4318 + case 0: gen_helper_neon_shl_u8(CPU_T001); break;
  4319 + case 1: gen_helper_neon_shl_u16(CPU_T001); break;
  4320 + case 2: gen_helper_neon_shl_u32(CPU_T001); break;
  4321 + default: return 1;
  4322 + }
  4323 + break;
  4324 + case 6: /* VQSHL */
  4325 + GEN_NEON_INTEGER_OP_ENV(qshl);
  4326 + break;
  4327 + case 7: /* VQSHLU */
  4328 + switch (size) {
  4329 + case 0: gen_helper_neon_qshl_u8(CPU_T0E01); break;
  4330 + case 1: gen_helper_neon_qshl_u16(CPU_T0E01); break;
  4331 + case 2: gen_helper_neon_qshl_u32(CPU_T0E01); break;
  4332 + default: return 1;
  4333 + }
  4334 + break;
  4335 + }
  4336 +
  4337 + if (op == 1 || op == 3) {
  4338 + /* Accumulate. */
  4339 + NEON_GET_REG(T1, rd, pass);
  4340 + gen_neon_add(size);
  4341 + } else if (op == 4 || (op == 5 && u)) {
  4342 + /* Insert */
  4343 + switch (size) {
  4344 + case 0:
  4345 + if (op == 4)
  4346 + imm = 0xff >> -shift;
  4347 + else
  4348 + imm = (uint8_t)(0xff << shift);
  4349 + imm |= imm << 8;
  4350 + imm |= imm << 16;
  4351 + break;
  4352 + case 1:
  4353 + if (op == 4)
  4354 + imm = 0xffff >> -shift;
  4355 + else
  4356 + imm = (uint16_t)(0xffff << shift);
  4357 + imm |= imm << 16;
  4358 + break;
  4359 + case 2:
  4360 + if (op == 4)
  4361 + imm = 0xffffffffu >> -shift;
  4362 + else
  4363 + imm = 0xffffffffu << shift;
  4364 + break;
  4365 + default:
  4366 + abort();
  4367 + }
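/* Worked example: VSLI.8 with shift == 3 gives imm = (uint8_t)(0xff << 3)
   = 0xf8, replicated to 0xf8f8f8f8, so the shifted source supplies bits
   7:3 of each byte lane and the old destination keeps bits 2:0; VSRI.8
   (shift == -3) selects imm = 0xff >> 3 = 0x1f instead. */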
  4368 + tmp = neon_load_reg(rd, pass);
  4369 + tcg_gen_andi_i32(cpu_T[0], cpu_T[0], imm);
  4370 + tcg_gen_andi_i32(tmp, tmp, ~imm);
  4371 + tcg_gen_or_i32(cpu_T[0], cpu_T[0], tmp);
  4372 + }
4284 4373 NEON_SET_REG(T0, rd, pass);
4285 4374 }
4286 4375 } /* for pass */
4287 4376 } else if (op < 10) {
4288   - /* Shift by immedaiate and narrow:
  4377 + /* Shift by immediate and narrow:
4289 4378 VSHRN, VRSHRN, VQSHRN, VQRSHRN. */
4290 4379 shift = shift - (1 << (size + 3));
4291 4380 size++;
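/* The subtraction leaves a negative count: e.g. a VSHRN from 16-bit
   elements with encoded shift 13 yields 13 - 16 = -3, and the shl/rshl
   helpers treat negative counts as right shifts, so the immediate forms
   reuse the variable-shift ops. */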
4292   - if (size == 3) {
4293   - count = q + 1;
4294   - } else {
4295   - count = q ? 4: 2;
4296   - }
4297 4381 switch (size) {
4298 4382 case 1:
4299   - imm = (uint16_t) shift;
  4383 + imm = (uint16_t)shift;
4300 4384 imm |= imm << 16;
  4385 + tmp2 = tcg_const_i32(imm);
4301 4386 break;
4302 4387 case 2:
  4388 + imm = (uint32_t)shift;
  4389 + tmp2 = tcg_const_i32(imm);
  4390 + break;
4303 case 3:
4304   - imm = shift;
  4391 + tmp2 = tcg_const_i64(shift);
4305 4392 break;
4306 4393 default:
4307 4394 abort();
4308 4395 }
4309 4396  
4310   - /* Processing MSB first means we need to do less shuffling at
4311   - the end. */
4312   - for (pass = count - 1; pass >= 0; pass--) {
4313   - /* Avoid clobbering the second operand before it has been
4314   - written. */
4315   - n = pass;
4316   - if (rd == rm)
4317   - n ^= (count - 1);
4318   - else
4319   - n = pass;
4320   -
4321   - if (size < 3) {
4322   - /* Operands in T0 and T1. */
4323   - gen_op_movl_T1_im(imm);
4324   - NEON_GET_REG(T0, rm, n);
  4397 + for (pass = 0; pass < 2; pass++) {
  4398 + if (size == 3) {
  4399 + neon_load_reg64(cpu_V0, rm + pass);
  4400 + if (q) {
  4401 + if (u)
  4402 + gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, tmp2);
  4403 + else
  4404 + gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, tmp2);
  4405 + } else {
  4406 + if (u)
  4407 + gen_helper_neon_shl_u64(cpu_V0, cpu_V0, tmp2);
  4408 + else
  4409 + gen_helper_neon_shl_s64(cpu_V0, cpu_V0, tmp2);
  4410 + }
4325 4411 } else {
4326   - /* Operands in {T0, T1} and env->vfp.scratch. */
4327   - gen_op_movl_T0_im(imm);
4328   - gen_neon_movl_scratch_T0(0);
4329   - gen_op_movl_T0_im((int32_t)imm >> 31);
4330   - gen_neon_movl_scratch_T0(1);
4331   - NEON_GET_REG(T0, rm, n * 2);
4332   - NEON_GET_REG(T0, rm, n * 2 + 1);
  4412 + tmp = neon_load_reg(rm + pass, 0);
  4413 + gen_neon_shift_narrow(size, tmp, tmp2, q, u);
  4414 + tcg_gen_extu_i32_i64(cpu_V0, tmp);
  4415 + dead_tmp(tmp);
  4416 + tmp = neon_load_reg(rm + pass, 1);
  4417 + gen_neon_shift_narrow(size, tmp, tmp2, q, u);
  4418 + tcg_gen_extu_i32_i64(cpu_V1, tmp);
  4419 + dead_tmp(tmp);
  4420 + tcg_gen_shli_i64(cpu_V1, cpu_V1, 32);
  4421 + tcg_gen_or_i64(cpu_V0, cpu_V0, cpu_V1);
4333 4422 }
4334   -
4335   - gen_neon_shift_im_narrow[q][u][size - 1]();
4336   -
4337   - if (size < 3 && (pass & 1) == 0) {
4338   - gen_neon_movl_scratch_T0(0);
  4423 + tmp = new_tmp();
  4424 + if (op == 8 && !u) {
  4425 + gen_neon_narrow(size - 1, tmp, cpu_V0);
4339 4426 } else {
4340   - uint32_t offset;
4341   -
4342   - if (size < 3)
4343   - gen_neon_movl_T1_scratch(0);
4344   -
4345   - if (op == 8 && !u) {
4346   - gen_neon_narrow[size - 1]();
4347   - } else {
4348   - if (op == 8)
4349   - gen_neon_narrow_sats[size - 2]();
4350   - else
4351   - gen_neon_narrow_satu[size - 1]();
4352   - }
4353   - if (size == 3)
4354   - offset = neon_reg_offset(rd, n);
  4427 + if (op == 8)
  4428 + gen_neon_narrow_sats(size - 1, tmp, cpu_V0);
4355 4429 else
4356   - offset = neon_reg_offset(rd, n >> 1);
4357   - gen_op_neon_setreg_T0(offset);
  4430 + gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
  4431 + }
  4432 + if (pass == 0) {
  4433 + tmp3 = tmp;
  4434 + } else {
  4435 + neon_store_reg(rd, 0, tmp3);
  4436 + neon_store_reg(rd, 1, tmp);
4358 4437 }
4359 4438 } /* for pass */
4360 4439 } else if (op == 10) {
4361 4440 /* VSHLL */
4362   - if (q)
  4441 + if (q || size == 3)
4363 4442 return 1;
  4443 + tmp = neon_load_reg(rm, 0);
  4444 + tmp2 = neon_load_reg(rm, 1);
4364 4445 for (pass = 0; pass < 2; pass++) {
4365   - /* Avoid clobbering the input operand. */
4366   - if (rd == rm)
4367   - n = 1 - pass;
4368   - else
4369   - n = pass;
  4446 + if (pass == 1)
  4447 + tmp = tmp2;
  4448 +
  4449 + gen_neon_widen(cpu_V0, tmp, size, u);
4370 4450  
4371   - NEON_GET_REG(T0, rm, n);
4372   - GEN_NEON_INTEGER_OP(widen);
4373 4451 if (shift != 0) {
4374 4452 /* The shift is less than the width of the source
4375   - type, so in some cases we can just
4376   - shift the whole register. */
4377   - if (size == 1 || (size == 0 && u)) {
4378   - gen_op_shll_T0_im(shift);
4379   - gen_op_shll_T1_im(shift);
4380   - } else {
4381   - switch (size) {
4382   - case 0: gen_op_neon_shll_u16(shift); break;
4383   - case 2: gen_op_neon_shll_u64(shift); break;
4384   - default: abort();
  4453 + type, so we can just shift the whole register. */
  4454 + tcg_gen_shli_i64(cpu_V0, cpu_V0, shift);
  4455 + if (size < 2 || !u) {
  4456 + uint64_t imm64;
  4457 + if (size == 0) {
  4458 + imm = (0xffu >> (8 - shift));
  4459 + imm |= imm << 16;
  4460 + } else {
  4461 + imm = 0xffff >> (16 - shift);
4385 4462 }
  4463 + imm64 = imm | (((uint64_t)imm) << 32);
  4464 + tcg_gen_andi_i64(cpu_V0, cpu_V0, ~imm64);
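/* e.g. VSHLL.S8 with shift == 3: imm = 0x07 replicated gives imm64 =
   0x0007000700070007; clearing those bits strips the sign bits that
   spilled into each lane from the lane below during the 64-bit shift. */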
4386 4465 }
4387 4466 }
4388   - NEON_SET_REG(T0, rd, n * 2);
4389   - NEON_SET_REG(T1, rd, n * 2 + 1);
  4467 + neon_store_reg64(cpu_V0, rd + pass);
4390 4468 }
4391 4469 } else if (op == 15 || op == 16) {
4392 4470 /* VCVT fixed-point. */
... ... @@ -4458,28 +4536,30 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4458 4536  
4459 4537 for (pass = 0; pass < (q ? 4 : 2); pass++) {
4460 4538 if (op & 1 && op < 12) {
4461   - NEON_GET_REG(T0, rd, pass);
  4539 + tmp = neon_load_reg(rd, pass);
4462 4540 if (invert) {
4463 4541 /* The immediate value has already been inverted, so
4464 4542 BIC becomes AND. */
4465   - gen_op_andl_T0_T1();
  4543 + tcg_gen_andi_i32(tmp, tmp, imm);
4466 4544 } else {
4467   - gen_op_orl_T0_T1();
  4545 + tcg_gen_ori_i32(tmp, tmp, imm);
4468 4546 }
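/* e.g. a VBIC whose expanded immediate is 0x0000ff00 arrives here already
   inverted as 0xffff00ff, so the plain AND implements dest &= ~imm. */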
4469   - NEON_SET_REG(T0, rd, pass);
4470 4547 } else {
  4548 + /* VMOV, VMVN. */
  4549 + tmp = new_tmp();
4471 4550 if (op == 14 && invert) {
4472   - uint32_t tmp;
4473   - tmp = 0;
  4551 + uint32_t val;
  4552 + val = 0;
4474 4553 for (n = 0; n < 4; n++) {
4475 4554 if (imm & (1 << (n + (pass & 1) * 4)))
4476   - tmp |= 0xff << (n * 8);
  4555 + val |= 0xff << (n * 8);
4477 4556 }
4478   - gen_op_movl_T1_im(tmp);
  4557 + tcg_gen_movi_i32(tmp, val);
  4558 + } else {
  4559 + tcg_gen_movi_i32(tmp, imm);
4479 4560 }
4480   - /* VMOV, VMVN. */
4481   - NEON_SET_REG(T1, rd, pass);
4482 4561 }
  4562 + neon_store_reg(rd, pass, tmp);
4483 4563 }
4484 4564 }
4485 4565 } else { /* (insn & 0x00800010 == 0x00800010) */
... ... @@ -4513,6 +4593,9 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4513 4593 src1_wide = neon_3reg_wide[op][1];
4514 4594 src2_wide = neon_3reg_wide[op][2];
4515 4595  
  4596 + if (size == 0 && (op == 9 || op == 11 || op == 13))
  4597 + return 1;
  4598 +
4516 4599 /* Avoid overlapping operands. Wide source operands are
4517 4600 always aligned so will never overlap with wide
4518 4601 destinations in problematic ways. */
... ... @@ -4524,87 +4607,69 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4524 4607 gen_neon_movl_scratch_T0(2);
4525 4608 }
4526 4609 for (pass = 0; pass < 2; pass++) {
4527   - /* Load the second operand into env->vfp.scratch.
4528   - Also widen narrow operands. */
4529   - if (src2_wide) {
4530   - NEON_GET_REG(T0, rm, pass * 2);
4531   - NEON_GET_REG(T1, rm, pass * 2 + 1);
  4610 + if (src1_wide) {
  4611 + neon_load_reg64(cpu_V0, rn + pass);
4532 4612 } else {
4533   - if (pass == 1 && rd == rm) {
4534   - if (prewiden) {
4535   - gen_neon_movl_T0_scratch(2);
4536   - } else {
4537   - gen_neon_movl_T1_scratch(2);
4538   - }
  4613 + if (pass == 1 && rd == rn) {
  4614 + gen_neon_movl_T0_scratch(2);
  4615 + tmp = new_tmp();
  4616 + tcg_gen_mov_i32(tmp, cpu_T[0]);
4539 4617 } else {
4540   - if (prewiden) {
4541   - NEON_GET_REG(T0, rm, pass);
4542   - } else {
4543   - NEON_GET_REG(T1, rm, pass);
4544   - }
  4618 + tmp = neon_load_reg(rn, pass);
  4619 + }
  4620 + if (prewiden) {
  4621 + gen_neon_widen(cpu_V0, tmp, size, u);
4545 4622 }
4546 4623 }
4547   - if (prewiden && !src2_wide) {
4548   - GEN_NEON_INTEGER_OP(widen);
4549   - }
4550   - if (prewiden || src2_wide) {
4551   - gen_neon_movl_scratch_T0(0);
4552   - gen_neon_movl_scratch_T1(1);
4553   - }
4554   -
4555   - /* Load the first operand. */
4556   - if (src1_wide) {
4557   - NEON_GET_REG(T0, rn, pass * 2);
4558   - NEON_GET_REG(T1, rn, pass * 2 + 1);
  4624 + if (src2_wide) {
  4625 + neon_load_reg64(cpu_V1, rm + pass);
4559 4626 } else {
4560   - if (pass == 1 && rd == rn) {
  4627 + if (pass == 1 && rd == rm) {
4561 4628 gen_neon_movl_T0_scratch(2);
  4629 + tmp2 = new_tmp();
  4630 + tcg_gen_mov_i32(tmp2, cpu_T[0]);
4562 4631 } else {
4563   - NEON_GET_REG(T0, rn, pass);
  4632 + tmp2 = neon_load_reg(rm, pass);
  4633 + }
  4634 + if (prewiden) {
  4635 + gen_neon_widen(cpu_V1, tmp2, size, u);
4564 4636 }
4565   - }
4566   - if (prewiden && !src1_wide) {
4567   - GEN_NEON_INTEGER_OP(widen);
4568 4637 }
4569 4638 switch (op) {
4570 4639 case 0: case 1: case 4: /* VADDL, VADDW, VADDHN, VRADDHN */
4571   - switch (size) {
4572   - case 0: gen_op_neon_addl_u16(); break;
4573   - case 1: gen_op_neon_addl_u32(); break;
4574   - case 2: gen_op_neon_addl_u64(); break;
4575   - default: abort();
4576   - }
  4640 + gen_neon_addl(size);
4577 4641 break;
4578 4642 case 2: case 3: case 6: /* VSUBL, VSUBW, VSUBHL, VRSUBHL */
4579   - switch (size) {
4580   - case 0: gen_op_neon_subl_u16(); break;
4581   - case 1: gen_op_neon_subl_u32(); break;
4582   - case 2: gen_op_neon_subl_u64(); break;
4583   - default: abort();
4584   - }
  4643 + gen_neon_subl(size);
4585 4644 break;
4586 4645 case 5: case 7: /* VABAL, VABDL */
4587 4646 switch ((size << 1) | u) {
4588   - case 0: gen_op_neon_abdl_s16(); break;
4589   - case 1: gen_op_neon_abdl_u16(); break;
4590   - case 2: gen_op_neon_abdl_s32(); break;
4591   - case 3: gen_op_neon_abdl_u32(); break;
4592   - case 4: gen_op_neon_abdl_s64(); break;
4593   - case 5: gen_op_neon_abdl_u64(); break;
  4647 + case 0:
  4648 + gen_helper_neon_abdl_s16(cpu_V0, tmp, tmp2);
  4649 + break;
  4650 + case 1:
  4651 + gen_helper_neon_abdl_u16(cpu_V0, tmp, tmp2);
  4652 + break;
  4653 + case 2:
  4654 + gen_helper_neon_abdl_s32(cpu_V0, tmp, tmp2);
  4655 + break;
  4656 + case 3:
  4657 + gen_helper_neon_abdl_u32(cpu_V0, tmp, tmp2);
  4658 + break;
  4659 + case 4:
  4660 + gen_helper_neon_abdl_s64(cpu_V0, tmp, tmp2);
  4661 + break;
  4662 + case 5:
  4663 + gen_helper_neon_abdl_u64(cpu_V0, tmp, tmp2);
  4664 + break;
4594 4665 default: abort();
4595 4666 }
  4667 + dead_tmp(tmp2);
  4668 + dead_tmp(tmp);
4596 4669 break;
4597 4670 case 8: case 9: case 10: case 11: case 12: case 13:
4598 4671 /* VMLAL, VQDMLAL, VMLSL, VQDMLSL, VMULL, VQDMULL */
4599   - switch ((size << 1) | u) {
4600   - case 0: gen_op_neon_mull_s8(); break;
4601   - case 1: gen_op_neon_mull_u8(); break;
4602   - case 2: gen_op_neon_mull_s16(); break;
4603   - case 3: gen_op_neon_mull_u16(); break;
4604   - case 4: gen_op_imull_T0_T1(); break;
4605   - case 5: gen_op_mull_T0_T1(); break;
4606   - default: abort();
4607   - }
  4672 + gen_neon_mull(cpu_V0, tmp, tmp2, size, u);
4608 4673 break;
4609 4674 case 14: /* Polynomial VMULL */
4610 4675 cpu_abort(env, "Polynomial VMULL not implemented");
... ... @@ -4615,72 +4680,71 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4615 4680 if (op == 5 || op == 13 || (op >= 8 && op <= 11)) {
4616 4681 /* Accumulate. */
4617 4682 if (op == 10 || op == 11) {
4618   - switch (size) {
4619   - case 0: gen_op_neon_negl_u16(); break;
4620   - case 1: gen_op_neon_negl_u32(); break;
4621   - case 2: gen_op_neon_negl_u64(); break;
4622   - default: abort();
4623   - }
  4683 + gen_neon_negl(cpu_V0, size);
4624 4684 }
4625 4685  
4626   - gen_neon_movl_scratch_T0(0);
4627   - gen_neon_movl_scratch_T1(1);
4628   -
4629 4686 if (op != 13) {
4630   - NEON_GET_REG(T0, rd, pass * 2);
4631   - NEON_GET_REG(T1, rd, pass * 2 + 1);
  4687 + neon_load_reg64(cpu_V1, rd + pass);
4632 4688 }
4633 4689  
4634 4690 switch (op) {
4635 4691 case 5: case 8: case 10: /* VABAL, VMLAL, VMLSL */
4636   - switch (size) {
4637   - case 0: gen_op_neon_addl_u16(); break;
4638   - case 1: gen_op_neon_addl_u32(); break;
4639   - case 2: gen_op_neon_addl_u64(); break;
4640   - default: abort();
4641   - }
  4692 + gen_neon_addl(size);
4642 4693 break;
4643 4694 case 9: case 11: /* VQDMLAL, VQDMLSL */
4644   - switch (size) {
4645   - case 1: gen_op_neon_addl_saturate_s32(); break;
4646   - case 2: gen_op_neon_addl_saturate_s64(); break;
4647   - default: abort();
4648   - }
  4695 + gen_neon_addl_saturate(cpu_V0, cpu_V0, size);
  4696 + gen_neon_addl_saturate(cpu_V0, cpu_V1, size);
  4697 + break;
4650 4699 case 13: /* VQDMULL */
4651   - switch (size) {
4652   - case 1: gen_op_neon_addl_saturate_s32(); break;
4653   - case 2: gen_op_neon_addl_saturate_s64(); break;
4654   - default: abort();
4655   - }
  4700 + gen_neon_addl_saturate(cpu_V0, cpu_V0, size);
4656 4701 break;
4657 4702 default:
4658 4703 abort();
4659 4704 }
4660   - NEON_SET_REG(T0, rd, pass * 2);
4661   - NEON_SET_REG(T1, rd, pass * 2 + 1);
  4705 + neon_store_reg64(cpu_V0, rd + pass);
4662 4706 } else if (op == 4 || op == 6) {
4663 4707 /* Narrowing operation. */
  4708 + tmp = new_tmp();
4664 4709 if (u) {
4665 4710 switch (size) {
4666   - case 0: gen_op_neon_narrow_high_u8(); break;
4667   - case 1: gen_op_neon_narrow_high_u16(); break;
4668   - case 2: gen_op_movl_T0_T1(); break;
  4711 + case 0:
  4712 + gen_helper_neon_narrow_high_u8(tmp, cpu_V0);
  4713 + break;
  4714 + case 1:
  4715 + gen_helper_neon_narrow_high_u16(tmp, cpu_V0);
  4716 + break;
  4717 + case 2:
  4718 + tcg_gen_shri_i64(cpu_V0, cpu_V0, 32);
  4719 + tcg_gen_trunc_i64_i32(tmp, cpu_V0);
  4720 + break;
4669 4721 default: abort();
4670 4722 }
4671 4723 } else {
4672 4724 switch (size) {
4673   - case 0: gen_op_neon_narrow_high_round_u8(); break;
4674   - case 1: gen_op_neon_narrow_high_round_u16(); break;
4675   - case 2: gen_op_neon_narrow_high_round_u32(); break;
  4725 + case 0:
  4726 + gen_helper_neon_narrow_round_high_u8(tmp, cpu_V0);
  4727 + break;
  4728 + case 1:
  4729 + gen_helper_neon_narrow_round_high_u16(tmp, cpu_V0);
  4730 + break;
  4731 + case 2:
  4732 + tcg_gen_addi_i64(cpu_V0, cpu_V0, 1u << 31);
  4733 + tcg_gen_shri_i64(cpu_V0, cpu_V0, 32);
  4734 + tcg_gen_trunc_i64_i32(tmp, cpu_V0);
  4735 + break;
4676 4736 default: abort();
4677 4737 }
4678 4738 }
4679   - NEON_SET_REG(T0, rd, pass);
  4739 + if (pass == 0) {
  4740 + tmp3 = tmp;
  4741 + } else {
  4742 + neon_store_reg(rd, 0, tmp3);
  4743 + neon_store_reg(rd, 1, tmp);
  4744 + }
4680 4745 } else {
4681 4746 /* Write back the result. */
4682   - NEON_SET_REG(T0, rd, pass * 2);
4683   - NEON_SET_REG(T1, rd, pass * 2 + 1);
  4747 + neon_store_reg64(cpu_V0, rd + pass);
4684 4748 }
4685 4749 }
4686 4750 } else {
... ... @@ -4702,22 +4766,22 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4702 4766 NEON_GET_REG(T1, rn, pass);
4703 4767 if (op == 12) {
4704 4768 if (size == 1) {
4705   - gen_op_neon_qdmulh_s16();
  4769 + gen_helper_neon_qdmulh_s16(CPU_T0E01);
4706 4770 } else {
4707   - gen_op_neon_qdmulh_s32();
  4771 + gen_helper_neon_qdmulh_s32(CPU_T0E01);
4708 4772 }
4709 4773 } else if (op == 13) {
4710 4774 if (size == 1) {
4711   - gen_op_neon_qrdmulh_s16();
  4775 + gen_helper_neon_qrdmulh_s16(CPU_T0E01);
4712 4776 } else {
4713   - gen_op_neon_qrdmulh_s32();
  4777 + gen_helper_neon_qrdmulh_s32(CPU_T0E01);
4714 4778 }
4715 4779 } else if (op & 1) {
4716   - gen_op_neon_mul_f32();
  4780 + gen_helper_neon_mul_f32(CPU_T001);
4717 4781 } else {
4718 4782 switch (size) {
4719   - case 0: gen_op_neon_mul_u8(); break;
4720   - case 1: gen_op_neon_mul_u16(); break;
  4783 + case 0: gen_helper_neon_mul_u8(CPU_T001); break;
  4784 + case 1: gen_helper_neon_mul_u16(CPU_T001); break;
4721 4785 case 2: gen_op_mul_T0_T1(); break;
4722 4786 default: return 1;
4723 4787 }
... ... @@ -4730,18 +4794,13 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4730 4794 gen_neon_add(size);
4731 4795 break;
4732 4796 case 1:
4733   - gen_op_neon_add_f32();
  4797 + gen_helper_neon_add_f32(CPU_T001);
4734 4798 break;
4735 4799 case 4:
4736   - switch (size) {
4737   - case 0: gen_op_neon_rsb_u8(); break;
4738   - case 1: gen_op_neon_rsb_u16(); break;
4739   - case 2: gen_op_rsbl_T0_T1(); break;
4740   - default: return 1;
4741   - }
  4800 + gen_neon_rsb(size);
4742 4801 break;
4743 4802 case 5:
4744   - gen_op_neon_rsb_f32();
  4803 + gen_helper_neon_sub_f32(cpu_T[0], cpu_T[1], cpu_T[0]);
4745 4804 break;
4746 4805 default:
4747 4806 abort();
... ... @@ -4756,81 +4815,46 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4756 4815 case 7: /* VQDMLSL scalar */
4757 4816 case 10: /* VMULL scalar */
4758 4817 case 11: /* VQDMULL scalar */
4759   - if (rd == rn) {
4760   - /* Save overlapping operands before they are
4761   - clobbered. */
4762   - NEON_GET_REG(T0, rn, 1);
4763   - gen_neon_movl_scratch_T0(2);
4764   - }
  4818 + if (size == 0 && (op == 3 || op == 7 || op == 11))
  4819 + return 1;
  4820 +
4765 4821 gen_neon_get_scalar(size, rm);
4766   - gen_neon_movl_scratch_T0(3);
  4822 + NEON_GET_REG(T1, rn, 1);
  4823 +
4767 4824 for (pass = 0; pass < 2; pass++) {
4768   - if (pass != 0) {
4769   - gen_neon_movl_T0_scratch(3);
4770   - }
4771   - if (pass != 0 && rd == rn) {
4772   - gen_neon_movl_T1_scratch(2);
  4825 + if (pass == 0) {
  4826 + tmp = neon_load_reg(rn, 0);
4773 4827 } else {
4774   - NEON_GET_REG(T1, rn, pass);
4775   - }
4776   - switch ((size << 1) | u) {
4777   - case 0: gen_op_neon_mull_s8(); break;
4778   - case 1: gen_op_neon_mull_u8(); break;
4779   - case 2: gen_op_neon_mull_s16(); break;
4780   - case 3: gen_op_neon_mull_u16(); break;
4781   - case 4: gen_op_imull_T0_T1(); break;
4782   - case 5: gen_op_mull_T0_T1(); break;
4783   - default: abort();
  4828 + tmp = new_tmp();
  4829 + tcg_gen_mov_i32(tmp, cpu_T[1]);
4784 4830 }
  4831 + tmp2 = new_tmp();
  4832 + tcg_gen_mov_i32(tmp2, cpu_T[0]);
  4833 + gen_neon_mull(cpu_V0, tmp, tmp2, size, u);
4785 4834 if (op == 6 || op == 7) {
4786   - switch (size) {
4787   - case 0: gen_op_neon_negl_u16(); break;
4788   - case 1: gen_op_neon_negl_u32(); break;
4789   - case 2: gen_op_neon_negl_u64(); break;
4790   - default: abort();
4791   - }
  4835 + gen_neon_negl(cpu_V0, size);
  4836 + }
  4837 + if (op != 11) {
  4838 + neon_load_reg64(cpu_V1, rd + pass);
4792 4839 }
4793   - gen_neon_movl_scratch_T0(0);
4794   - gen_neon_movl_scratch_T1(1);
4795   - NEON_GET_REG(T0, rd, pass * 2);
4796   - NEON_GET_REG(T1, rd, pass * 2 + 1);
4797 4840 switch (op) {
4798 4841 case 2: case 6:
4799   - switch (size) {
4800   - case 0: gen_op_neon_addl_u16(); break;
4801   - case 1: gen_op_neon_addl_u32(); break;
4802   - case 2: gen_op_neon_addl_u64(); break;
4803   - default: abort();
4804   - }
  4842 + gen_neon_addl(size);
4805 4843 break;
4806 4844 case 3: case 7:
4807   - switch (size) {
4808   - case 1:
4809   - gen_op_neon_addl_saturate_s32();
4810   - gen_op_neon_addl_saturate_s32();
4811   - break;
4812   - case 2:
4813   - gen_op_neon_addl_saturate_s64();
4814   - gen_op_neon_addl_saturate_s64();
4815   - break;
4816   - default: abort();
4817   - }
  4845 + gen_neon_addl_saturate(cpu_V0, cpu_V0, size);
  4846 + gen_neon_addl_saturate(cpu_V0, cpu_V1, size);
4818 4847 break;
4819 4848 case 10:
4820 4849 /* no-op */
4821 4850 break;
4822 4851 case 11:
4823   - switch (size) {
4824   - case 1: gen_op_neon_addl_saturate_s32(); break;
4825   - case 2: gen_op_neon_addl_saturate_s64(); break;
4826   - default: abort();
4827   - }
  4852 + gen_neon_addl_saturate(cpu_V0, cpu_V0, size);
4828 4853 break;
4829 4854 default:
4830 4855 abort();
4831 4856 }
4832   - NEON_SET_REG(T0, rd, pass * 2);
4833   - NEON_SET_REG(T1, rd, pass * 2 + 1);
  4857 + neon_store_reg64(cpu_V0, rd + pass);
4834 4858 }
4835 4859 break;
4836 4860 default: /* 14 and 15 are RESERVED */
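The VQDMLAL/VQDMLSL/VQDMULL scalar path above reduces to three steps: a widening multiply (gen_neon_mull), an optional negate for the subtracting forms, and saturating additions at the widened element size, where the doubling itself is expressed as a saturating add of the product to itself. A rough scalar model of the size == 2 VQDMLAL case (names are illustrative; the in-tree helpers also latch the QC saturation flag in env, which is omitted here):

    #include <stdint.h>

    /* Saturating signed 64-bit add: clamp on overflow instead of wrapping. */
    static int64_t sat_add_s64(int64_t a, int64_t b)
    {
        int64_t r = (int64_t)((uint64_t)a + (uint64_t)b);
        if (((a ^ r) & (b ^ r)) < 0)   /* same input signs, flipped result */
            r = (a < 0) ? INT64_MIN : INT64_MAX;
        return r;
    }

    /* VQDMLAL, 32-bit elements: acc + sat(2 * a * b), all adds saturating. */
    static int64_t vqdmlal_s32_model(int64_t acc, int32_t a, int32_t b)
    {
        int64_t prod = (int64_t)a * b;   /* widening multiply */
        prod = sat_add_s64(prod, prod);  /* doubling as a saturating add */
        return sat_add_s64(acc, prod);   /* saturating accumulate */
    }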
... ... @@ -4840,29 +4864,53 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4840 4864 } else { /* size == 3 */
4841 4865 if (!u) {
4842 4866 /* Extract. */
4843   - int reg;
4844 4867 imm = (insn >> 8) & 0xf;
4845   - reg = rn;
4846   - count = q ? 4 : 2;
4847   - n = imm >> 2;
4848   - NEON_GET_REG(T0, reg, n);
4849   - for (pass = 0; pass < count; pass++) {
4850   - n++;
4851   - if (n > count) {
4852   - reg = rm;
4853   - n -= count;
  4868 + count = q + 1;
  4869 +
  4870 + if (imm > 7 && !q)
  4871 + return 1;
  4872 +
  4873 + if (imm == 0) {
  4874 + neon_load_reg64(cpu_V0, rn);
  4875 + if (q) {
  4876 + neon_load_reg64(cpu_V1, rn + 1);
4854 4877 }
4855   - if (imm & 3) {
4856   - NEON_GET_REG(T1, reg, n);
4857   - gen_op_neon_extract((insn << 3) & 0x1f);
  4878 + } else if (imm == 8) {
  4879 + neon_load_reg64(cpu_V0, rn + 1);
  4880 + if (q) {
  4881 + neon_load_reg64(cpu_V1, rm);
4858 4882 }
4859   - /* ??? This is broken if rd and rm overlap */
4860   - NEON_SET_REG(T0, rd, pass);
4861   - if (imm & 3) {
4862   - gen_op_movl_T0_T1();
  4883 + } else if (q) {
  4884 + tmp = tcg_temp_new(TCG_TYPE_I64);
  4885 + if (imm < 8) {
  4886 + neon_load_reg64(cpu_V0, rn);
  4887 + neon_load_reg64(tmp, rn + 1);
  4888 + } else {
  4889 + neon_load_reg64(cpu_V0, rn + 1);
  4890 + neon_load_reg64(tmp, rm);
  4891 + }
  4892 + tcg_gen_shri_i64(cpu_V0, cpu_V0, (imm & 7) * 8);
  4893 + tcg_gen_shli_i64(cpu_V1, tmp, 64 - ((imm & 7) * 8));
  4894 + tcg_gen_or_i64(cpu_V0, cpu_V0, cpu_V1);
  4895 + if (imm < 8) {
  4896 + neon_load_reg64(cpu_V1, rm);
4863 4897 } else {
4864   - NEON_GET_REG(T0, reg, n);
  4898 + neon_load_reg64(cpu_V1, rm + 1);
  4899 + imm -= 8;
4865 4900 }
  4901 + tcg_gen_shli_i64(cpu_V1, cpu_V1, 64 - (imm * 8));
  4902 + tcg_gen_shri_i64(tmp, tmp, imm * 8);
  4903 + tcg_gen_or_i64(cpu_V1, cpu_V1, tmp);
  4904 + } else {
  4905 + neon_load_reg64(cpu_V0, rn);
  4906 + tcg_gen_shri_i64(cpu_V0, cpu_V0, imm * 8);
  4907 + neon_load_reg64(cpu_V1, rm);
  4908 + tcg_gen_shli_i64(cpu_V1, cpu_V1, 64 - (imm * 8));
  4909 + tcg_gen_or_i64(cpu_V0, cpu_V0, cpu_V1);
  4910 + }
  4911 + neon_store_reg64(cpu_V0, rd);
  4912 + if (q) {
  4913 + neon_store_reg64(cpu_V1, rd + 1);
4866 4914 }
4867 4915 } else if ((insn & (1 << 11)) == 0) {
4868 4916 /* Two register misc. */
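On the extract (VEXT) rewrite above: the source pair is treated as one little-endian byte string, and the result is assembled with 64-bit shifts and ORs instead of the old per-element loop (imm == 0 and imm == 8 degenerate to straight register moves, and imm > 7 with q clear is rejected). A plain-C model of the doubleword case; the quad case applies the same idea to both halves:

    #include <stdint.h>

    /* VEXT, doubleword: bytes imm..imm+7 of the 16-byte value {Vm:Vn},
       where Vn supplies the low eight bytes. */
    static uint64_t vext_d_model(uint64_t vn, uint64_t vm, unsigned imm)
    {
        if (imm == 0)                 /* avoid the undefined 64-bit shift */
            return vn;
        return (vn >> (imm * 8)) | (vm << (64 - (imm * 8)));
    }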
... ... @@ -4897,28 +4945,25 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4897 4945 break;
4898 4946 case 4: case 5: /* VPADDL */
4899 4947 case 12: case 13: /* VPADAL */
4900   - if (size < 2)
4901   - goto elementwise;
4902 4948 if (size == 3)
4903 4949 return 1;
4904   - for (pass = 0; pass < (q ? 2 : 1); pass++) {
4905   - NEON_GET_REG(T0, rm, pass * 2);
4906   - NEON_GET_REG(T1, rm, pass * 2 + 1);
4907   - if (op & 1)
4908   - gen_op_neon_paddl_u32();
4909   - else
4910   - gen_op_neon_paddl_s32();
  4950 + for (pass = 0; pass < q + 1; pass++) {
  4951 + tmp = neon_load_reg(rm, pass * 2);
  4952 + gen_neon_widen(cpu_V0, tmp, size, op & 1);
  4953 + tmp = neon_load_reg(rm, pass * 2 + 1);
  4954 + gen_neon_widen(cpu_V1, tmp, size, op & 1);
  4955 + switch (size) {
  4956 + case 0: gen_helper_neon_paddl_u16(CPU_V001); break;
  4957 + case 1: gen_helper_neon_paddl_u32(CPU_V001); break;
  4958 + case 2: tcg_gen_add_i64(CPU_V001); break;
  4959 + default: abort();
  4960 + }
4911 4961 if (op >= 12) {
4912 4962 /* Accumulate. */
4913   - gen_neon_movl_scratch_T0(0);
4914   - gen_neon_movl_scratch_T1(1);
4915   -
4916   - NEON_GET_REG(T0, rd, pass * 2);
4917   - NEON_GET_REG(T1, rd, pass * 2 + 1);
4918   - gen_op_neon_addl_u64();
  4963 + neon_load_reg64(cpu_V1, rd + pass);
  4964 + gen_neon_addl(size);
4919 4965 }
4920   - NEON_SET_REG(T0, rd, pass * 2);
4921   - NEON_SET_REG(T1, rd, pass * 2 + 1);
  4966 + neon_store_reg64(cpu_V0, rd + pass);
4922 4967 }
4923 4968 break;
4924 4969 case 33: /* VTRN */
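VPADDL/VPADAL above first widen each half of the source, then do the pairwise add on the already-widened lanes, so one helper per widened element size suffices (VPADAL additionally accumulates into rd via the op >= 12 branch). A scalar model of the size == 0 step under the assumed lane layout, where each 64-bit input holds four 16-bit lanes widened from bytes, so the pairwise sums cannot overflow 16 bits:

    #include <stdint.h>

    /* Pairwise add of adjacent 16-bit lanes: result lanes 0-1 come from a,
       lanes 2-3 from b. */
    static uint64_t paddl_u16_model(uint64_t a, uint64_t b)
    {
        uint64_t r = 0;
        int i;
        for (i = 0; i < 2; i++)
            r |= (((a >> (32 * i)) & 0xffff) +
                  ((a >> (32 * i + 16)) & 0xffff)) << (16 * i);
        for (i = 0; i < 2; i++)
            r |= (((b >> (32 * i)) & 0xffff) +
                  ((b >> (32 * i + 16)) & 0xffff)) << (16 * (i + 2));
        return r;
    }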
... ... @@ -4972,8 +5017,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4972 5017 NEON_GET_REG(T0, rd, n);
4973 5018 NEON_GET_REG(T1, rd, n);
4974 5019 switch (size) {
4975   - case 0: gen_op_neon_zip_u8(); break;
4976   - case 1: gen_op_neon_zip_u16(); break;
  5020 + case 0: gen_helper_neon_zip_u8(); break;
  5021 + case 1: gen_helper_neon_zip_u16(); break;
4977 5022 case 2: /* no-op */; break;
4978 5023 default: abort();
4979 5024 }
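The VZIP step above now goes through a helper but still interleaves per 32-bit chunk. A model of the u8 case (assumed semantics: the two chunks are interleaved byte-wise, the low interleaving landing in the first result and the high in the second):

    #include <stdint.h>

    /* Byte-interleave two 32-bit chunks in place. */
    static void zip_u8_model(uint32_t *t0, uint32_t *t1)
    {
        uint32_t a = *t0, b = *t1;
        *t0 =  (a & 0xff)
            | ((b & 0xff) << 8)
            | (((a >> 8) & 0xff) << 16)
            | (((b >> 8) & 0xff) << 24);
        *t1 =  ((a >> 16) & 0xff)
            | (((b >> 16) & 0xff) << 8)
            | (((a >> 24) & 0xff) << 16)
            |  ((b >> 24) << 24);
    }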
... ... @@ -4987,63 +5032,36 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4987 5032 }
4988 5033 break;
4989 5034 case 36: case 37: /* VMOVN, VQMOVUN, VQMOVN */
  5035 + if (size == 3)
  5036 + return 1;
4990 5037 for (pass = 0; pass < 2; pass++) {
4991   - if (rd == rm + 1) {
4992   - n = 1 - pass;
4993   - } else {
4994   - n = pass;
4995   - }
4996   - NEON_GET_REG(T0, rm, n * 2);
4997   - NEON_GET_REG(T1, rm, n * 2 + 1);
  5038 + neon_load_reg64(cpu_V0, rm + pass);
  5039 + tmp = new_tmp();
4998 5040 if (op == 36 && q == 0) {
4999   - switch (size) {
5000   - case 0: gen_op_neon_narrow_u8(); break;
5001   - case 1: gen_op_neon_narrow_u16(); break;
5002   - case 2: /* no-op */ break;
5003   - default: return 1;
5004   - }
  5041 + gen_neon_narrow(size, tmp, cpu_V0);
5005 5042 } else if (q) {
5006   - switch (size) {
5007   - case 0: gen_op_neon_narrow_sat_u8(); break;
5008   - case 1: gen_op_neon_narrow_sat_u16(); break;
5009   - case 2: gen_op_neon_narrow_sat_u32(); break;
5010   - default: return 1;
5011   - }
  5043 + gen_neon_narrow_satu(size, tmp, cpu_V0);
5012 5044 } else {
5013   - switch (size) {
5014   - case 0: gen_op_neon_narrow_sat_s8(); break;
5015   - case 1: gen_op_neon_narrow_sat_s16(); break;
5016   - case 2: gen_op_neon_narrow_sat_s32(); break;
5017   - default: return 1;
5018   - }
  5045 + gen_neon_narrow_sats(size, tmp, cpu_V0);
  5046 + }
  5047 + if (pass == 0) {
  5048 + tmp2 = tmp;
  5049 + } else {
  5050 + neon_store_reg(rd, 0, tmp2);
  5051 + neon_store_reg(rd, 1, tmp);
5019 5052 }
5020   - NEON_SET_REG(T0, rd, n);
5021 5053 }
5022 5054 break;
5023 5055 case 38: /* VSHLL */
5024   - if (q)
  5056 + if (q || size == 3)
5025 5057 return 1;
5026   - if (rm == rd) {
5027   - NEON_GET_REG(T0, rm, 1);
5028   - gen_neon_movl_scratch_T0(0);
5029   - }
  5058 + tmp = neon_load_reg(rm, 0);
  5059 + tmp2 = neon_load_reg(rm, 1);
5030 5060 for (pass = 0; pass < 2; pass++) {
5031   - if (pass == 1 && rm == rd) {
5032   - gen_neon_movl_T0_scratch(0);
5033   - } else {
5034   - NEON_GET_REG(T0, rm, pass);
5035   - }
5036   - switch (size) {
5037   - case 0: gen_op_neon_widen_high_u8(); break;
5038   - case 1: gen_op_neon_widen_high_u16(); break;
5039   - case 2:
5040   - gen_op_movl_T1_T0();
5041   - gen_op_movl_T0_im(0);
5042   - break;
5043   - default: return 1;
5044   - }
5045   - NEON_SET_REG(T0, rd, pass * 2);
5046   - NEON_SET_REG(T1, rd, pass * 2 + 1);
  5061 + if (pass == 1)
  5062 + tmp = tmp2;
  5063 + gen_neon_widen(cpu_V0, tmp, size, 1);
  5064 + neon_store_reg64(cpu_V0, rd + pass);
5047 5065 }
5048 5066 break;
5049 5067 default:
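Two things are worth noting in the VMOVN/VQMOVN rewrite above: the narrowed 32-bit halves are parked in temporaries until both passes finish, so rd == rm no longer needs the old reordering dance, and VSHLL likewise loads both source halves up front before widening. A scalar model of the unsigned saturating narrow for size == 0 (the in-tree helpers additionally record saturation in the QC flag, omitted here):

    #include <stdint.h>

    /* Narrow four u16 lanes to four u8 lanes, clamping each to 255. */
    static uint32_t narrow_sat_u8_model(uint64_t x)
    {
        uint32_t r = 0;
        int i;
        for (i = 0; i < 4; i++) {
            uint16_t lane = (uint16_t)(x >> (16 * i));
            r |= (uint32_t)(lane > 0xff ? 0xff : lane) << (8 * i);
        }
        return r;
    }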
... ... @@ -5068,37 +5086,18 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5068 5086 return 1;
5069 5087 gen_rev16(cpu_T[0]);
5070 5088 break;
5071   - case 4: case 5: /* VPADDL */
5072   - case 12: case 13: /* VPADAL */
5073   - switch ((size << 1) | (op & 1)) {
5074   - case 0: gen_op_neon_paddl_s8(); break;
5075   - case 1: gen_op_neon_paddl_u8(); break;
5076   - case 2: gen_op_neon_paddl_s16(); break;
5077   - case 3: gen_op_neon_paddl_u16(); break;
5078   - default: abort();
5079   - }
5080   - if (op >= 12) {
5081   - /* Accumulate */
5082   - NEON_GET_REG(T1, rd, pass);
5083   - switch (size) {
5084   - case 0: gen_op_neon_add_u16(); break;
5085   - case 1: gen_op_addl_T0_T1(); break;
5086   - default: abort();
5087   - }
5088   - }
5089   - break;
5090 5089 case 8: /* CLS */
5091 5090 switch (size) {
5092   - case 0: gen_op_neon_cls_s8(); break;
5093   - case 1: gen_op_neon_cls_s16(); break;
5094   - case 2: gen_op_neon_cls_s32(); break;
  5091 + case 0: gen_helper_neon_cls_s8(cpu_T[0], cpu_T[0]); break;
  5092 + case 1: gen_helper_neon_cls_s16(cpu_T[0], cpu_T[0]); break;
  5093 + case 2: gen_helper_neon_cls_s32(cpu_T[0], cpu_T[0]); break;
5095 5094 default: return 1;
5096 5095 }
5097 5096 break;
5098 5097 case 9: /* CLZ */
5099 5098 switch (size) {
5100   - case 0: gen_op_neon_clz_u8(); break;
5101   - case 1: gen_op_neon_clz_u16(); break;
  5099 + case 0: gen_helper_neon_clz_u8(cpu_T[0], cpu_T[0]); break;
  5100 + case 1: gen_helper_neon_clz_u16(cpu_T[0], cpu_T[0]); break;
5102 5101 case 2: gen_helper_clz(cpu_T[0], cpu_T[0]); break;
5103 5102 default: return 1;
5104 5103 }
... ... @@ -5106,7 +5105,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5106 5105 case 10: /* CNT */
5107 5106 if (size != 0)
5108 5107 return 1;
5109   - gen_op_neon_cnt_u8();
  5108 + gen_helper_neon_cnt_u8(cpu_T[0], cpu_T[0]);
5110 5109 break;
5111 5110 case 11: /* VNOT */
5112 5111 if (size != 0)
... ... @@ -5115,26 +5114,26 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5115 5114 break;
5116 5115 case 14: /* VQABS */
5117 5116 switch (size) {
5118   - case 0: gen_op_neon_qabs_s8(); break;
5119   - case 1: gen_op_neon_qabs_s16(); break;
5120   - case 2: gen_op_neon_qabs_s32(); break;
  5117 + case 0: gen_helper_neon_qabs_s8(cpu_T[0], cpu_env, cpu_T[0]); break;
  5118 + case 1: gen_helper_neon_qabs_s16(cpu_T[0], cpu_env, cpu_T[0]); break;
  5119 + case 2: gen_helper_neon_qabs_s32(cpu_T[0], cpu_env, cpu_T[0]); break;
5121 5120 default: return 1;
5122 5121 }
5123 5122 break;
5124 5123 case 15: /* VQNEG */
5125 5124 switch (size) {
5126   - case 0: gen_op_neon_qneg_s8(); break;
5127   - case 1: gen_op_neon_qneg_s16(); break;
5128   - case 2: gen_op_neon_qneg_s32(); break;
  5125 + case 0: gen_helper_neon_qneg_s8(cpu_T[0], cpu_env, cpu_T[0]); break;
  5126 + case 1: gen_helper_neon_qneg_s16(cpu_T[0], cpu_env, cpu_T[0]); break;
  5127 + case 2: gen_helper_neon_qneg_s32(cpu_T[0], cpu_env, cpu_T[0]); break;
5129 5128 default: return 1;
5130 5129 }
5131 5130 break;
5132 5131 case 16: case 19: /* VCGT #0, VCLE #0 */
5133 5132 gen_op_movl_T1_im(0);
5134 5133 switch(size) {
5135   - case 0: gen_op_neon_cgt_s8(); break;
5136   - case 1: gen_op_neon_cgt_s16(); break;
5137   - case 2: gen_op_neon_cgt_s32(); break;
  5134 + case 0: gen_helper_neon_cgt_s8(CPU_T001); break;
  5135 + case 1: gen_helper_neon_cgt_s16(CPU_T001); break;
  5136 + case 2: gen_helper_neon_cgt_s32(CPU_T001); break;
5138 5137 default: return 1;
5139 5138 }
5140 5139 if (op == 19)
... ... @@ -5143,9 +5142,9 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5143 5142 case 17: case 20: /* VCGE #0, VCLT #0 */
5144 5143 gen_op_movl_T1_im(0);
5145 5144 switch(size) {
5146   - case 0: gen_op_neon_cge_s8(); break;
5147   - case 1: gen_op_neon_cge_s16(); break;
5148   - case 2: gen_op_neon_cge_s32(); break;
  5145 + case 0: gen_helper_neon_cge_s8(CPU_T001); break;
  5146 + case 1: gen_helper_neon_cge_s16(CPU_T001); break;
  5147 + case 2: gen_helper_neon_cge_s32(CPU_T001); break;
5149 5148 default: return 1;
5150 5149 }
5151 5150 if (op == 20)
... ... @@ -5154,44 +5153,41 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5154 5153 case 18: /* VCEQ #0 */
5155 5154 gen_op_movl_T1_im(0);
5156 5155 switch(size) {
5157   - case 0: gen_op_neon_ceq_u8(); break;
5158   - case 1: gen_op_neon_ceq_u16(); break;
5159   - case 2: gen_op_neon_ceq_u32(); break;
  5156 + case 0: gen_helper_neon_ceq_u8(CPU_T001); break;
  5157 + case 1: gen_helper_neon_ceq_u16(CPU_T001); break;
  5158 + case 2: gen_helper_neon_ceq_u32(CPU_T001); break;
5160 5159 default: return 1;
5161 5160 }
5162 5161 break;
5163 5162 case 22: /* VABS */
5164 5163 switch(size) {
5165   - case 0: gen_op_neon_abs_s8(); break;
5166   - case 1: gen_op_neon_abs_s16(); break;
5167   - case 2: gen_op_neon_abs_s32(); break;
  5164 + case 0: gen_helper_neon_abs_s8(cpu_T[0], cpu_T[0]); break;
  5165 + case 1: gen_helper_neon_abs_s16(cpu_T[0], cpu_T[0]); break;
  5166 + case 2: tcg_gen_abs_i32(cpu_T[0], cpu_T[0]); break;
5168 5167 default: return 1;
5169 5168 }
5170 5169 break;
5171 5170 case 23: /* VNEG */
5172 5171 gen_op_movl_T1_im(0);
5173   - switch(size) {
5174   - case 0: gen_op_neon_rsb_u8(); break;
5175   - case 1: gen_op_neon_rsb_u16(); break;
5176   - case 2: gen_op_rsbl_T0_T1(); break;
5177   - default: return 1;
5178   - }
  5172 + if (size == 3)
  5173 + return 1;
  5174 + gen_neon_rsb(size);
5179 5175 break;
5180 5176 case 24: case 27: /* Float VCGT #0, Float VCLE #0 */
5181 5177 gen_op_movl_T1_im(0);
5182   - gen_op_neon_cgt_f32();
  5178 + gen_helper_neon_cgt_f32(CPU_T001);
5183 5179 if (op == 27)
5184 5180 gen_op_notl_T0();
5185 5181 break;
5186 5182 case 25: case 28: /* Float VCGE #0, Float VCLT #0 */
5187 5183 gen_op_movl_T1_im(0);
5188   - gen_op_neon_cge_f32();
  5184 + gen_helper_neon_cge_f32(CPU_T001);
5189 5185 if (op == 28)
5190 5186 gen_op_notl_T0();
5191 5187 break;
5192 5188 case 26: /* Float VCEQ #0 */
5193 5189 gen_op_movl_T1_im(0);
5194   - gen_op_neon_ceq_f32();
  5190 + gen_helper_neon_ceq_f32(CPU_T001);
5195 5191 break;
5196 5192 case 30: /* Float VABS */
5197 5193 gen_vfp_abs(0);
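The compare-with-zero cases above, integer and float alike, exploit the fact that NEON comparisons yield all-ones/all-zeros lane masks: the <= and < variants are simply the bitwise complement of > and >=, which is what the gen_op_notl_T0() calls after ops 19/20/27/28 implement. A one-lane model:

    #include <stdint.h>

    /* A signed >= compare as a lane mask: all ones when true, else zero. */
    static uint32_t cge_s32_mask(int32_t a, int32_t b)
    {
        return a >= b ? ~0u : 0u;
    }

    /* VCLT #0 falls out as the complement of VCGE #0. */
    static uint32_t clt0_s32_mask(int32_t a)
    {
        return ~cge_s32_mask(a, 0);
    }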
... ... @@ -5206,8 +5202,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5206 5202 case 33: /* VTRN */
5207 5203 NEON_GET_REG(T1, rd, pass);
5208 5204 switch (size) {
5209   - case 0: gen_op_neon_trn_u8(); break;
5210   - case 1: gen_op_neon_trn_u16(); break;
  5205 + case 0: gen_helper_neon_trn_u8(); break;
  5206 + case 1: gen_helper_neon_trn_u16(); break;
5211 5207 case 2: abort();
5212 5208 default: return 1;
5213 5209 }
... ... @@ -5281,12 +5277,12 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5281 5277 NEON_SET_REG(T0, rm, 0);
5282 5278 }
5283 5279 if (insn & (1 << 16)) {
5284   - gen_op_neon_dup_u8(((insn >> 17) & 3) * 8);
  5280 + gen_neon_dup_u8(cpu_T[0], ((insn >> 17) & 3) * 8);
5285 5281 } else if (insn & (1 << 17)) {
5286 5282 if ((insn >> 18) & 1)
5287   - gen_op_neon_dup_high16();
  5283 + gen_neon_dup_high16(cpu_T[0]);
5288 5284 else
5289   - gen_op_neon_dup_low16();
  5285 + gen_neon_dup_low16(cpu_T[0]);
5290 5286 }
5291 5287 for (pass = 0; pass < (q ? 4 : 2); pass++) {
5292 5288 NEON_SET_REG(T0, rd, pass);
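The VDUP scalar path above replicates one element of a 32-bit word across all lanes via gen_neon_dup_u8 and gen_neon_dup_low16/high16. A model of the byte case (assumed semantics, taking the same bit offset the translator passes in; the 16-bit variants are analogous):

    #include <stdint.h>

    /* Duplicate the byte at bit offset 'shift' into all four byte lanes. */
    static uint32_t dup_u8_model(uint32_t x, int shift)
    {
        uint32_t b = (x >> shift) & 0xff;
        return b * 0x01010101u;
    }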
... ... @@ -8324,6 +8320,8 @@ static inline int gen_intermediate_code_internal(CPUState *env,
8324 8320 cpu_F1s = tcg_temp_new(TCG_TYPE_I32);
8325 8321 cpu_F0d = tcg_temp_new(TCG_TYPE_I64);
8326 8322 cpu_F1d = tcg_temp_new(TCG_TYPE_I64);
  8323 + cpu_V0 = cpu_F0d;
  8324 + cpu_V1 = cpu_F1d;
8327 8325 next_page_start = (pc_start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
8328 8326 lj = -1;
8329 8327 /* Reset the conditional execution bits immediately. This avoids
... ...
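One design point in the last hunk: the NEON 64-bit working values cpu_V0/cpu_V1 are aliased to the VFP double-precision temporaries cpu_F0d/cpu_F1d rather than allocated as fresh I64 temporaries, presumably because no single decoded instruction uses both sets of scratch registers at once.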