Commit ad69471ce5e1284e1cacd053bb0fe8d6175a2f9e

Authored by pbrook
1 parent 8f8e3aa4

ARM TCG conversion 14/16.

git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@4151 c046a42c-6fe2-441c-8c8c-71466251a162
Makefile.target
@@ -211,7 +211,7 @@ LIBOBJS+= op_helper.o helper.o @@ -211,7 +211,7 @@ LIBOBJS+= op_helper.o helper.o
211 endif 211 endif
212 212
213 ifeq ($(TARGET_BASE_ARCH), arm) 213 ifeq ($(TARGET_BASE_ARCH), arm)
214 -LIBOBJS+= op_helper.o helper.o 214 +LIBOBJS+= op_helper.o helper.o neon_helper.o
215 endif 215 endif
216 216
217 ifeq ($(TARGET_BASE_ARCH), sh4) 217 ifeq ($(TARGET_BASE_ARCH), sh4)
target-arm/helper.c
@@ -256,30 +256,6 @@ void cpu_arm_close(CPUARMState *env) @@ -256,30 +256,6 @@ void cpu_arm_close(CPUARMState *env)
256 free(env); 256 free(env);
257 } 257 }
258 258
259 -/* Polynomial multiplication is like integer multiplcation except the  
260 - partial products are XORed, not added. */  
261 -uint32_t helper_neon_mul_p8(uint32_t op1, uint32_t op2)  
262 -{  
263 - uint32_t mask;  
264 - uint32_t result;  
265 - result = 0;  
266 - while (op1) {  
267 - mask = 0;  
268 - if (op1 & 1)  
269 - mask |= 0xff;  
270 - if (op1 & (1 << 8))  
271 - mask |= (0xff << 8);  
272 - if (op1 & (1 << 16))  
273 - mask |= (0xff << 16);  
274 - if (op1 & (1 << 24))  
275 - mask |= (0xff << 24);  
276 - result ^= op2 & mask;  
277 - op1 = (op1 >> 1) & 0x7f7f7f7f;  
278 - op2 = (op2 << 1) & 0xfefefefe;  
279 - }  
280 - return result;  
281 -}  
282 -  
283 uint32_t cpsr_read(CPUARMState *env) 259 uint32_t cpsr_read(CPUARMState *env)
284 { 260 {
285 int ZF; 261 int ZF;
@@ -376,6 +352,11 @@ uint32_t HELPER(rbit)(uint32_t x) @@ -376,6 +352,11 @@ uint32_t HELPER(rbit)(uint32_t x)
376 return x; 352 return x;
377 } 353 }
378 354
  355 +uint32_t HELPER(abs)(uint32_t x)
  356 +{
  357 + return ((int32_t)x < 0) ? -x : x;
  358 +}
  359 +
379 #if defined(CONFIG_USER_ONLY) 360 #if defined(CONFIG_USER_ONLY)
380 361
381 void do_interrupt (CPUState *env) 362 void do_interrupt (CPUState *env)
target-arm/helpers.h
@@ -84,6 +84,7 @@ DEF_HELPER_1_1(double_saturate, uint32_t, (int32_t)) @@ -84,6 +84,7 @@ DEF_HELPER_1_1(double_saturate, uint32_t, (int32_t))
84 DEF_HELPER_1_2(sdiv, int32_t, (int32_t, int32_t)) 84 DEF_HELPER_1_2(sdiv, int32_t, (int32_t, int32_t))
85 DEF_HELPER_1_2(udiv, uint32_t, (uint32_t, uint32_t)) 85 DEF_HELPER_1_2(udiv, uint32_t, (uint32_t, uint32_t))
86 DEF_HELPER_1_1(rbit, uint32_t, (uint32_t)) 86 DEF_HELPER_1_1(rbit, uint32_t, (uint32_t))
  87 +DEF_HELPER_1_1(abs, uint32_t, (uint32_t))
87 88
88 #define PAS_OP(pfx) \ 89 #define PAS_OP(pfx) \
89 DEF_HELPER_1_3(pfx ## add8, uint32_t, (uint32_t, uint32_t, uint32_t *)) \ 90 DEF_HELPER_1_3(pfx ## add8, uint32_t, (uint32_t, uint32_t, uint32_t *)) \
@@ -208,6 +209,10 @@ DEF_HELPER_1_2(rsqrte_f32, float32, (float32, CPUState *)) @@ -208,6 +209,10 @@ DEF_HELPER_1_2(rsqrte_f32, float32, (float32, CPUState *))
208 DEF_HELPER_1_2(recpe_u32, uint32_t, (uint32_t, CPUState *)) 209 DEF_HELPER_1_2(recpe_u32, uint32_t, (uint32_t, CPUState *))
209 DEF_HELPER_1_2(rsqrte_u32, uint32_t, (uint32_t, CPUState *)) 210 DEF_HELPER_1_2(rsqrte_u32, uint32_t, (uint32_t, CPUState *))
210 DEF_HELPER_1_4(neon_tbl, uint32_t, (uint32_t, uint32_t, uint32_t, uint32_t)) 211 DEF_HELPER_1_4(neon_tbl, uint32_t, (uint32_t, uint32_t, uint32_t, uint32_t))
  212 +DEF_HELPER_1_2(neon_add_saturate_u64, uint64_t, (uint64_t, uint64_t))
  213 +DEF_HELPER_1_2(neon_add_saturate_s64, uint64_t, (uint64_t, uint64_t))
  214 +DEF_HELPER_1_2(neon_sub_saturate_u64, uint64_t, (uint64_t, uint64_t))
  215 +DEF_HELPER_1_2(neon_sub_saturate_s64, uint64_t, (uint64_t, uint64_t))
211 216
212 DEF_HELPER_1_2(add_cc, uint32_t, (uint32_t, uint32_t)) 217 DEF_HELPER_1_2(add_cc, uint32_t, (uint32_t, uint32_t))
213 DEF_HELPER_1_2(adc_cc, uint32_t, (uint32_t, uint32_t)) 218 DEF_HELPER_1_2(adc_cc, uint32_t, (uint32_t, uint32_t))
@@ -223,6 +228,209 @@ DEF_HELPER_1_2(shr_cc, uint32_t, (uint32_t, uint32_t)) @@ -223,6 +228,209 @@ DEF_HELPER_1_2(shr_cc, uint32_t, (uint32_t, uint32_t))
223 DEF_HELPER_1_2(sar_cc, uint32_t, (uint32_t, uint32_t)) 228 DEF_HELPER_1_2(sar_cc, uint32_t, (uint32_t, uint32_t))
224 DEF_HELPER_1_2(ror_cc, uint32_t, (uint32_t, uint32_t)) 229 DEF_HELPER_1_2(ror_cc, uint32_t, (uint32_t, uint32_t))
225 230
  231 +/* neon_helper.c */
  232 +DEF_HELPER_1_3(neon_qadd_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  233 +DEF_HELPER_1_3(neon_qadd_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  234 +DEF_HELPER_1_3(neon_qadd_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  235 +DEF_HELPER_1_3(neon_qadd_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  236 +DEF_HELPER_1_3(neon_qsub_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  237 +DEF_HELPER_1_3(neon_qsub_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  238 +DEF_HELPER_1_3(neon_qsub_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  239 +DEF_HELPER_1_3(neon_qsub_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  240 +
  241 +DEF_HELPER_1_2(neon_hadd_s8, uint32_t, (uint32_t, uint32_t))
  242 +DEF_HELPER_1_2(neon_hadd_u8, uint32_t, (uint32_t, uint32_t))
  243 +DEF_HELPER_1_2(neon_hadd_s16, uint32_t, (uint32_t, uint32_t))
  244 +DEF_HELPER_1_2(neon_hadd_u16, uint32_t, (uint32_t, uint32_t))
  245 +DEF_HELPER_1_2(neon_hadd_s32, int32_t, (int32_t, int32_t))
  246 +DEF_HELPER_1_2(neon_hadd_u32, uint32_t, (uint32_t, uint32_t))
  247 +DEF_HELPER_1_2(neon_rhadd_s8, uint32_t, (uint32_t, uint32_t))
  248 +DEF_HELPER_1_2(neon_rhadd_u8, uint32_t, (uint32_t, uint32_t))
  249 +DEF_HELPER_1_2(neon_rhadd_s16, uint32_t, (uint32_t, uint32_t))
  250 +DEF_HELPER_1_2(neon_rhadd_u16, uint32_t, (uint32_t, uint32_t))
  251 +DEF_HELPER_1_2(neon_rhadd_s32, int32_t, (int32_t, int32_t))
  252 +DEF_HELPER_1_2(neon_rhadd_u32, uint32_t, (uint32_t, uint32_t))
  253 +DEF_HELPER_1_2(neon_hsub_s8, uint32_t, (uint32_t, uint32_t))
  254 +DEF_HELPER_1_2(neon_hsub_u8, uint32_t, (uint32_t, uint32_t))
  255 +DEF_HELPER_1_2(neon_hsub_s16, uint32_t, (uint32_t, uint32_t))
  256 +DEF_HELPER_1_2(neon_hsub_u16, uint32_t, (uint32_t, uint32_t))
  257 +DEF_HELPER_1_2(neon_hsub_s32, int32_t, (int32_t, int32_t))
  258 +DEF_HELPER_1_2(neon_hsub_u32, uint32_t, (uint32_t, uint32_t))
  259 +
  260 +DEF_HELPER_1_2(neon_cgt_u8, uint32_t, (uint32_t, uint32_t))
  261 +DEF_HELPER_1_2(neon_cgt_s8, uint32_t, (uint32_t, uint32_t))
  262 +DEF_HELPER_1_2(neon_cgt_u16, uint32_t, (uint32_t, uint32_t))
  263 +DEF_HELPER_1_2(neon_cgt_s16, uint32_t, (uint32_t, uint32_t))
  264 +DEF_HELPER_1_2(neon_cgt_u32, uint32_t, (uint32_t, uint32_t))
  265 +DEF_HELPER_1_2(neon_cgt_s32, uint32_t, (uint32_t, uint32_t))
  266 +DEF_HELPER_1_2(neon_cge_u8, uint32_t, (uint32_t, uint32_t))
  267 +DEF_HELPER_1_2(neon_cge_s8, uint32_t, (uint32_t, uint32_t))
  268 +DEF_HELPER_1_2(neon_cge_u16, uint32_t, (uint32_t, uint32_t))
  269 +DEF_HELPER_1_2(neon_cge_s16, uint32_t, (uint32_t, uint32_t))
  270 +DEF_HELPER_1_2(neon_cge_u32, uint32_t, (uint32_t, uint32_t))
  271 +DEF_HELPER_1_2(neon_cge_s32, uint32_t, (uint32_t, uint32_t))
  272 +
  273 +DEF_HELPER_1_2(neon_min_u8, uint32_t, (uint32_t, uint32_t))
  274 +DEF_HELPER_1_2(neon_min_s8, uint32_t, (uint32_t, uint32_t))
  275 +DEF_HELPER_1_2(neon_min_u16, uint32_t, (uint32_t, uint32_t))
  276 +DEF_HELPER_1_2(neon_min_s16, uint32_t, (uint32_t, uint32_t))
  277 +DEF_HELPER_1_2(neon_min_u32, uint32_t, (uint32_t, uint32_t))
  278 +DEF_HELPER_1_2(neon_min_s32, uint32_t, (uint32_t, uint32_t))
  279 +DEF_HELPER_1_2(neon_max_u8, uint32_t, (uint32_t, uint32_t))
  280 +DEF_HELPER_1_2(neon_max_s8, uint32_t, (uint32_t, uint32_t))
  281 +DEF_HELPER_1_2(neon_max_u16, uint32_t, (uint32_t, uint32_t))
  282 +DEF_HELPER_1_2(neon_max_s16, uint32_t, (uint32_t, uint32_t))
  283 +DEF_HELPER_1_2(neon_max_u32, uint32_t, (uint32_t, uint32_t))
  284 +DEF_HELPER_1_2(neon_max_s32, uint32_t, (uint32_t, uint32_t))
  285 +DEF_HELPER_1_2(neon_pmin_u8, uint32_t, (uint32_t, uint32_t))
  286 +DEF_HELPER_1_2(neon_pmin_s8, uint32_t, (uint32_t, uint32_t))
  287 +DEF_HELPER_1_2(neon_pmin_u16, uint32_t, (uint32_t, uint32_t))
  288 +DEF_HELPER_1_2(neon_pmin_s16, uint32_t, (uint32_t, uint32_t))
  289 +DEF_HELPER_1_2(neon_pmin_u32, uint32_t, (uint32_t, uint32_t))
  290 +DEF_HELPER_1_2(neon_pmin_s32, uint32_t, (uint32_t, uint32_t))
  291 +DEF_HELPER_1_2(neon_pmax_u8, uint32_t, (uint32_t, uint32_t))
  292 +DEF_HELPER_1_2(neon_pmax_s8, uint32_t, (uint32_t, uint32_t))
  293 +DEF_HELPER_1_2(neon_pmax_u16, uint32_t, (uint32_t, uint32_t))
  294 +DEF_HELPER_1_2(neon_pmax_s16, uint32_t, (uint32_t, uint32_t))
  295 +DEF_HELPER_1_2(neon_pmax_u32, uint32_t, (uint32_t, uint32_t))
  296 +DEF_HELPER_1_2(neon_pmax_s32, uint32_t, (uint32_t, uint32_t))
  297 +
  298 +DEF_HELPER_1_2(neon_abd_u8, uint32_t, (uint32_t, uint32_t))
  299 +DEF_HELPER_1_2(neon_abd_s8, uint32_t, (uint32_t, uint32_t))
  300 +DEF_HELPER_1_2(neon_abd_u16, uint32_t, (uint32_t, uint32_t))
  301 +DEF_HELPER_1_2(neon_abd_s16, uint32_t, (uint32_t, uint32_t))
  302 +DEF_HELPER_1_2(neon_abd_u32, uint32_t, (uint32_t, uint32_t))
  303 +DEF_HELPER_1_2(neon_abd_s32, uint32_t, (uint32_t, uint32_t))
  304 +
  305 +DEF_HELPER_1_2(neon_shl_u8, uint32_t, (uint32_t, uint32_t))
  306 +DEF_HELPER_1_2(neon_shl_s8, uint32_t, (uint32_t, uint32_t))
  307 +DEF_HELPER_1_2(neon_shl_u16, uint32_t, (uint32_t, uint32_t))
  308 +DEF_HELPER_1_2(neon_shl_s16, uint32_t, (uint32_t, uint32_t))
  309 +DEF_HELPER_1_2(neon_shl_u32, uint32_t, (uint32_t, uint32_t))
  310 +DEF_HELPER_1_2(neon_shl_s32, uint32_t, (uint32_t, uint32_t))
  311 +DEF_HELPER_1_2(neon_shl_u64, uint64_t, (uint64_t, uint64_t))
  312 +DEF_HELPER_1_2(neon_shl_s64, uint64_t, (uint64_t, uint64_t))
  313 +DEF_HELPER_1_2(neon_rshl_u8, uint32_t, (uint32_t, uint32_t))
  314 +DEF_HELPER_1_2(neon_rshl_s8, uint32_t, (uint32_t, uint32_t))
  315 +DEF_HELPER_1_2(neon_rshl_u16, uint32_t, (uint32_t, uint32_t))
  316 +DEF_HELPER_1_2(neon_rshl_s16, uint32_t, (uint32_t, uint32_t))
  317 +DEF_HELPER_1_2(neon_rshl_u32, uint32_t, (uint32_t, uint32_t))
  318 +DEF_HELPER_1_2(neon_rshl_s32, uint32_t, (uint32_t, uint32_t))
  319 +DEF_HELPER_1_2(neon_rshl_u64, uint64_t, (uint64_t, uint64_t))
  320 +DEF_HELPER_1_2(neon_rshl_s64, uint64_t, (uint64_t, uint64_t))
  321 +DEF_HELPER_1_3(neon_qshl_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  322 +DEF_HELPER_1_3(neon_qshl_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  323 +DEF_HELPER_1_3(neon_qshl_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  324 +DEF_HELPER_1_3(neon_qshl_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  325 +DEF_HELPER_1_3(neon_qshl_u32, uint32_t, (CPUState *, uint32_t, uint32_t))
  326 +DEF_HELPER_1_3(neon_qshl_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  327 +DEF_HELPER_1_3(neon_qshl_u64, uint64_t, (CPUState *, uint64_t, uint64_t))
  328 +DEF_HELPER_1_3(neon_qshl_s64, uint64_t, (CPUState *, uint64_t, uint64_t))
  329 +DEF_HELPER_1_3(neon_qrshl_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  330 +DEF_HELPER_1_3(neon_qrshl_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  331 +DEF_HELPER_1_3(neon_qrshl_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  332 +DEF_HELPER_1_3(neon_qrshl_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  333 +DEF_HELPER_1_3(neon_qrshl_u32, uint32_t, (CPUState *, uint32_t, uint32_t))
  334 +DEF_HELPER_1_3(neon_qrshl_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  335 +DEF_HELPER_1_3(neon_qrshl_u64, uint64_t, (CPUState *, uint64_t, uint64_t))
  336 +DEF_HELPER_1_3(neon_qrshl_s64, uint64_t, (CPUState *, uint64_t, uint64_t))
  337 +
  338 +DEF_HELPER_1_2(neon_add_u8, uint32_t, (uint32_t, uint32_t))
  339 +DEF_HELPER_1_2(neon_add_u16, uint32_t, (uint32_t, uint32_t))
  340 +DEF_HELPER_1_2(neon_padd_u8, uint32_t, (uint32_t, uint32_t))
  341 +DEF_HELPER_1_2(neon_padd_u16, uint32_t, (uint32_t, uint32_t))
  342 +DEF_HELPER_1_2(neon_sub_u8, uint32_t, (uint32_t, uint32_t))
  343 +DEF_HELPER_1_2(neon_sub_u16, uint32_t, (uint32_t, uint32_t))
  344 +DEF_HELPER_1_2(neon_mul_u8, uint32_t, (uint32_t, uint32_t))
  345 +DEF_HELPER_1_2(neon_mul_u16, uint32_t, (uint32_t, uint32_t))
  346 +DEF_HELPER_1_2(neon_mul_p8, uint32_t, (uint32_t, uint32_t))
  347 +
  348 +DEF_HELPER_1_2(neon_tst_u8, uint32_t, (uint32_t, uint32_t))
  349 +DEF_HELPER_1_2(neon_tst_u16, uint32_t, (uint32_t, uint32_t))
  350 +DEF_HELPER_1_2(neon_tst_u32, uint32_t, (uint32_t, uint32_t))
  351 +DEF_HELPER_1_2(neon_ceq_u8, uint32_t, (uint32_t, uint32_t))
  352 +DEF_HELPER_1_2(neon_ceq_u16, uint32_t, (uint32_t, uint32_t))
  353 +DEF_HELPER_1_2(neon_ceq_u32, uint32_t, (uint32_t, uint32_t))
  354 +
  355 +DEF_HELPER_1_1(neon_abs_s8, uint32_t, (uint32_t))
  356 +DEF_HELPER_1_1(neon_abs_s16, uint32_t, (uint32_t))
  357 +DEF_HELPER_1_1(neon_clz_u8, uint32_t, (uint32_t))
  358 +DEF_HELPER_1_1(neon_clz_u16, uint32_t, (uint32_t))
  359 +DEF_HELPER_1_1(neon_cls_s8, uint32_t, (uint32_t))
  360 +DEF_HELPER_1_1(neon_cls_s16, uint32_t, (uint32_t))
  361 +DEF_HELPER_1_1(neon_cls_s32, uint32_t, (uint32_t))
  362 +DEF_HELPER_1_1(neon_cnt_u8, uint32_t, (uint32_t))
  363 +
  364 +DEF_HELPER_1_3(neon_qdmulh_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  365 +DEF_HELPER_1_3(neon_qrdmulh_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  366 +DEF_HELPER_1_3(neon_qdmulh_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  367 +DEF_HELPER_1_3(neon_qrdmulh_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  368 +
  369 +DEF_HELPER_1_1(neon_narrow_u8, uint32_t, (uint64_t))
  370 +DEF_HELPER_1_1(neon_narrow_u16, uint32_t, (uint64_t))
  371 +DEF_HELPER_1_2(neon_narrow_sat_u8, uint32_t, (CPUState *, uint64_t))
  372 +DEF_HELPER_1_2(neon_narrow_sat_s8, uint32_t, (CPUState *, uint64_t))
  373 +DEF_HELPER_1_2(neon_narrow_sat_u16, uint32_t, (CPUState *, uint64_t))
  374 +DEF_HELPER_1_2(neon_narrow_sat_s16, uint32_t, (CPUState *, uint64_t))
  375 +DEF_HELPER_1_2(neon_narrow_sat_u32, uint32_t, (CPUState *, uint64_t))
  376 +DEF_HELPER_1_2(neon_narrow_sat_s32, uint32_t, (CPUState *, uint64_t))
  377 +DEF_HELPER_1_1(neon_narrow_high_u8, uint32_t, (uint64_t))
  378 +DEF_HELPER_1_1(neon_narrow_high_u16, uint32_t, (uint64_t))
  379 +DEF_HELPER_1_1(neon_narrow_round_high_u8, uint32_t, (uint64_t))
  380 +DEF_HELPER_1_1(neon_narrow_round_high_u16, uint32_t, (uint64_t))
  381 +DEF_HELPER_1_1(neon_widen_u8, uint64_t, (uint32_t))
  382 +DEF_HELPER_1_1(neon_widen_s8, uint64_t, (uint32_t))
  383 +DEF_HELPER_1_1(neon_widen_u16, uint64_t, (uint32_t))
  384 +DEF_HELPER_1_1(neon_widen_s16, uint64_t, (uint32_t))
  385 +
  386 +DEF_HELPER_1_2(neon_addl_u16, uint64_t, (uint64_t, uint64_t))
  387 +DEF_HELPER_1_2(neon_addl_u32, uint64_t, (uint64_t, uint64_t))
  388 +DEF_HELPER_1_2(neon_paddl_u16, uint64_t, (uint64_t, uint64_t))
  389 +DEF_HELPER_1_2(neon_paddl_u32, uint64_t, (uint64_t, uint64_t))
  390 +DEF_HELPER_1_2(neon_subl_u16, uint64_t, (uint64_t, uint64_t))
  391 +DEF_HELPER_1_2(neon_subl_u32, uint64_t, (uint64_t, uint64_t))
  392 +DEF_HELPER_1_3(neon_addl_saturate_s32, uint64_t, (CPUState *, uint64_t, uint64_t))
  393 +DEF_HELPER_1_3(neon_addl_saturate_s64, uint64_t, (CPUState *, uint64_t, uint64_t))
  394 +DEF_HELPER_1_2(neon_abdl_u16, uint64_t, (uint32_t, uint32_t))
  395 +DEF_HELPER_1_2(neon_abdl_s16, uint64_t, (uint32_t, uint32_t))
  396 +DEF_HELPER_1_2(neon_abdl_u32, uint64_t, (uint32_t, uint32_t))
  397 +DEF_HELPER_1_2(neon_abdl_s32, uint64_t, (uint32_t, uint32_t))
  398 +DEF_HELPER_1_2(neon_abdl_u64, uint64_t, (uint32_t, uint32_t))
  399 +DEF_HELPER_1_2(neon_abdl_s64, uint64_t, (uint32_t, uint32_t))
  400 +DEF_HELPER_1_2(neon_mull_u8, uint64_t, (uint32_t, uint32_t))
  401 +DEF_HELPER_1_2(neon_mull_s8, uint64_t, (uint32_t, uint32_t))
  402 +DEF_HELPER_1_2(neon_mull_u16, uint64_t, (uint32_t, uint32_t))
  403 +DEF_HELPER_1_2(neon_mull_s16, uint64_t, (uint32_t, uint32_t))
  404 +
  405 +DEF_HELPER_1_1(neon_negl_u16, uint64_t, (uint64_t))
  406 +DEF_HELPER_1_1(neon_negl_u32, uint64_t, (uint64_t))
  407 +DEF_HELPER_1_1(neon_negl_u64, uint64_t, (uint64_t))
  408 +
  409 +DEF_HELPER_1_2(neon_qabs_s8, uint32_t, (CPUState *, uint32_t))
  410 +DEF_HELPER_1_2(neon_qabs_s16, uint32_t, (CPUState *, uint32_t))
  411 +DEF_HELPER_1_2(neon_qabs_s32, uint32_t, (CPUState *, uint32_t))
  412 +DEF_HELPER_1_2(neon_qneg_s8, uint32_t, (CPUState *, uint32_t))
  413 +DEF_HELPER_1_2(neon_qneg_s16, uint32_t, (CPUState *, uint32_t))
  414 +DEF_HELPER_1_2(neon_qneg_s32, uint32_t, (CPUState *, uint32_t))
  415 +
  416 +DEF_HELPER_0_0(neon_trn_u8, void, (void))
  417 +DEF_HELPER_0_0(neon_trn_u16, void, (void))
  418 +DEF_HELPER_0_0(neon_unzip_u8, void, (void))
  419 +DEF_HELPER_0_0(neon_zip_u8, void, (void))
  420 +DEF_HELPER_0_0(neon_zip_u16, void, (void))
  421 +
  422 +DEF_HELPER_1_2(neon_min_f32, uint32_t, (uint32_t, uint32_t))
  423 +DEF_HELPER_1_2(neon_max_f32, uint32_t, (uint32_t, uint32_t))
  424 +DEF_HELPER_1_2(neon_abd_f32, uint32_t, (uint32_t, uint32_t))
  425 +DEF_HELPER_1_2(neon_add_f32, uint32_t, (uint32_t, uint32_t))
  426 +DEF_HELPER_1_2(neon_sub_f32, uint32_t, (uint32_t, uint32_t))
  427 +DEF_HELPER_1_2(neon_mul_f32, uint32_t, (uint32_t, uint32_t))
  428 +DEF_HELPER_1_2(neon_ceq_f32, uint32_t, (uint32_t, uint32_t))
  429 +DEF_HELPER_1_2(neon_cge_f32, uint32_t, (uint32_t, uint32_t))
  430 +DEF_HELPER_1_2(neon_cgt_f32, uint32_t, (uint32_t, uint32_t))
  431 +DEF_HELPER_1_2(neon_acge_f32, uint32_t, (uint32_t, uint32_t))
  432 +DEF_HELPER_1_2(neon_acgt_f32, uint32_t, (uint32_t, uint32_t))
  433 +
226 #undef DEF_HELPER 434 #undef DEF_HELPER
227 #undef DEF_HELPER_0_0 435 #undef DEF_HELPER_0_0
228 #undef DEF_HELPER_0_1 436 #undef DEF_HELPER_0_1
target-arm/neon_helper.c 0 โ†’ 100644
  1 +#include <stdlib.h>
  2 +#include <stdio.h>
  3 +
  4 +#include "cpu.h"
  5 +#include "exec-all.h"
  6 +#include "helpers.h"
  7 +
  8 +#define SIGNBIT (uint32_t)0x80000000
  9 +#define SIGNBIT64 ((uint64_t)1 << 63)
  10 +
  11 +#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] = CPSR_Q
  12 +
  13 +static float_status neon_float_status;
  14 +#define NFS &neon_float_status
  15 +
  16 +/* Helper routines to perform bitwise copies between float and int. */
  17 +static inline float32 vfp_itos(uint32_t i)
  18 +{
  19 + union {
  20 + uint32_t i;
  21 + float32 s;
  22 + } v;
  23 +
  24 + v.i = i;
  25 + return v.s;
  26 +}
  27 +
  28 +static inline uint32_t vfp_stoi(float32 s)
  29 +{
  30 + union {
  31 + uint32_t i;
  32 + float32 s;
  33 + } v;
  34 +
  35 + v.s = s;
  36 + return v.i;
  37 +}
  38 +
  39 +#define NEON_TYPE1(name, type) \
  40 +typedef struct \
  41 +{ \
  42 + type v1; \
  43 +} neon_##name;
  44 +#ifdef WORDS_BIGENDIAN
  45 +#define NEON_TYPE2(name, type) \
  46 +typedef struct \
  47 +{ \
  48 + type v2; \
  49 + type v1; \
  50 +} neon_##name;
  51 +#define NEON_TYPE4(name, type) \
  52 +typedef struct \
  53 +{ \
  54 + type v4; \
  55 + type v3; \
  56 + type v2; \
  57 + type v1; \
  58 +} neon_##name;
  59 +#else
  60 +#define NEON_TYPE2(name, type) \
  61 +typedef struct \
  62 +{ \
  63 + type v1; \
  64 + type v2; \
  65 +} neon_##name;
  66 +#define NEON_TYPE4(name, type) \
  67 +typedef struct \
  68 +{ \
  69 + type v1; \
  70 + type v2; \
  71 + type v3; \
  72 + type v4; \
  73 +} neon_##name;
  74 +#endif
  75 +
  76 +NEON_TYPE4(s8, int8_t)
  77 +NEON_TYPE4(u8, uint8_t)
  78 +NEON_TYPE2(s16, int16_t)
  79 +NEON_TYPE2(u16, uint16_t)
  80 +NEON_TYPE1(s32, int32_t)
  81 +NEON_TYPE1(u32, uint32_t)
  82 +#undef NEON_TYPE4
  83 +#undef NEON_TYPE2
  84 +#undef NEON_TYPE1
  85 +
  86 +/* Copy from a uint32_t to a vector structure type. */
  87 +#define NEON_UNPACK(vtype, dest, val) do { \
  88 + union { \
  89 + vtype v; \
  90 + uint32_t i; \
  91 + } conv_u; \
  92 + conv_u.i = (val); \
  93 + dest = conv_u.v; \
  94 + } while(0)
  95 +
  96 +/* Copy from a vector structure type to a uint32_t. */
  97 +#define NEON_PACK(vtype, dest, val) do { \
  98 + union { \
  99 + vtype v; \
  100 + uint32_t i; \
  101 + } conv_u; \
  102 + conv_u.v = (val); \
  103 + dest = conv_u.i; \
  104 + } while(0)
  105 +
  106 +#define NEON_DO1 \
  107 + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
  108 +#define NEON_DO2 \
  109 + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
  110 + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
  111 +#define NEON_DO4 \
  112 + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
  113 + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
  114 + NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
  115 + NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
  116 +
  117 +#define NEON_VOP_BODY(vtype, n) \
  118 +{ \
  119 + uint32_t res; \
  120 + vtype vsrc1; \
  121 + vtype vsrc2; \
  122 + vtype vdest; \
  123 + NEON_UNPACK(vtype, vsrc1, arg1); \
  124 + NEON_UNPACK(vtype, vsrc2, arg2); \
  125 + NEON_DO##n; \
  126 + NEON_PACK(vtype, res, vdest); \
  127 + return res; \
  128 +}
  129 +
  130 +#define NEON_VOP(name, vtype, n) \
  131 +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
  132 +NEON_VOP_BODY(vtype, n)
  133 +
  134 +#define NEON_VOP_ENV(name, vtype, n) \
  135 +uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \
  136 +NEON_VOP_BODY(vtype, n)
  137 +
  138 +/* Pairwise operations. */
  139 +/* For 32-bit elements each segment only contains a single element, so
  140 + the elementwise and pairwise operations are the same. */
  141 +#define NEON_PDO2 \
  142 + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
  143 + NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
  144 +#define NEON_PDO4 \
  145 + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
  146 + NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
  147 + NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
  148 + NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
  149 +
  150 +#define NEON_POP(name, vtype, n) \
  151 +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
  152 +{ \
  153 + uint32_t res; \
  154 + vtype vsrc1; \
  155 + vtype vsrc2; \
  156 + vtype vdest; \
  157 + NEON_UNPACK(vtype, vsrc1, arg1); \
  158 + NEON_UNPACK(vtype, vsrc2, arg2); \
  159 + NEON_PDO##n; \
  160 + NEON_PACK(vtype, res, vdest); \
  161 + return res; \
  162 +}
  163 +
  164 +/* Unary operators. */
  165 +#define NEON_VOP1(name, vtype, n) \
  166 +uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
  167 +{ \
  168 + vtype vsrc1; \
  169 + vtype vdest; \
  170 + NEON_UNPACK(vtype, vsrc1, arg); \
  171 + NEON_DO##n; \
  172 + NEON_PACK(vtype, arg, vdest); \
  173 + return arg; \
  174 +}
  175 +
  176 +
  177 +#define NEON_USAT(dest, src1, src2, type) do { \
  178 + uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
  179 + if (tmp != (type)tmp) { \
  180 + SET_QC(); \
  181 + dest = ~0; \
  182 + } else { \
  183 + dest = tmp; \
  184 + }} while(0)
  185 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
  186 +NEON_VOP_ENV(qadd_u8, neon_u8, 4)
  187 +#undef NEON_FN
  188 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
  189 +NEON_VOP_ENV(qadd_u16, neon_u16, 2)
  190 +#undef NEON_FN
  191 +#undef NEON_USAT
  192 +
  193 +#define NEON_SSAT(dest, src1, src2, type) do { \
  194 + int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
  195 + if (tmp != (type)tmp) { \
  196 + SET_QC(); \
  197 + if (src2 > 0) { \
  198 + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
  199 + } else { \
  200 + tmp = 1 << (sizeof(type) * 8 - 1); \
  201 + } \
  202 + } \
  203 + dest = tmp; \
  204 + } while(0)
  205 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
  206 +NEON_VOP_ENV(qadd_s8, neon_s8, 4)
  207 +#undef NEON_FN
  208 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
  209 +NEON_VOP_ENV(qadd_s16, neon_s16, 2)
  210 +#undef NEON_FN
  211 +#undef NEON_SSAT
  212 +
  213 +#define NEON_USAT(dest, src1, src2, type) do { \
  214 + uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
  215 + if (tmp != (type)tmp) { \
  216 + SET_QC(); \
  217 + dest = 0; \
  218 + } else { \
  219 + dest = tmp; \
  220 + }} while(0)
  221 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
  222 +NEON_VOP_ENV(qsub_u8, neon_u8, 4)
  223 +#undef NEON_FN
  224 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
  225 +NEON_VOP_ENV(qsub_u16, neon_u16, 2)
  226 +#undef NEON_FN
  227 +#undef NEON_USAT
  228 +
  229 +#define NEON_SSAT(dest, src1, src2, type) do { \
  230 + int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
  231 + if (tmp != (type)tmp) { \
  232 + SET_QC(); \
  233 + if (src2 < 0) { \
  234 + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
  235 + } else { \
  236 + tmp = 1 << (sizeof(type) * 8 - 1); \
  237 + } \
  238 + } \
  239 + dest = tmp; \
  240 + } while(0)
  241 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
  242 +NEON_VOP_ENV(qsub_s8, neon_s8, 4)
  243 +#undef NEON_FN
  244 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
  245 +NEON_VOP_ENV(qsub_s16, neon_s16, 2)
  246 +#undef NEON_FN
  247 +#undef NEON_SSAT
  248 +
  249 +#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
  250 +NEON_VOP(hadd_s8, neon_s8, 4)
  251 +NEON_VOP(hadd_u8, neon_u8, 4)
  252 +NEON_VOP(hadd_s16, neon_s16, 2)
  253 +NEON_VOP(hadd_u16, neon_u16, 2)
  254 +#undef NEON_FN
  255 +
  256 +int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
  257 +{
  258 + int32_t dest;
  259 +
  260 + dest = (src1 >> 1) + (src2 >> 1);
  261 + if (src1 & src2 & 1)
  262 + dest++;
  263 + return dest;
  264 +}
  265 +
  266 +uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
  267 +{
  268 + uint32_t dest;
  269 +
  270 + dest = (src1 >> 1) + (src2 >> 1);
  271 + if (src1 & src2 & 1)
  272 + dest++;
  273 + return dest;
  274 +}
  275 +
  276 +#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
  277 +NEON_VOP(rhadd_s8, neon_s8, 4)
  278 +NEON_VOP(rhadd_u8, neon_u8, 4)
  279 +NEON_VOP(rhadd_s16, neon_s16, 2)
  280 +NEON_VOP(rhadd_u16, neon_u16, 2)
  281 +#undef NEON_FN
  282 +
  283 +int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
  284 +{
  285 + int32_t dest;
  286 +
  287 + dest = (src1 >> 1) + (src2 >> 1);
  288 + if ((src1 | src2) & 1)
  289 + dest++;
  290 + return dest;
  291 +}
  292 +
  293 +uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
  294 +{
  295 + uint32_t dest;
  296 +
  297 + dest = (src1 >> 1) + (src2 >> 1);
  298 + if ((src1 | src2) & 1)
  299 + dest++;
  300 + return dest;
  301 +}
  302 +
  303 +#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
  304 +NEON_VOP(hsub_s8, neon_s8, 4)
  305 +NEON_VOP(hsub_u8, neon_u8, 4)
  306 +NEON_VOP(hsub_s16, neon_s16, 2)
  307 +NEON_VOP(hsub_u16, neon_u16, 2)
  308 +#undef NEON_FN
  309 +
  310 +int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
  311 +{
  312 + int32_t dest;
  313 +
  314 + dest = (src1 >> 1) - (src2 >> 1);
  315 + if ((~src1) & src2 & 1)
  316 + dest--;
  317 + return dest;
  318 +}
  319 +
  320 +uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
  321 +{
  322 + uint32_t dest;
  323 +
  324 + dest = (src1 >> 1) - (src2 >> 1);
  325 + if ((~src1) & src2 & 1)
  326 + dest--;
  327 + return dest;
  328 +}
  329 +
  330 +#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
  331 +NEON_VOP(cgt_s8, neon_s8, 4)
  332 +NEON_VOP(cgt_u8, neon_u8, 4)
  333 +NEON_VOP(cgt_s16, neon_s16, 2)
  334 +NEON_VOP(cgt_u16, neon_u16, 2)
  335 +NEON_VOP(cgt_s32, neon_s32, 1)
  336 +NEON_VOP(cgt_u32, neon_u32, 1)
  337 +#undef NEON_FN
  338 +
  339 +#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
  340 +NEON_VOP(cge_s8, neon_s8, 4)
  341 +NEON_VOP(cge_u8, neon_u8, 4)
  342 +NEON_VOP(cge_s16, neon_s16, 2)
  343 +NEON_VOP(cge_u16, neon_u16, 2)
  344 +NEON_VOP(cge_s32, neon_s32, 1)
  345 +NEON_VOP(cge_u32, neon_u32, 1)
  346 +#undef NEON_FN
  347 +
  348 +#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
  349 +NEON_VOP(min_s8, neon_s8, 4)
  350 +NEON_VOP(min_u8, neon_u8, 4)
  351 +NEON_VOP(min_s16, neon_s16, 2)
  352 +NEON_VOP(min_u16, neon_u16, 2)
  353 +NEON_VOP(min_s32, neon_s32, 1)
  354 +NEON_VOP(min_u32, neon_u32, 1)
  355 +NEON_POP(pmin_s8, neon_s8, 4)
  356 +NEON_POP(pmin_u8, neon_u8, 4)
  357 +NEON_POP(pmin_s16, neon_s16, 2)
  358 +NEON_POP(pmin_u16, neon_u16, 2)
  359 +#undef NEON_FN
  360 +
  361 +#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
  362 +NEON_VOP(max_s8, neon_s8, 4)
  363 +NEON_VOP(max_u8, neon_u8, 4)
  364 +NEON_VOP(max_s16, neon_s16, 2)
  365 +NEON_VOP(max_u16, neon_u16, 2)
  366 +NEON_VOP(max_s32, neon_s32, 1)
  367 +NEON_VOP(max_u32, neon_u32, 1)
  368 +NEON_POP(pmax_s8, neon_s8, 4)
  369 +NEON_POP(pmax_u8, neon_u8, 4)
  370 +NEON_POP(pmax_s16, neon_s16, 2)
  371 +NEON_POP(pmax_u16, neon_u16, 2)
  372 +#undef NEON_FN
  373 +
  374 +#define NEON_FN(dest, src1, src2) \
  375 + dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
  376 +NEON_VOP(abd_s8, neon_s8, 4)
  377 +NEON_VOP(abd_u8, neon_u8, 4)
  378 +NEON_VOP(abd_s16, neon_s16, 2)
  379 +NEON_VOP(abd_u16, neon_u16, 2)
  380 +NEON_VOP(abd_s32, neon_s32, 1)
  381 +NEON_VOP(abd_u32, neon_u32, 1)
  382 +#undef NEON_FN
  383 +
  384 +#define NEON_FN(dest, src1, src2) do { \
  385 + int8_t tmp; \
  386 + tmp = (int8_t)src2; \
  387 + if (tmp >= sizeof(src1) * 8 || tmp <= -sizeof(src1) * 8) { \
  388 + dest = 0; \
  389 + } else if (tmp < 0) { \
  390 + dest = src1 >> -tmp; \
  391 + } else { \
  392 + dest = src1 << tmp; \
  393 + }} while (0)
  394 +NEON_VOP(shl_u8, neon_u8, 4)
  395 +NEON_VOP(shl_u16, neon_u16, 2)
  396 +NEON_VOP(shl_u32, neon_u32, 1)
  397 +#undef NEON_FN
  398 +
  399 +uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
  400 +{
  401 + int8_t shift = (int8_t)shiftop;
  402 + if (shift >= 64 || shift <= -64) {
  403 + val = 0;
  404 + } else if (shift < 0) {
  405 + val >>= -shift;
  406 + } else {
  407 + val <<= shift;
  408 + }
  409 + return val;
  410 +}
  411 +
  412 +#define NEON_FN(dest, src1, src2) do { \
  413 + int8_t tmp; \
  414 + tmp = (int8_t)src2; \
  415 + if (tmp >= sizeof(src1) * 8) { \
  416 + dest = 0; \
  417 + } else if (tmp <= -sizeof(src1) * 8) { \
  418 + dest = src1 >> (sizeof(src1) * 8 - 1); \
  419 + } else if (tmp < 0) { \
  420 + dest = src1 >> -tmp; \
  421 + } else { \
  422 + dest = src1 << tmp; \
  423 + }} while (0)
  424 +NEON_VOP(shl_s8, neon_s8, 4)
  425 +NEON_VOP(shl_s16, neon_s16, 2)
  426 +NEON_VOP(shl_s32, neon_s32, 1)
  427 +#undef NEON_FN
  428 +
  429 +uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
  430 +{
  431 + int8_t shift = (int8_t)shiftop;
  432 + int64_t val = valop;
  433 + if (shift >= 64) {
  434 + val = 0;
  435 + } else if (shift <= -64) {
  436 + val >>= 63;
  437 + } else if (shift < 0) {
  438 + val >>= -shift;
  439 + } else {
  440 + val <<= shift;
  441 + }
  442 + return val;
  443 +}
  444 +
  445 +#define NEON_FN(dest, src1, src2) do { \
  446 + int8_t tmp; \
  447 + tmp = (int8_t)src2; \
  448 + if (tmp >= sizeof(src1) * 8) { \
  449 + dest = 0; \
  450 + } else if (tmp < -sizeof(src1) * 8) { \
  451 + dest >>= sizeof(src1) * 8 - 1; \
  452 + } else if (tmp == -sizeof(src1) * 8) { \
  453 + dest = src1 >> (tmp - 1); \
  454 + dest++; \
  455 + src2 >>= 1; \
  456 + } else if (tmp < 0) { \
  457 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  458 + } else { \
  459 + dest = src1 << tmp; \
  460 + }} while (0)
  461 +NEON_VOP(rshl_s8, neon_s8, 4)
  462 +NEON_VOP(rshl_s16, neon_s16, 2)
  463 +NEON_VOP(rshl_s32, neon_s32, 1)
  464 +#undef NEON_FN
  465 +
  466 +uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
  467 +{
  468 + int8_t shift = (int8_t)shiftop;
  469 + int64_t val = valop;
  470 + if (shift >= 64) {
  471 + val = 0;
  472 + } else if (shift < -64) {
  473 + val >>= 63;
  474 + } else if (shift == -63) {
  475 + val >>= 63;
  476 + val++;
  477 + val >>= 1;
  478 + } else if (shift < 0) {
  479 + val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
  480 + } else {
  481 + val <<= shift;
  482 + }
  483 + return val;
  484 +}
  485 +
  486 +#define NEON_FN(dest, src1, src2) do { \
  487 + int8_t tmp; \
  488 + tmp = (int8_t)src2; \
  489 + if (tmp >= sizeof(src1) * 8 || tmp < -sizeof(src1) * 8) { \
  490 + dest = 0; \
  491 + } else if (tmp == -sizeof(src1) * 8) { \
  492 + dest = src1 >> (tmp - 1); \
  493 + } else if (tmp < 0) { \
  494 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  495 + } else { \
  496 + dest = src1 << tmp; \
  497 + }} while (0)
  498 +NEON_VOP(rshl_u8, neon_u8, 4)
  499 +NEON_VOP(rshl_u16, neon_u16, 2)
  500 +NEON_VOP(rshl_u32, neon_u32, 1)
  501 +#undef NEON_FN
  502 +
  503 +uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
  504 +{
  505 + int8_t shift = (uint8_t)shiftop;
  506 + if (shift >= 64 || shift < 64) {
  507 + val = 0;
  508 + } else if (shift == -64) {
  509 + /* Rounding a 1-bit result just preserves that bit. */
  510 + val >>= 63;
  511 + } if (shift < 0) {
  512 + val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
  513 + val >>= -shift;
  514 + } else {
  515 + val <<= shift;
  516 + }
  517 + return val;
  518 +}
  519 +
  520 +#define NEON_FN(dest, src1, src2) do { \
  521 + int8_t tmp; \
  522 + tmp = (int8_t)src2; \
  523 + if (tmp >= sizeof(src1) * 8) { \
  524 + if (src1) { \
  525 + SET_QC(); \
  526 + dest = ~0; \
  527 + } else { \
  528 + dest = 0; \
  529 + } \
  530 + } else if (tmp <= -sizeof(src1) * 8) { \
  531 + dest = 0; \
  532 + } else if (tmp < 0) { \
  533 + dest = src1 >> -tmp; \
  534 + } else { \
  535 + dest = src1 << tmp; \
  536 + if ((dest >> tmp) != src1) { \
  537 + SET_QC(); \
  538 + dest = ~0; \
  539 + } \
  540 + }} while (0)
  541 +NEON_VOP_ENV(qshl_u8, neon_u8, 4)
  542 +NEON_VOP_ENV(qshl_u16, neon_u16, 2)
  543 +NEON_VOP_ENV(qshl_u32, neon_u32, 1)
  544 +#undef NEON_FN
  545 +
/* Unsigned 64-bit saturating shift left: saturates to all-ones (and sets
   the sticky QC flag) when a left shift would lose set bits.  */
uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64) {
        /* Shifting any nonzero value left by >= 64 always overflows.  */
        if (val) {
            val = ~(uint64_t)0;
            SET_QC();
        } else {
            val = 0;
        }
    } else if (shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        uint64_t tmp = val;
        val <<= shift;
        /* Overflow iff the shift is not exactly reversible.  */
        if ((val >> shift) != tmp) {
            SET_QC();
            val = ~(uint64_t)0;
        }
    }
    return val;
}
  570 +
  571 +#define NEON_FN(dest, src1, src2) do { \
  572 + int8_t tmp; \
  573 + tmp = (int8_t)src2; \
  574 + if (tmp >= sizeof(src1) * 8) { \
  575 + if (src1) \
  576 + SET_QC(); \
  577 + dest = src1 >> 31; \
  578 + } else if (tmp <= -sizeof(src1) * 8) { \
  579 + dest = src1 >> 31; \
  580 + } else if (tmp < 0) { \
  581 + dest = src1 >> -tmp; \
  582 + } else { \
  583 + dest = src1 << tmp; \
  584 + if ((dest >> tmp) != src1) { \
  585 + SET_QC(); \
  586 + dest = src2 >> 31; \
  587 + } \
  588 + }} while (0)
  589 +NEON_VOP_ENV(qshl_s8, neon_s8, 4)
  590 +NEON_VOP_ENV(qshl_s16, neon_s16, 2)
  591 +NEON_VOP_ENV(qshl_s32, neon_s32, 1)
  592 +#undef NEON_FN
  593 +
  594 +uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
  595 +{
  596 + int8_t shift = (uint8_t)shiftop;
  597 + int64_t val = valop;
  598 + if (shift >= 64) {
  599 + if (val) {
  600 + SET_QC();
  601 + val = (val >> 63) & ~SIGNBIT64;
  602 + }
  603 + } else if (shift <= 64) {
  604 + val >>= 63;
  605 + } else if (shift < 0) {
  606 + val >>= -shift;
  607 + } else {
  608 + int64_t tmp = val;
  609 + val <<= shift;
  610 + if ((val >> shift) != tmp) {
  611 + SET_QC();
  612 + val = (tmp >> 63) ^ ~SIGNBIT64;
  613 + }
  614 + }
  615 + return val;
  616 +}
  617 +
  618 +
/* Unsigned saturating rounding shift left (VQRSHL.U).
   FIXME: This is wrong.  Shift counts at or beyond the element width are
   not handled, and the rounding constant (1 << (-1 - tmp)) can misbehave
   for large negative counts.  */
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = ~0; \
        } \
    }} while (0)
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_VOP_ENV(qrshl_u32, neon_u32, 1)
#undef NEON_FN
  636 +
  637 +uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
  638 +{
  639 + int8_t shift = (int8_t)shiftop;
  640 + if (shift < 0) {
  641 + val = (val + (1 << (-1 - shift))) >> -shift;
  642 + } else { \
  643 + uint64_t tmp = val;
  644 + val <<= shift;
  645 + if ((val >> shift) != tmp) {
  646 + SET_QC();
  647 + val = ~0;
  648 + }
  649 + }
  650 + return val;
  651 +}
  652 +
/* Signed saturating rounding shift left (VQRSHL.S).
   NOTE(review): like the unsigned version above (see FIXME) this lacks
   range checks on the shift count, and "src1 >> 31" (0 or -1) looks wrong
   as a saturation value for signed lanes — confirm against the spec.  */
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = src1 >> 31; \
        } \
    }} while (0)
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_VOP_ENV(qrshl_s32, neon_s32, 1)
#undef NEON_FN
  669 +
  670 +uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
  671 +{
  672 + int8_t shift = (uint8_t)shiftop;
  673 + int64_t val = valop;
  674 +
  675 + if (shift < 0) {
  676 + val = (val + (1 << (-1 - shift))) >> -shift;
  677 + } else {
  678 + int64_t tmp = val;;
  679 + val <<= shift;
  680 + if ((val >> shift) != tmp) {
  681 + SET_QC();
  682 + val = tmp >> 31;
  683 + }
  684 + }
  685 + return val;
  686 +}
  687 +
  688 +uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
  689 +{
  690 + uint32_t mask;
  691 + mask = (a ^ b) & 0x80808080u;
  692 + a &= ~0x80808080u;
  693 + b &= ~0x80808080u;
  694 + return (a + b) ^ mask;
  695 +}
  696 +
  697 +uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
  698 +{
  699 + uint32_t mask;
  700 + mask = (a ^ b) & 0x80008000u;
  701 + a &= ~0x80008000u;
  702 + b &= ~0x80008000u;
  703 + return (a + b) ^ mask;
  704 +}
  705 +
/* Pairwise add of adjacent lanes (VPADD).  */
#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise subtract (modular, no saturation).  */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise multiply, keeping the low half of each product.  */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN
  720 +
/* Polynomial multiplication is like integer multiplication except the
   partial products are XORed, not added (carry-less GF(2) multiply).
   Operates on four 8-bit lanes at once; the per-iteration masks keep a
   lane's bits from bleeding into its neighbour as op2 shifts left.  */
uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
{
    uint32_t mask;
    uint32_t result;
    result = 0;
    while (op1) {
        /* Build a byte mask of the lanes whose current op1 bit is set.  */
        mask = 0;
        if (op1 & 1)
            mask |= 0xff;
        if (op1 & (1 << 8))
            mask |= (0xff << 8);
        if (op1 & (1 << 16))
            mask |= (0xff << 16);
        if (op1 & (1 << 24))
            mask |= (0xff << 24);
        result ^= op2 & mask;
        op1 = (op1 >> 1) & 0x7f7f7f7f;
        op2 = (op2 << 1) & 0xfefefefe;
    }
    return result;
}
  744 +
/* Lane-wise bit test (VTST): all-ones if src1 & src2 is nonzero.  */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Lane-wise compare equal (VCEQ): all-ones on match, zero otherwise.  */
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
NEON_VOP(ceq_u8, neon_u8, 4)
NEON_VOP(ceq_u16, neon_u16, 2)
NEON_VOP(ceq_u32, neon_u32, 1)
#undef NEON_FN

/* Lane-wise absolute value (non-saturating: INT_MIN maps to itself).  */
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
NEON_VOP1(abs_s8, neon_s8, 4)
NEON_VOP1(abs_s16, neon_s16, 2)
#undef NEON_FN
  761 +
/* Count leading zero bits of an 8-bit value; returns 8 for x == 0.  */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;
    while (x) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
  770 +
/* Count leading zero bits of a 16-bit value; returns 16 for x == 0.  */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;
    while (x) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
  778 +
/* Lane-wise count leading zeros (VCLZ).  */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise count leading sign bits (VCLS): complement negative values
   so the sign-bit run becomes a zero run, then exclude the sign bit.  */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
  794 +
  795 +uint32_t HELPER(neon_cls_s32)(uint32_t x)
  796 +{
  797 + int count;
  798 + if ((int32_t)x < 0)
  799 + x = ~x;
  800 + for (count = 32; x; count--)
  801 + x = x >> 1;
  802 + return count - 1;
  803 +}
  804 +
/* Bit count. */
/* Per-byte population count (VCNT.8) via parallel bit-summing; partial
   sums never overflow an 8-bit lane, so no cross-lane masking needed.  */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}
  813 +
  814 +#define NEON_QDMULH16(dest, src1, src2, round) do { \
  815 + uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
  816 + if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
  817 + SET_QC(); \
  818 + tmp = (tmp >> 31) ^ ~SIGNBIT; \
  819 + } \
  820 + tmp <<= 1; \
  821 + if (round) { \
  822 + int32_t old = tmp; \
  823 + tmp += 1 << 15; \
  824 + if ((int32_t)tmp < old) { \
  825 + SET_QC(); \
  826 + tmp = SIGNBIT - 1; \
  827 + } \
  828 + } \
  829 + dest = tmp >> 16; \
  830 + } while(0)
  831 +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
  832 +NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
  833 +#undef NEON_FN
  834 +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
  835 +NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
  836 +#undef NEON_FN
  837 +#undef NEON_QDMULH16
  838 +
/* Saturating doubling multiply returning high half (VQDMULH/VQRDMULH.S32);
   64-bit analogue of NEON_QDMULH16 above.  */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    /* Doubling overflows only for 0x80000000 * 0x80000000.  */ \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
  864 +
/* Narrow each 16-bit lane of a 64-bit value to 8 bits (keep low byte).  */
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

/* Narrow each 32-bit lane to 16 bits (keep low half).  */
uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

/* Narrow each 16-bit lane to 8 bits, keeping the HIGH byte.  */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* Narrow each 32-bit lane to 16 bits, keeping the HIGH half.  */
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* As narrow_high_u8 but rounding to nearest: keep the rounding bit
   (bit 7 of each lane), add the rounding constant, then take the high
   byte.  Masked low bits ensure a lane carry cannot disturb the bits
   the neighbouring lane actually reads.  */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* As narrow_high_u16 but rounding to nearest.  */
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
  901 +
/* Saturating narrow of four u16 lanes to u8 (VQMOVN.U16): values above
   0xff clamp to 0xff and set QC.  */
uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Saturating narrow of four s16 lanes to s8: out-of-range values clamp
   to 0x7f/0x80 by sign and set QC.  */
uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Saturating narrow of two u32 lanes to u16.  */
uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Saturating narrow of two s32 lanes to s16: clamp to 0x7fff/0x8000.  */
uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

/* Saturating narrow of one u64 to u32.  */
uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Saturating narrow of one s64 to s32: clamp to INT32_MAX/INT32_MIN.  */
uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return (x >> 63) ^ 0x7fffffff;
    }
    return x;
}
  999 +
/* Widen four u8 lanes to u16 lanes (zero extension).  */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen four s8 lanes to s16 lanes (sign extension).  */
uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen two u16 lanes to u32 lanes (zero extension).  */
uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

/* Widen two s16 lanes to s32 lanes (sign extension).  */
uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}
  1039 +
/* Lane-wise add of 16-bit lanes packed in 64 bits, suppressing
   inter-lane carries (add low bits, XOR the lane sign bits back in).  */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

/* Lane-wise add of 32-bit lanes packed in 64 bits, same technique.  */
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

/* Pairwise add of adjacent 16-bit lanes of each operand; each pair sum
   is truncated back to 16 bits.  Results from a fill the low half of
   the output, results from b the high half.
   NOTE(review): despite the "_l" name, the sums are narrowed, not
   widened — confirm against the translator's use for VPADD.  */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

/* Pairwise add of the two 32-bit lanes of each operand (truncated).  */
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

/* Lane-wise subtract of 16-bit lanes, suppressing inter-lane borrows.  */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

/* Lane-wise subtract of 32-bit lanes, same technique.  */
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}
  1097 +
/* Saturating add of two s32 lanes packed in 64 bits: overflow occurs
   when the operands share a sign and the sum's sign differs; saturate
   to INT32_MAX/INT32_MIN and set QC.  */
uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

/* Saturating s64 add: same overflow rule, saturating to INT64 limits.  */
uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
  1131 +
/* Absolute difference after truncating/sign-extending both operands to
   the named lane type; the subtraction order guarantees a non-negative
   result without needing abs().  */
#define DO_ABD(dest, x, y, type) do { \
    type tmp_x = x; \
    type tmp_y = y; \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Widening absolute difference, u8 lanes -> u16 lanes (VABDL.U8).  */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t);
    result |= tmp << 48;
    return result;
}

/* Widening absolute difference, s8 lanes -> u16 lanes (VABDL.S8).  */
uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t);
    result |= tmp << 48;
    return result;
}

/* Widening absolute difference, u16 lanes -> u32 lanes.  */
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t);
    return result | (tmp << 32);
}

/* Widening absolute difference, s16 lanes -> u32 lanes.  */
uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t);
    return result | (tmp << 32);
}

/* Widening absolute difference of two u32 values -> u64.  */
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t);
    return result;
}

/* Widening absolute difference of two s32 values -> u64.  */
uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t);
    return result;
}
#undef DO_ABD
  1198 +
/* Widening multiply.  Named type is the source type.  type1 gives the
   signedness of the lanes; the product is computed modulo the double
   width (type2), which equals the exact signed product for in-range
   inputs.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

/* VMULL.U8: four u8 * u8 -> u16 products.  */
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* VMULL.S8: four s8 * s8 -> s16 products (stored as u16 lanes).  */
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* VMULL.U16: two u16 * u16 -> u32 products.  */
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

/* VMULL.S16: two s16 * s16 -> s32 products (stored as u32 lanes).  */
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}
  1255 +
/* Negate each 16-bit lane of a 64-bit value independently (two's
   complement per lane; the uint16_t casts discard cross-lane borrows).  */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}
  1269 +
  1270 +#include <stdio.h>
  1271 +uint64_t HELPER(neon_negl_u32)(uint64_t x)
  1272 +{
  1273 + uint32_t low = -x;
  1274 + uint32_t high = -(x >> 32);
  1275 + return low | ((uint64_t)high << 32);
  1276 +}
  1277 +
  1278 +/* FIXME: There should be a native op for this. */
  1279 +uint64_t HELPER(neon_negl_u64)(uint64_t x)
  1280 +{
  1281 + return -x;
  1282 +}
  1283 +
/* Saturating sign manipulation. */
  1285 +/* ??? Make these use NEON_VOP1 */
/* Saturating absolute value per s8 lane: |INT8_MIN| is not
   representable, so it clamps to 0x7f and sets QC.  */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8
  1305 +
/* Saturating negate per s8 lane: -INT8_MIN clamps to 0x7f, sets QC.  */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8
  1325 +
/* Saturating absolute value per s16 lane: |INT16_MIN| clamps to 0x7fff
   and sets QC.  */
#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16
  1343 +
/* Saturating negate per s16 lane: -INT16_MIN clamps to 0x7fff, sets QC.  */
#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16
  1361 +
/* Saturating 32-bit absolute value: |INT32_MIN| clamps to INT32_MAX
   and sets QC.  */
uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

/* Saturating 32-bit negate: -INT32_MIN clamps to INT32_MAX, sets QC.  */
uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}
  1383 +
/* NEON Float helpers.  Single-precision values travel as raw 32-bit
   words; vfp_itos/vfp_stoi convert between them and softfloat float32.  */

/* Minimum of two f32 values; on a tie (or unordered compare) b wins.  */
uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
{
    float32 f0 = vfp_itos(a);
    float32 f1 = vfp_itos(b);
    return (float32_compare_quiet(f0, f1, NFS) == -1) ? a : b;
}

/* Maximum of two f32 values; on a tie (or unordered compare) b wins.  */
uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
{
    float32 f0 = vfp_itos(a);
    float32 f1 = vfp_itos(b);
    return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b;
}

/* Absolute difference |a - b| of two f32 values.  */
uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
{
    float32 f0 = vfp_itos(a);
    float32 f1 = vfp_itos(b);
    return vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
                    ? float32_sub(f0, f1, NFS)
                    : float32_sub(f1, f0, NFS));
}

/* f32 add using the NEON softfloat status (NFS).  */
uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
{
    return vfp_stoi(float32_add(vfp_itos(a), vfp_itos(b), NFS));
}

/* f32 subtract.  */
uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
{
    return vfp_stoi(float32_sub(vfp_itos(a), vfp_itos(b), NFS));
}

/* f32 multiply.  */
uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
{
    return vfp_stoi(float32_mul(vfp_itos(a), vfp_itos(b), NFS));
}
  1422 +
  1423 +/* Floating point comparisons produce an integer result. */
  1424 +#define NEON_VOP_FCMP(name, cmp) \
  1425 +uint32_t HELPER(neon_##name)(uint32_t a, uint32_t b) \
  1426 +{ \
  1427 + if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) cmp 0) \
  1428 + return ~0; \
  1429 + else \
  1430 + return 0; \
  1431 +}
  1432 +
  1433 +NEON_VOP_FCMP(ceq_f32, ==)
  1434 +NEON_VOP_FCMP(cge_f32, >=)
  1435 +NEON_VOP_FCMP(cgt_f32, >)
  1436 +
  1437 +uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
  1438 +{
  1439 + float32 f0 = float32_abs(vfp_itos(a));
  1440 + float32 f1 = float32_abs(vfp_itos(b));
  1441 + return (float32_compare_quiet(f0, f1,NFS) >= 0) ? ~0 : 0;
  1442 +}
  1443 +
  1444 +uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
  1445 +{
  1446 + float32 f0 = float32_abs(vfp_itos(a));
  1447 + float32 f1 = float32_abs(vfp_itos(b));
  1448 + return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0;
  1449 +}
target-arm/op.c
@@ -32,7 +32,5 @@ @@ -32,7 +32,5 @@
32 #include "op_mem.h" 32 #include "op_mem.h"
33 #endif 33 #endif
34 34
35 -#include "op_neon.h"  
36 -  
37 /* iwMMXt support */ 35 /* iwMMXt support */
38 #include "op_iwmmxt.c" 36 #include "op_iwmmxt.c"
target-arm/op_helper.c
@@ -20,6 +20,9 @@ @@ -20,6 +20,9 @@
20 #include "exec.h" 20 #include "exec.h"
21 #include "helpers.h" 21 #include "helpers.h"
22 22
  23 +#define SIGNBIT (uint32_t)0x80000000
  24 +#define SIGNBIT64 ((uint64_t)1 << 63)
  25 +
23 void raise_exception(int tt) 26 void raise_exception(int tt)
24 { 27 {
25 env->exception_index = tt; 28 env->exception_index = tt;
@@ -116,7 +119,8 @@ void tlb_fill (target_ulong addr, int is_write, int mmu_idx, void *retaddr) @@ -116,7 +119,8 @@ void tlb_fill (target_ulong addr, int is_write, int mmu_idx, void *retaddr)
116 } 119 }
117 #endif 120 #endif
118 121
119 -#define SIGNBIT (uint32_t)0x80000000 122 +/* FIXME: Pass an explicit pointer to QF to CPUState, and move saturating
  123 + instructions into helper.c */
120 uint32_t HELPER(add_setq)(uint32_t a, uint32_t b) 124 uint32_t HELPER(add_setq)(uint32_t a, uint32_t b)
121 { 125 {
122 uint32_t res = a + b; 126 uint32_t res = a + b;
@@ -451,3 +455,114 @@ uint32_t HELPER(ror_cc)(uint32_t x, uint32_t i) @@ -451,3 +455,114 @@ uint32_t HELPER(ror_cc)(uint32_t x, uint32_t i)
451 } 455 }
452 } 456 }
453 457
  458 +uint64_t HELPER(neon_add_saturate_s64)(uint64_t src1, uint64_t src2)
  459 +{
  460 + uint64_t res;
  461 +
  462 + res = src1 + src2;
  463 + if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
  464 + env->QF = 1;
  465 + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
  466 + }
  467 + return res;
  468 +}
  469 +
  470 +uint64_t HELPER(neon_add_saturate_u64)(uint64_t src1, uint64_t src2)
  471 +{
  472 + uint64_t res;
  473 +
  474 + res = src1 + src2;
  475 + if (res < src1) {
  476 + env->QF = 1;
  477 + res = ~(uint64_t)0;
  478 + }
  479 + return res;
  480 +}
  481 +
  482 +uint64_t HELPER(neon_sub_saturate_s64)(uint64_t src1, uint64_t src2)
  483 +{
  484 + uint64_t res;
  485 +
  486 + res = src1 - src2;
  487 + if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
  488 + env->QF = 1;
  489 + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
  490 + }
  491 + return res;
  492 +}
  493 +
  494 +uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2)
  495 +{
  496 + uint64_t res;
  497 +
  498 + if (src1 < src2) {
  499 + env->QF = 1;
  500 + res = 0;
  501 + } else {
  502 + res = src1 - src2;
  503 + }
  504 + return res;
  505 +}
  506 +
  507 +/* These need to return a pair of values, so still use T0/T1. */
  508 +/* Transpose. Argument order is rather strange to avoid special casing
  509 + the translation code.
  510 + On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */
  511 +void HELPER(neon_trn_u8)(void)
  512 +{
  513 + uint32_t rd;
  514 + uint32_t rm;
  515 + rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
  516 + rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
  517 + T0 = rd;
  518 + T1 = rm;
  519 + FORCE_RET();
  520 +}
  521 +
  522 +void HELPER(neon_trn_u16)(void)
  523 +{
  524 + uint32_t rd;
  525 + uint32_t rm;
  526 + rd = (T0 << 16) | (T1 & 0xffff);
  527 + rm = (T1 >> 16) | (T0 & 0xffff0000);
  528 + T0 = rd;
  529 + T1 = rm;
  530 + FORCE_RET();
  531 +}
  532 +
  533 +/* Worker routines for zip and unzip. */
  534 +void HELPER(neon_unzip_u8)(void)
  535 +{
  536 + uint32_t rd;
  537 + uint32_t rm;
  538 + rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
  539 + | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
  540 + rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
  541 + | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
  542 + T0 = rd;
  543 + T1 = rm;
  544 + FORCE_RET();
  545 +}
  546 +
  547 +void HELPER(neon_zip_u8)(void)
  548 +{
  549 + uint32_t rd;
  550 + uint32_t rm;
  551 + rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
  552 + | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
  553 + rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
  554 + | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
  555 + T0 = rd;
  556 + T1 = rm;
  557 + FORCE_RET();
  558 +}
  559 +
  560 +void HELPER(neon_zip_u16)(void)
  561 +{
  562 + uint32_t tmp;
  563 +
  564 + tmp = (T0 & 0xffff) | (T1 << 16);
  565 + T1 = (T1 & 0xffff0000) | (T0 >> 16);
  566 + T0 = tmp;
  567 + FORCE_RET();
  568 +}
target-arm/op_neon.h deleted 100644 โ†’ 0
1 -/*  
2 - * ARM NEON vector operations.  
3 - *  
4 - * Copyright (c) 2007 CodeSourcery.  
5 - * Written by Paul Brook  
6 - *  
7 - * This code is licenced under the GPL.  
8 - */  
9 -/* Note that for NEON an "l" prefix means it is a wide operation, unlike  
10 - scalar arm ops where it means a word size operation. */  
11 -  
12 -#define SIGNBIT (uint32_t)0x80000000  
13 -/* ??? NEON ops should probably have their own float status. */  
14 -#define NFS &env->vfp.fp_status  
15 -#define NEON_OP(name) void OPPROTO op_neon_##name (void)  
16 -  
17 -/* Helper routines to perform bitwise copies between float and int. */  
18 -static inline float32 vfp_itos(uint32_t i)  
19 -{  
20 - union {  
21 - uint32_t i;  
22 - float32 s;  
23 - } v;  
24 -  
25 - v.i = i;  
26 - return v.s;  
27 -}  
28 -  
29 -static inline uint32_t vfp_stoi(float32 s)  
30 -{  
31 - union {  
32 - uint32_t i;  
33 - float32 s;  
34 - } v;  
35 -  
36 - v.s = s;  
37 - return v.i;  
38 -}  
39 -  
40 -NEON_OP(getreg_T0)  
41 -{  
42 - T0 = *(uint32_t *)((char *) env + PARAM1);  
43 -}  
44 -  
45 -NEON_OP(getreg_T1)  
46 -{  
47 - T1 = *(uint32_t *)((char *) env + PARAM1);  
48 -}  
49 -  
50 -NEON_OP(setreg_T0)  
51 -{  
52 - *(uint32_t *)((char *) env + PARAM1) = T0;  
53 -}  
54 -  
55 -NEON_OP(setreg_T1)  
56 -{  
57 - *(uint32_t *)((char *) env + PARAM1) = T1;  
58 -}  
59 -  
60 -#define NEON_TYPE1(name, type) \  
61 -typedef struct \  
62 -{ \  
63 - type v1; \  
64 -} neon_##name;  
65 -#ifdef WORDS_BIGENDIAN  
66 -#define NEON_TYPE2(name, type) \  
67 -typedef struct \  
68 -{ \  
69 - type v2; \  
70 - type v1; \  
71 -} neon_##name;  
72 -#define NEON_TYPE4(name, type) \  
73 -typedef struct \  
74 -{ \  
75 - type v4; \  
76 - type v3; \  
77 - type v2; \  
78 - type v1; \  
79 -} neon_##name;  
80 -#else  
81 -#define NEON_TYPE2(name, type) \  
82 -typedef struct \  
83 -{ \  
84 - type v1; \  
85 - type v2; \  
86 -} neon_##name;  
87 -#define NEON_TYPE4(name, type) \  
88 -typedef struct \  
89 -{ \  
90 - type v1; \  
91 - type v2; \  
92 - type v3; \  
93 - type v4; \  
94 -} neon_##name;  
95 -#endif  
96 -  
97 -NEON_TYPE4(s8, int8_t)  
98 -NEON_TYPE4(u8, uint8_t)  
99 -NEON_TYPE2(s16, int16_t)  
100 -NEON_TYPE2(u16, uint16_t)  
101 -NEON_TYPE1(s32, int32_t)  
102 -NEON_TYPE1(u32, uint32_t)  
103 -#undef NEON_TYPE4  
104 -#undef NEON_TYPE2  
105 -#undef NEON_TYPE1  
106 -  
107 -/* Copy from a uint32_t to a vector structure type. */  
108 -#define NEON_UNPACK(vtype, dest, val) do { \  
109 - union { \  
110 - vtype v; \  
111 - uint32_t i; \  
112 - } conv_u; \  
113 - conv_u.i = (val); \  
114 - dest = conv_u.v; \  
115 - } while(0)  
116 -  
117 -/* Copy from a vector structure type to a uint32_t. */  
118 -#define NEON_PACK(vtype, dest, val) do { \  
119 - union { \  
120 - vtype v; \  
121 - uint32_t i; \  
122 - } conv_u; \  
123 - conv_u.v = (val); \  
124 - dest = conv_u.i; \  
125 - } while(0)  
126 -  
127 -#define NEON_DO1 \  
128 - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);  
129 -#define NEON_DO2 \  
130 - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \  
131 - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);  
132 -#define NEON_DO4 \  
133 - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \  
134 - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \  
135 - NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \  
136 - NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);  
137 -  
138 -#define NEON_VOP(name, vtype, n) \  
139 -NEON_OP(name) \  
140 -{ \  
141 - vtype vsrc1; \  
142 - vtype vsrc2; \  
143 - vtype vdest; \  
144 - NEON_UNPACK(vtype, vsrc1, T0); \  
145 - NEON_UNPACK(vtype, vsrc2, T1); \  
146 - NEON_DO##n; \  
147 - NEON_PACK(vtype, T0, vdest); \  
148 - FORCE_RET(); \  
149 -}  
150 -  
151 -#define NEON_VOP1(name, vtype, n) \  
152 -NEON_OP(name) \  
153 -{ \  
154 - vtype vsrc1; \  
155 - vtype vdest; \  
156 - NEON_UNPACK(vtype, vsrc1, T0); \  
157 - NEON_DO##n; \  
158 - NEON_PACK(vtype, T0, vdest); \  
159 - FORCE_RET(); \  
160 -}  
161 -  
162 -/* Pairwise operations. */  
163 -/* For 32-bit elements each segment only contains a single element, so  
164 - the elementwise and pairwise operations are the same. */  
165 -#define NEON_PDO2 \  
166 - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \  
167 - NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);  
168 -#define NEON_PDO4 \  
169 - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \  
170 - NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \  
171 - NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \  
172 - NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \  
173 -  
174 -#define NEON_POP(name, vtype, n) \  
175 -NEON_OP(name) \  
176 -{ \  
177 - vtype vsrc1; \  
178 - vtype vsrc2; \  
179 - vtype vdest; \  
180 - NEON_UNPACK(vtype, vsrc1, T0); \  
181 - NEON_UNPACK(vtype, vsrc2, T1); \  
182 - NEON_PDO##n; \  
183 - NEON_PACK(vtype, T0, vdest); \  
184 - FORCE_RET(); \  
185 -}  
186 -  
187 -#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1  
188 -NEON_VOP(hadd_s8, neon_s8, 4)  
189 -NEON_VOP(hadd_u8, neon_u8, 4)  
190 -NEON_VOP(hadd_s16, neon_s16, 2)  
191 -NEON_VOP(hadd_u16, neon_u16, 2)  
192 -#undef NEON_FN  
193 -  
194 -NEON_OP(hadd_s32)  
195 -{  
196 - int32_t src1 = T0;  
197 - int32_t src2 = T1;  
198 - int32_t dest;  
199 -  
200 - dest = (src1 >> 1) + (src2 >> 1);  
201 - if (src1 & src2 & 1)  
202 - dest++;  
203 - T0 = dest;  
204 - FORCE_RET();  
205 -}  
206 -  
207 -NEON_OP(hadd_u32)  
208 -{  
209 - uint32_t src1 = T0;  
210 - uint32_t src2 = T1;  
211 - uint32_t dest;  
212 -  
213 - dest = (src1 >> 1) + (src2 >> 1);  
214 - if (src1 & src2 & 1)  
215 - dest++;  
216 - T0 = dest;  
217 - FORCE_RET();  
218 -}  
219 -  
220 -#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1  
221 -NEON_VOP(rhadd_s8, neon_s8, 4)  
222 -NEON_VOP(rhadd_u8, neon_u8, 4)  
223 -NEON_VOP(rhadd_s16, neon_s16, 2)  
224 -NEON_VOP(rhadd_u16, neon_u16, 2)  
225 -#undef NEON_FN  
226 -  
227 -NEON_OP(rhadd_s32)  
228 -{  
229 - int32_t src1 = T0;  
230 - int32_t src2 = T1;  
231 - int32_t dest;  
232 -  
233 - dest = (src1 >> 1) + (src2 >> 1);  
234 - if ((src1 | src2) & 1)  
235 - dest++;  
236 - T0 = dest;  
237 - FORCE_RET();  
238 -}  
239 -  
240 -NEON_OP(rhadd_u32)  
241 -{  
242 - uint32_t src1 = T0;  
243 - uint32_t src2 = T1;  
244 - uint32_t dest;  
245 -  
246 - dest = (src1 >> 1) + (src2 >> 1);  
247 - if ((src1 | src2) & 1)  
248 - dest++;  
249 - T0 = dest;  
250 - FORCE_RET();  
251 -}  
252 -  
253 -#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1  
254 -NEON_VOP(hsub_s8, neon_s8, 4)  
255 -NEON_VOP(hsub_u8, neon_u8, 4)  
256 -NEON_VOP(hsub_s16, neon_s16, 2)  
257 -NEON_VOP(hsub_u16, neon_u16, 2)  
258 -#undef NEON_FN  
259 -  
260 -NEON_OP(hsub_s32)  
261 -{  
262 - int32_t src1 = T0;  
263 - int32_t src2 = T1;  
264 - int32_t dest;  
265 -  
266 - dest = (src1 >> 1) - (src2 >> 1);  
267 - if ((~src1) & src2 & 1)  
268 - dest--;  
269 - T0 = dest;  
270 - FORCE_RET();  
271 -}  
272 -  
273 -NEON_OP(hsub_u32)  
274 -{  
275 - uint32_t src1 = T0;  
276 - uint32_t src2 = T1;  
277 - uint32_t dest;  
278 -  
279 - dest = (src1 >> 1) - (src2 >> 1);  
280 - if ((~src1) & src2 & 1)  
281 - dest--;  
282 - T0 = dest;  
283 - FORCE_RET();  
284 -}  
285 -  
286 -#define NEON_USAT(dest, src1, src2, type) do { \  
287 - uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \  
288 - if (tmp != (type)tmp) { \  
289 - env->QF = 1; \  
290 - dest = ~0; \  
291 - } else { \  
292 - dest = tmp; \  
293 - }} while(0)  
294 -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)  
295 -NEON_VOP(qadd_u8, neon_u8, 4)  
296 -#undef NEON_FN  
297 -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)  
298 -NEON_VOP(qadd_u16, neon_u16, 2)  
299 -#undef NEON_FN  
300 -#undef NEON_USAT  
301 -  
302 -#define NEON_SSAT(dest, src1, src2, type) do { \  
303 - int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \  
304 - if (tmp != (type)tmp) { \  
305 - env->QF = 1; \  
306 - if (src2 > 0) { \  
307 - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \  
308 - } else { \  
309 - tmp = 1 << (sizeof(type) * 8 - 1); \  
310 - } \  
311 - } \  
312 - dest = tmp; \  
313 - } while(0)  
314 -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)  
315 -NEON_VOP(qadd_s8, neon_s8, 4)  
316 -#undef NEON_FN  
317 -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)  
318 -NEON_VOP(qadd_s16, neon_s16, 2)  
319 -#undef NEON_FN  
320 -#undef NEON_SSAT  
321 -  
322 -#define NEON_USAT(dest, src1, src2, type) do { \  
323 - uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \  
324 - if (tmp != (type)tmp) { \  
325 - env->QF = 1; \  
326 - dest = 0; \  
327 - } else { \  
328 - dest = tmp; \  
329 - }} while(0)  
330 -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)  
331 -NEON_VOP(qsub_u8, neon_u8, 4)  
332 -#undef NEON_FN  
333 -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)  
334 -NEON_VOP(qsub_u16, neon_u16, 2)  
335 -#undef NEON_FN  
336 -#undef NEON_USAT  
337 -  
338 -#define NEON_SSAT(dest, src1, src2, type) do { \  
339 - int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \  
340 - if (tmp != (type)tmp) { \  
341 - env->QF = 1; \  
342 - if (src2 < 0) { \  
343 - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \  
344 - } else { \  
345 - tmp = 1 << (sizeof(type) * 8 - 1); \  
346 - } \  
347 - } \  
348 - dest = tmp; \  
349 - } while(0)  
350 -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)  
351 -NEON_VOP(qsub_s8, neon_s8, 4)  
352 -#undef NEON_FN  
353 -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)  
354 -NEON_VOP(qsub_s16, neon_s16, 2)  
355 -#undef NEON_FN  
356 -#undef NEON_SSAT  
357 -  
358 -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0  
359 -NEON_VOP(cgt_s8, neon_s8, 4)  
360 -NEON_VOP(cgt_u8, neon_u8, 4)  
361 -NEON_VOP(cgt_s16, neon_s16, 2)  
362 -NEON_VOP(cgt_u16, neon_u16, 2)  
363 -NEON_VOP(cgt_s32, neon_s32, 1)  
364 -NEON_VOP(cgt_u32, neon_u32, 1)  
365 -#undef NEON_FN  
366 -  
367 -#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0  
368 -NEON_VOP(cge_s8, neon_s8, 4)  
369 -NEON_VOP(cge_u8, neon_u8, 4)  
370 -NEON_VOP(cge_s16, neon_s16, 2)  
371 -NEON_VOP(cge_u16, neon_u16, 2)  
372 -NEON_VOP(cge_s32, neon_s32, 1)  
373 -NEON_VOP(cge_u32, neon_u32, 1)  
374 -#undef NEON_FN  
375 -  
376 -#define NEON_FN(dest, src1, src2) do { \  
377 - int8_t tmp; \  
378 - tmp = (int8_t)src2; \  
379 - if (tmp < 0) { \  
380 - dest = src1 >> -tmp; \  
381 - } else { \  
382 - dest = src1 << tmp; \  
383 - }} while (0)  
384 -NEON_VOP(shl_s8, neon_s8, 4)  
385 -NEON_VOP(shl_u8, neon_u8, 4)  
386 -NEON_VOP(shl_s16, neon_s16, 2)  
387 -NEON_VOP(shl_u16, neon_u16, 2)  
388 -NEON_VOP(shl_s32, neon_s32, 1)  
389 -NEON_VOP(shl_u32, neon_u32, 1)  
390 -#undef NEON_FN  
391 -  
392 -NEON_OP(shl_u64)  
393 -{  
394 - int8_t shift = env->vfp.scratch[0];  
395 - uint64_t val = T0 | ((uint64_t)T1 << 32);  
396 - if (shift < 0) {  
397 - val >>= -shift;  
398 - } else {  
399 - val <<= shift;  
400 - }  
401 - T0 = val;  
402 - T1 = val >> 32;  
403 - FORCE_RET();  
404 -}  
405 -  
406 -NEON_OP(shl_s64)  
407 -{  
408 - int8_t shift = env->vfp.scratch[0];  
409 - int64_t val = T0 | ((uint64_t)T1 << 32);  
410 - if (shift < 0) {  
411 - val >>= -shift;  
412 - } else {  
413 - val <<= shift;  
414 - }  
415 - T0 = val;  
416 - T1 = val >> 32;  
417 - FORCE_RET();  
418 -}  
419 -  
420 -#define NEON_FN(dest, src1, src2) do { \  
421 - int8_t tmp; \  
422 - tmp = (int8_t)src1; \  
423 - if (tmp < 0) { \  
424 - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \  
425 - } else { \  
426 - dest = src2 << tmp; \  
427 - }} while (0)  
428 -  
429 -NEON_VOP(rshl_s8, neon_s8, 4)  
430 -NEON_VOP(rshl_u8, neon_u8, 4)  
431 -NEON_VOP(rshl_s16, neon_s16, 2)  
432 -NEON_VOP(rshl_u16, neon_u16, 2)  
433 -NEON_VOP(rshl_s32, neon_s32, 1)  
434 -NEON_VOP(rshl_u32, neon_u32, 1)  
435 -#undef NEON_FN  
436 -  
437 -NEON_OP(rshl_u64)  
438 -{  
439 - int8_t shift = env->vfp.scratch[0];  
440 - uint64_t val = T0 | ((uint64_t)T1 << 32);  
441 - if (shift < 0) {  
442 - val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;  
443 - val >>= -shift;  
444 - } else {  
445 - val <<= shift;  
446 - }  
447 - T0 = val;  
448 - T1 = val >> 32;  
449 - FORCE_RET();  
450 -}  
451 -  
452 -NEON_OP(rshl_s64)  
453 -{  
454 - int8_t shift = env->vfp.scratch[0];  
455 - int64_t val = T0 | ((uint64_t)T1 << 32);  
456 - if (shift < 0) {  
457 - val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;  
458 - } else {  
459 - val <<= shift;  
460 - }  
461 - T0 = val;  
462 - T1 = val >> 32;  
463 - FORCE_RET();  
464 -}  
465 -  
466 -#define NEON_FN(dest, src1, src2) do { \  
467 - int8_t tmp; \  
468 - tmp = (int8_t)src1; \  
469 - if (tmp < 0) { \  
470 - dest = src2 >> -tmp; \  
471 - } else { \  
472 - dest = src2 << tmp; \  
473 - if ((dest >> tmp) != src2) { \  
474 - env->QF = 1; \  
475 - dest = ~0; \  
476 - } \  
477 - }} while (0)  
478 -NEON_VOP(qshl_s8, neon_s8, 4)  
479 -NEON_VOP(qshl_s16, neon_s16, 2)  
480 -NEON_VOP(qshl_s32, neon_s32, 1)  
481 -#undef NEON_FN  
482 -  
483 -NEON_OP(qshl_s64)  
484 -{  
485 - int8_t shift = env->vfp.scratch[0];  
486 - int64_t val = T0 | ((uint64_t)T1 << 32);  
487 - if (shift < 0) {  
488 - val >>= -shift;  
489 - } else {  
490 - int64_t tmp = val;  
491 - val <<= shift;  
492 - if ((val >> shift) != tmp) {  
493 - env->QF = 1;  
494 - val = (tmp >> 63) ^ 0x7fffffffffffffffULL;  
495 - }  
496 - }  
497 - T0 = val;  
498 - T1 = val >> 32;  
499 - FORCE_RET();  
500 -}  
501 -  
502 -#define NEON_FN(dest, src1, src2) do { \  
503 - int8_t tmp; \  
504 - tmp = (int8_t)src1; \  
505 - if (tmp < 0) { \  
506 - dest = src2 >> -tmp; \  
507 - } else { \  
508 - dest = src2 << tmp; \  
509 - if ((dest >> tmp) != src2) { \  
510 - env->QF = 1; \  
511 - dest = src2 >> 31; \  
512 - } \  
513 - }} while (0)  
514 -NEON_VOP(qshl_u8, neon_u8, 4)  
515 -NEON_VOP(qshl_u16, neon_u16, 2)  
516 -NEON_VOP(qshl_u32, neon_u32, 1)  
517 -#undef NEON_FN  
518 -  
519 -NEON_OP(qshl_u64)  
520 -{  
521 - int8_t shift = env->vfp.scratch[0];  
522 - uint64_t val = T0 | ((uint64_t)T1 << 32);  
523 - if (shift < 0) {  
524 - val >>= -shift;  
525 - } else {  
526 - uint64_t tmp = val;  
527 - val <<= shift;  
528 - if ((val >> shift) != tmp) {  
529 - env->QF = 1;  
530 - val = ~(uint64_t)0;  
531 - }  
532 - }  
533 - T0 = val;  
534 - T1 = val >> 32;  
535 - FORCE_RET();  
536 -}  
537 -  
538 -#define NEON_FN(dest, src1, src2) do { \  
539 - int8_t tmp; \  
540 - tmp = (int8_t)src1; \  
541 - if (tmp < 0) { \  
542 - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \  
543 - } else { \  
544 - dest = src2 << tmp; \  
545 - if ((dest >> tmp) != src2) { \  
546 - dest = ~0; \  
547 - } \  
548 - }} while (0)  
549 -NEON_VOP(qrshl_s8, neon_s8, 4)  
550 -NEON_VOP(qrshl_s16, neon_s16, 2)  
551 -NEON_VOP(qrshl_s32, neon_s32, 1)  
552 -#undef NEON_FN  
553 -  
554 -#define NEON_FN(dest, src1, src2) do { \  
555 - int8_t tmp; \  
556 - tmp = (int8_t)src1; \  
557 - if (tmp < 0) { \  
558 - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \  
559 - } else { \  
560 - dest = src2 << tmp; \  
561 - if ((dest >> tmp) != src2) { \  
562 - env->QF = 1; \  
563 - dest = src2 >> 31; \  
564 - } \  
565 - }} while (0)  
566 -NEON_VOP(qrshl_u8, neon_u8, 4)  
567 -NEON_VOP(qrshl_u16, neon_u16, 2)  
568 -NEON_VOP(qrshl_u32, neon_u32, 1)  
569 -#undef NEON_FN  
570 -  
571 -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2  
572 -NEON_VOP(max_s8, neon_s8, 4)  
573 -NEON_VOP(max_u8, neon_u8, 4)  
574 -NEON_VOP(max_s16, neon_s16, 2)  
575 -NEON_VOP(max_u16, neon_u16, 2)  
576 -NEON_VOP(max_s32, neon_s32, 1)  
577 -NEON_VOP(max_u32, neon_u32, 1)  
578 -NEON_POP(pmax_s8, neon_s8, 4)  
579 -NEON_POP(pmax_u8, neon_u8, 4)  
580 -NEON_POP(pmax_s16, neon_s16, 2)  
581 -NEON_POP(pmax_u16, neon_u16, 2)  
582 -#undef NEON_FN  
583 -  
584 -NEON_OP(max_f32)  
585 -{  
586 - float32 f0 = vfp_itos(T0);  
587 - float32 f1 = vfp_itos(T1);  
588 - T0 = (float32_compare_quiet(f0, f1, NFS) == 1) ? T0 : T1;  
589 - FORCE_RET();  
590 -}  
591 -  
592 -#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2  
593 -NEON_VOP(min_s8, neon_s8, 4)  
594 -NEON_VOP(min_u8, neon_u8, 4)  
595 -NEON_VOP(min_s16, neon_s16, 2)  
596 -NEON_VOP(min_u16, neon_u16, 2)  
597 -NEON_VOP(min_s32, neon_s32, 1)  
598 -NEON_VOP(min_u32, neon_u32, 1)  
599 -NEON_POP(pmin_s8, neon_s8, 4)  
600 -NEON_POP(pmin_u8, neon_u8, 4)  
601 -NEON_POP(pmin_s16, neon_s16, 2)  
602 -NEON_POP(pmin_u16, neon_u16, 2)  
603 -#undef NEON_FN  
604 -  
605 -NEON_OP(min_f32)  
606 -{  
607 - float32 f0 = vfp_itos(T0);  
608 - float32 f1 = vfp_itos(T1);  
609 - T0 = (float32_compare_quiet(f0, f1, NFS) == -1) ? T0 : T1;  
610 - FORCE_RET();  
611 -}  
612 -  
613 -#define NEON_FN(dest, src1, src2) \  
614 - dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)  
615 -NEON_VOP(abd_s8, neon_s8, 4)  
616 -NEON_VOP(abd_u8, neon_u8, 4)  
617 -NEON_VOP(abd_s16, neon_s16, 2)  
618 -NEON_VOP(abd_u16, neon_u16, 2)  
619 -NEON_VOP(abd_s32, neon_s32, 1)  
620 -NEON_VOP(abd_u32, neon_u32, 1)  
621 -#undef NEON_FN  
622 -  
623 -NEON_OP(abd_f32)  
624 -{  
625 - float32 f0 = vfp_itos(T0);  
626 - float32 f1 = vfp_itos(T1);  
627 - T0 = vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)  
628 - ? float32_sub(f0, f1, NFS)  
629 - : float32_sub(f1, f0, NFS));  
630 - FORCE_RET();  
631 -}  
632 -  
633 -#define NEON_FN(dest, src1, src2) dest = src1 + src2  
634 -NEON_VOP(add_u8, neon_u8, 4)  
635 -NEON_VOP(add_u16, neon_u16, 2)  
636 -NEON_POP(padd_u8, neon_u8, 4)  
637 -NEON_POP(padd_u16, neon_u16, 2)  
638 -#undef NEON_FN  
639 -  
640 -NEON_OP(add_f32)  
641 -{  
642 - T0 = vfp_stoi(float32_add(vfp_itos(T0), vfp_itos(T1), NFS));  
643 - FORCE_RET();  
644 -}  
645 -  
646 -#define NEON_FN(dest, src1, src2) dest = src1 - src2  
647 -NEON_VOP(sub_u8, neon_u8, 4)  
648 -NEON_VOP(sub_u16, neon_u16, 2)  
649 -#undef NEON_FN  
650 -  
651 -NEON_OP(sub_f32)  
652 -{  
653 - T0 = vfp_stoi(float32_sub(vfp_itos(T0), vfp_itos(T1), NFS));  
654 - FORCE_RET();  
655 -}  
656 -  
657 -#define NEON_FN(dest, src1, src2) dest = src2 - src1  
658 -NEON_VOP(rsb_u8, neon_u8, 4)  
659 -NEON_VOP(rsb_u16, neon_u16, 2)  
660 -#undef NEON_FN  
661 -  
662 -NEON_OP(rsb_f32)  
663 -{  
664 - T0 = vfp_stoi(float32_sub(vfp_itos(T1), vfp_itos(T0), NFS));  
665 - FORCE_RET();  
666 -}  
667 -  
668 -#define NEON_FN(dest, src1, src2) dest = src1 * src2  
669 -NEON_VOP(mul_u8, neon_u8, 4)  
670 -NEON_VOP(mul_u16, neon_u16, 2)  
671 -#undef NEON_FN  
672 -  
673 -NEON_OP(mul_f32)  
674 -{  
675 - T0 = vfp_stoi(float32_mul(vfp_itos(T0), vfp_itos(T1), NFS));  
676 - FORCE_RET();  
677 -}  
678 -  
679 -NEON_OP(mul_p8)  
680 -{  
681 - T0 = helper_neon_mul_p8(T0, T1);  
682 -}  
683 -  
684 -#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0  
685 -NEON_VOP(tst_u8, neon_u8, 4)  
686 -NEON_VOP(tst_u16, neon_u16, 2)  
687 -NEON_VOP(tst_u32, neon_u32, 1)  
688 -#undef NEON_FN  
689 -  
690 -#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0  
691 -NEON_VOP(ceq_u8, neon_u8, 4)  
692 -NEON_VOP(ceq_u16, neon_u16, 2)  
693 -NEON_VOP(ceq_u32, neon_u32, 1)  
694 -#undef NEON_FN  
695 -  
696 -#define NEON_QDMULH16(dest, src1, src2, round) do { \  
697 - uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \  
698 - if ((tmp ^ (tmp << 1)) & SIGNBIT) { \  
699 - env->QF = 1; \  
700 - tmp = (tmp >> 31) ^ ~SIGNBIT; \  
701 - } \  
702 - tmp <<= 1; \  
703 - if (round) { \  
704 - int32_t old = tmp; \  
705 - tmp += 1 << 15; \  
706 - if ((int32_t)tmp < old) { \  
707 - env->QF = 1; \  
708 - tmp = SIGNBIT - 1; \  
709 - } \  
710 - } \  
711 - dest = tmp >> 16; \  
712 - } while(0)  
713 -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)  
714 -NEON_VOP(qdmulh_s16, neon_s16, 2)  
715 -#undef NEON_FN  
716 -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)  
717 -NEON_VOP(qrdmulh_s16, neon_s16, 2)  
718 -#undef NEON_FN  
719 -#undef NEON_QDMULH16  
720 -  
721 -#define SIGNBIT64 ((uint64_t)1 << 63)  
722 -#define NEON_QDMULH32(dest, src1, src2, round) do { \  
723 - uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \  
724 - if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \  
725 - env->QF = 1; \  
726 - tmp = (tmp >> 63) ^ ~SIGNBIT64; \  
727 - } else { \  
728 - tmp <<= 1; \  
729 - } \  
730 - if (round) { \  
731 - int64_t old = tmp; \  
732 - tmp += (int64_t)1 << 31; \  
733 - if ((int64_t)tmp < old) { \  
734 - env->QF = 1; \  
735 - tmp = SIGNBIT64 - 1; \  
736 - } \  
737 - } \  
738 - dest = tmp >> 32; \  
739 - } while(0)  
740 -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)  
741 -NEON_VOP(qdmulh_s32, neon_s32, 1)  
742 -#undef NEON_FN  
743 -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)  
744 -NEON_VOP(qrdmulh_s32, neon_s32, 1)  
745 -#undef NEON_FN  
746 -#undef NEON_QDMULH32  
747 -  
748 -/* Floating point comparisons produce an integer result. */  
749 -#define NEON_VOP_FCMP(name, cmp) \  
750 -NEON_OP(name) \  
751 -{ \  
752 - if (float32_compare_quiet(vfp_itos(T0), vfp_itos(T1), NFS) cmp 0) \  
753 - T0 = -1; \  
754 - else \  
755 - T0 = 0; \  
756 - FORCE_RET(); \  
757 -}  
758 -  
759 -NEON_VOP_FCMP(ceq_f32, ==)  
760 -NEON_VOP_FCMP(cge_f32, >=)  
761 -NEON_VOP_FCMP(cgt_f32, >)  
762 -  
763 -NEON_OP(acge_f32)  
764 -{  
765 - float32 f0 = float32_abs(vfp_itos(T0));  
766 - float32 f1 = float32_abs(vfp_itos(T1));  
767 - T0 = (float32_compare_quiet(f0, f1,NFS) >= 0) ? -1 : 0;  
768 - FORCE_RET();  
769 -}  
770 -  
771 -NEON_OP(acgt_f32)  
772 -{  
773 - float32 f0 = float32_abs(vfp_itos(T0));  
774 - float32 f1 = float32_abs(vfp_itos(T1));  
775 - T0 = (float32_compare_quiet(f0, f1, NFS) > 0) ? -1 : 0;  
776 - FORCE_RET();  
777 -}  
778 -  
779 -/* Narrowing instructions. The named type is the destination type. */  
780 -NEON_OP(narrow_u8)  
781 -{  
782 - T0 = (T0 & 0xff) | ((T0 >> 8) & 0xff00)  
783 - | ((T1 << 16) & 0xff0000) | (T1 << 24);  
784 - FORCE_RET();  
785 -}  
786 -  
787 -NEON_OP(narrow_sat_u8)  
788 -{  
789 - neon_u16 src;  
790 - neon_u8 dest;  
791 -#define SAT8(d, s) \  
792 - if (s > 0xff) { \  
793 - d = 0xff; \  
794 - env->QF = 1; \  
795 - } else { \  
796 - d = s; \  
797 - }  
798 -  
799 - NEON_UNPACK(neon_u16, src, T0);  
800 - SAT8(dest.v1, src.v1);  
801 - SAT8(dest.v2, src.v2);  
802 - NEON_UNPACK(neon_u16, src, T1);  
803 - SAT8(dest.v3, src.v1);  
804 - SAT8(dest.v4, src.v2);  
805 - NEON_PACK(neon_u8, T0, dest);  
806 - FORCE_RET();  
807 -#undef SAT8  
808 -}  
809 -  
810 -NEON_OP(narrow_sat_s8)  
811 -{  
812 - neon_s16 src;  
813 - neon_s8 dest;  
814 -#define SAT8(d, s) \  
815 - if (s != (uint8_t)s) { \  
816 - d = (s >> 15) ^ 0x7f; \  
817 - env->QF = 1; \  
818 - } else { \  
819 - d = s; \  
820 - }  
821 -  
822 - NEON_UNPACK(neon_s16, src, T0);  
823 - SAT8(dest.v1, src.v1);  
824 - SAT8(dest.v2, src.v2);  
825 - NEON_UNPACK(neon_s16, src, T1);  
826 - SAT8(dest.v3, src.v1);  
827 - SAT8(dest.v4, src.v2);  
828 - NEON_PACK(neon_s8, T0, dest);  
829 - FORCE_RET();  
830 -#undef SAT8  
831 -}  
832 -  
833 -NEON_OP(narrow_u16)  
834 -{  
835 - T0 = (T0 & 0xffff) | (T1 << 16);  
836 -}  
837 -  
838 -NEON_OP(narrow_sat_u16)  
839 -{  
840 - if (T0 > 0xffff) {  
841 - T0 = 0xffff;  
842 - env->QF = 1;  
843 - }  
844 - if (T1 > 0xffff) {  
845 - T1 = 0xffff;  
846 - env->QF = 1;  
847 - }  
848 - T0 |= T1 << 16;  
849 - FORCE_RET();  
850 -}  
851 -  
852 -NEON_OP(narrow_sat_s16)  
853 -{  
854 - if ((int32_t)T0 != (int16_t)T0) {  
855 - T0 = ((int32_t)T0 >> 31) ^ 0x7fff;  
856 - env->QF = 1;  
857 - }  
858 - if ((int32_t)T1 != (int16_t) T1) {  
859 - T1 = ((int32_t)T1 >> 31) ^ 0x7fff;  
860 - env->QF = 1;  
861 - }  
862 - T0 = (uint16_t)T0 | (T1 << 16);  
863 - FORCE_RET();  
864 -}  
865 -  
866 -NEON_OP(narrow_sat_u32)  
867 -{  
868 - if (T1) {  
869 - T0 = 0xffffffffu;  
870 - env->QF = 1;  
871 - }  
872 - FORCE_RET();  
873 -}  
874 -  
875 -NEON_OP(narrow_sat_s32)  
876 -{  
877 - int32_t sign = (int32_t)T1 >> 31;  
878 -  
879 - if ((int32_t)T1 != sign) {  
880 - T0 = sign ^ 0x7fffffff;  
881 - env->QF = 1;  
882 - }  
883 - FORCE_RET();  
884 -}  
885 -  
886 -/* Narrowing instructions. Named type is the narrow type. */  
887 -NEON_OP(narrow_high_u8)  
888 -{  
889 - T0 = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)  
890 - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);  
891 - FORCE_RET();  
892 -}  
893 -  
894 -NEON_OP(narrow_high_u16)  
895 -{  
896 - T0 = (T0 >> 16) | (T1 & 0xffff0000);  
897 - FORCE_RET();  
898 -}  
899 -  
900 -NEON_OP(narrow_high_round_u8)  
901 -{  
902 - T0 = (((T0 + 0x80) >> 8) & 0xff) | (((T0 + 0x800000) >> 16) & 0xff00)  
903 - | (((T1 + 0x80) << 8) & 0xff0000) | ((T1 + 0x800000) & 0xff000000);  
904 - FORCE_RET();  
905 -}  
906 -  
907 -NEON_OP(narrow_high_round_u16)  
908 -{  
909 - T0 = ((T0 + 0x8000) >> 16) | ((T1 + 0x8000) & 0xffff0000);  
910 - FORCE_RET();  
911 -}  
912 -  
913 -NEON_OP(narrow_high_round_u32)  
914 -{  
915 - if (T0 >= 0x80000000u)  
916 - T0 = T1 + 1;  
917 - else  
918 - T0 = T1;  
919 - FORCE_RET();  
920 -}  
921 -  
922 -/* Widening instructions. Named type is source type. */  
923 -NEON_OP(widen_s8)  
924 -{  
925 - uint32_t src;  
926 -  
927 - src = T0;  
928 - T0 = (uint16_t)(int8_t)src | ((int8_t)(src >> 8) << 16);  
929 - T1 = (uint16_t)(int8_t)(src >> 16) | ((int8_t)(src >> 24) << 16);  
930 -}  
931 -  
932 -NEON_OP(widen_u8)  
933 -{  
934 - T1 = ((T0 >> 8) & 0xff0000) | ((T0 >> 16) & 0xff);  
935 - T0 = ((T0 << 8) & 0xff0000) | (T0 & 0xff);  
936 -}  
937 -  
938 -NEON_OP(widen_s16)  
939 -{  
940 - int32_t src;  
941 -  
942 - src = T0;  
943 - T0 = (int16_t)src;  
944 - T1 = src >> 16;  
945 -}  
946 -  
947 -NEON_OP(widen_u16)  
948 -{  
949 - T1 = T0 >> 16;  
950 - T0 &= 0xffff;  
951 -}  
952 -  
953 -NEON_OP(widen_s32)  
954 -{  
955 - T1 = (int32_t)T0 >> 31;  
956 - FORCE_RET();  
957 -}  
958 -  
959 -NEON_OP(widen_high_u8)  
960 -{  
961 - T1 = (T0 & 0xff000000) | ((T0 >> 8) & 0xff00);  
962 - T0 = ((T0 << 16) & 0xff000000) | ((T0 << 8) & 0xff00);  
963 -}  
964 -  
965 -NEON_OP(widen_high_u16)  
966 -{  
967 - T1 = T0 & 0xffff0000;  
968 - T0 <<= 16;  
969 -}  
970 -  
971 -/* Long operations. The type is the wide type. */  
972 -NEON_OP(shll_u16)  
973 -{  
974 - int shift = PARAM1;  
975 - uint32_t mask;  
976 -  
977 - mask = 0xffff >> (16 - shift);  
978 - mask |= mask << 16;  
979 - mask = ~mask;  
980 -  
981 - T0 = (T0 << shift) & mask;  
982 - T1 = (T1 << shift) & mask;  
983 - FORCE_RET();  
984 -}  
985 -  
986 -NEON_OP(shll_u64)  
987 -{  
988 - int shift = PARAM1;  
989 -  
990 - T1 <<= shift;  
991 - T1 |= T0 >> (32 - shift);  
992 - T0 <<= shift;  
993 - FORCE_RET();  
994 -}  
995 -  
996 -NEON_OP(addl_u16)  
997 -{  
998 - uint32_t tmp;  
999 - uint32_t high;  
1000 -  
1001 - tmp = env->vfp.scratch[0];  
1002 - high = (T0 >> 16) + (tmp >> 16);  
1003 - T0 = (uint16_t)(T0 + tmp);  
1004 - T0 |= (high << 16);  
1005 - tmp = env->vfp.scratch[1];  
1006 - high = (T1 >> 16) + (tmp >> 16);  
1007 - T1 = (uint16_t)(T1 + tmp);  
1008 - T1 |= (high << 16);  
1009 - FORCE_RET();  
1010 -}  
1011 -  
1012 -NEON_OP(addl_u32)  
1013 -{  
1014 - T0 += env->vfp.scratch[0];  
1015 - T1 += env->vfp.scratch[1];  
1016 - FORCE_RET();  
1017 -}  
1018 -  
1019 -NEON_OP(addl_u64)  
1020 -{  
1021 - uint64_t tmp;  
1022 - tmp = T0 | ((uint64_t)T1 << 32);  
1023 - tmp += env->vfp.scratch[0];  
1024 - tmp += (uint64_t)env->vfp.scratch[1] << 32;  
1025 - T0 = tmp;  
1026 - T1 = tmp >> 32;  
1027 - FORCE_RET();  
1028 -}  
1029 -  
1030 -NEON_OP(subl_u16)  
1031 -{  
1032 - uint32_t tmp;  
1033 - uint32_t high;  
1034 -  
1035 - tmp = env->vfp.scratch[0];  
1036 - high = (T0 >> 16) - (tmp >> 16);  
1037 - T0 = (uint16_t)(T0 - tmp);  
1038 - T0 |= (high << 16);  
1039 - tmp = env->vfp.scratch[1];  
1040 - high = (T1 >> 16) - (tmp >> 16);  
1041 - T1 = (uint16_t)(T1 - tmp);  
1042 - T1 |= (high << 16);  
1043 - FORCE_RET();  
1044 -}  
1045 -  
1046 -NEON_OP(subl_u32)  
1047 -{  
1048 - T0 -= env->vfp.scratch[0];  
1049 - T1 -= env->vfp.scratch[1];  
1050 - FORCE_RET();  
1051 -}  
1052 -  
1053 -NEON_OP(subl_u64)  
1054 -{  
1055 - uint64_t tmp;  
1056 - tmp = T0 | ((uint64_t)T1 << 32);  
1057 - tmp -= env->vfp.scratch[0];  
1058 - tmp -= (uint64_t)env->vfp.scratch[1] << 32;  
1059 - T0 = tmp;  
1060 - T1 = tmp >> 32;  
1061 - FORCE_RET();  
1062 -}  
1063 -  
1064 -#define DO_ABD(dest, x, y, type) do { \  
1065 - type tmp_x = x; \  
1066 - type tmp_y = y; \  
1067 - dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \  
1068 - } while(0)  
1069 -  
1070 -NEON_OP(abdl_u16)  
1071 -{  
1072 - uint32_t tmp;  
1073 - uint32_t low;  
1074 - uint32_t high;  
1075 -  
1076 - DO_ABD(low, T0, T1, uint8_t);  
1077 - DO_ABD(tmp, T0 >> 8, T1 >> 8, uint8_t);  
1078 - low |= tmp << 16;  
1079 - DO_ABD(high, T0 >> 16, T1 >> 16, uint8_t);  
1080 - DO_ABD(tmp, T0 >> 24, T1 >> 24, uint8_t);  
1081 - high |= tmp << 16;  
1082 - T0 = low;  
1083 - T1 = high;  
1084 - FORCE_RET();  
1085 -}  
1086 -  
1087 -NEON_OP(abdl_s16)  
1088 -{  
1089 - uint32_t tmp;  
1090 - uint32_t low;  
1091 - uint32_t high;  
1092 -  
1093 - DO_ABD(low, T0, T1, int8_t);  
1094 - DO_ABD(tmp, T0 >> 8, T1 >> 8, int8_t);  
1095 - low |= tmp << 16;  
1096 - DO_ABD(high, T0 >> 16, T1 >> 16, int8_t);  
1097 - DO_ABD(tmp, T0 >> 24, T1 >> 24, int8_t);  
1098 - high |= tmp << 16;  
1099 - T0 = low;  
1100 - T1 = high;  
1101 - FORCE_RET();  
1102 -}  
1103 -  
1104 -NEON_OP(abdl_u32)  
1105 -{  
1106 - uint32_t low;  
1107 - uint32_t high;  
1108 -  
1109 - DO_ABD(low, T0, T1, uint16_t);  
1110 - DO_ABD(high, T0 >> 16, T1 >> 16, uint16_t);  
1111 - T0 = low;  
1112 - T1 = high;  
1113 - FORCE_RET();  
1114 -}  
1115 -  
1116 -NEON_OP(abdl_s32)  
1117 -{  
1118 - uint32_t low;  
1119 - uint32_t high;  
1120 -  
1121 - DO_ABD(low, T0, T1, int16_t);  
1122 - DO_ABD(high, T0 >> 16, T1 >> 16, int16_t);  
1123 - T0 = low;  
1124 - T1 = high;  
1125 - FORCE_RET();  
1126 -}  
1127 -  
1128 -NEON_OP(abdl_u64)  
1129 -{  
1130 - DO_ABD(T0, T0, T1, uint32_t);  
1131 - T1 = 0;  
1132 -}  
1133 -  
1134 -NEON_OP(abdl_s64)  
1135 -{  
1136 - DO_ABD(T0, T0, T1, int32_t);  
1137 - T1 = 0;  
1138 -}  
1139 -#undef DO_ABD  
1140 -  
1141 -/* Widening multiple. Named type is the source type. */  
1142 -#define DO_MULL(dest, x, y, type1, type2) do { \  
1143 - type1 tmp_x = x; \  
1144 - type1 tmp_y = y; \  
1145 - dest = (type2)((type2)tmp_x * (type2)tmp_y); \  
1146 - } while(0)  
1147 -  
1148 -NEON_OP(mull_u8)  
1149 -{  
1150 - uint32_t tmp;  
1151 - uint32_t low;  
1152 - uint32_t high;  
1153 -  
1154 - DO_MULL(low, T0, T1, uint8_t, uint16_t);  
1155 - DO_MULL(tmp, T0 >> 8, T1 >> 8, uint8_t, uint16_t);  
1156 - low |= tmp << 16;  
1157 - DO_MULL(high, T0 >> 16, T1 >> 16, uint8_t, uint16_t);  
1158 - DO_MULL(tmp, T0 >> 24, T1 >> 24, uint8_t, uint16_t);  
1159 - high |= tmp << 16;  
1160 - T0 = low;  
1161 - T1 = high;  
1162 - FORCE_RET();  
1163 -}  
1164 -  
1165 -NEON_OP(mull_s8)  
1166 -{  
1167 - uint32_t tmp;  
1168 - uint32_t low;  
1169 - uint32_t high;  
1170 -  
1171 - DO_MULL(low, T0, T1, int8_t, uint16_t);  
1172 - DO_MULL(tmp, T0 >> 8, T1 >> 8, int8_t, uint16_t);  
1173 - low |= tmp << 16;  
1174 - DO_MULL(high, T0 >> 16, T1 >> 16, int8_t, uint16_t);  
1175 - DO_MULL(tmp, T0 >> 24, T1 >> 24, int8_t, uint16_t);  
1176 - high |= tmp << 16;  
1177 - T0 = low;  
1178 - T1 = high;  
1179 - FORCE_RET();  
1180 -}  
1181 -  
1182 -NEON_OP(mull_u16)  
1183 -{  
1184 - uint32_t low;  
1185 - uint32_t high;  
1186 -  
1187 - DO_MULL(low, T0, T1, uint16_t, uint32_t);  
1188 - DO_MULL(high, T0 >> 16, T1 >> 16, uint16_t, uint32_t);  
1189 - T0 = low;  
1190 - T1 = high;  
1191 - FORCE_RET();  
1192 -}  
1193 -  
1194 -NEON_OP(mull_s16)  
1195 -{  
1196 - uint32_t low;  
1197 - uint32_t high;  
1198 -  
1199 - DO_MULL(low, T0, T1, int16_t, uint32_t);  
1200 - DO_MULL(high, T0 >> 16, T1 >> 16, int16_t, uint32_t);  
1201 - T0 = low;  
1202 - T1 = high;  
1203 - FORCE_RET();  
1204 -}  
1205 -  
1206 -NEON_OP(addl_saturate_s32)  
1207 -{  
1208 - uint32_t tmp;  
1209 - uint32_t res;  
1210 -  
1211 - tmp = env->vfp.scratch[0];  
1212 - res = T0 + tmp;  
1213 - if (((res ^ T0) & SIGNBIT) && !((T0 ^ tmp) & SIGNBIT)) {  
1214 - env->QF = 1;  
1215 - T0 = (T0 >> 31) ^ 0x7fffffff;  
1216 - } else {  
1217 - T0 = res;  
1218 - }  
1219 - tmp = env->vfp.scratch[1];  
1220 - res = T1 + tmp;  
1221 - if (((res ^ T1) & SIGNBIT) && !((T1 ^ tmp) & SIGNBIT)) {  
1222 - env->QF = 1;  
1223 - T1 = (T1 >> 31) ^ 0x7fffffff;  
1224 - } else {  
1225 - T1 = res;  
1226 - }  
1227 - FORCE_RET();  
1228 -}  
1229 -  
1230 -NEON_OP(addl_saturate_s64)  
1231 -{  
1232 - uint64_t src1;  
1233 - uint64_t src2;  
1234 - uint64_t res;  
1235 -  
1236 - src1 = T0 + ((uint64_t)T1 << 32);  
1237 - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);  
1238 - res = src1 + src2;  
1239 - if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {  
1240 - env->QF = 1;  
1241 - T0 = ~(int64_t)src1 >> 63;  
1242 - T1 = T0 ^ 0x80000000;  
1243 - } else {  
1244 - T0 = res;  
1245 - T1 = res >> 32;  
1246 - }  
1247 - FORCE_RET();  
1248 -}  
1249 -  
1250 -NEON_OP(addl_saturate_u64)  
1251 -{  
1252 - uint64_t src1;  
1253 - uint64_t src2;  
1254 - uint64_t res;  
1255 -  
1256 - src1 = T0 + ((uint64_t)T1 << 32);  
1257 - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);  
1258 - res = src1 + src2;  
1259 - if (res < src1) {  
1260 - env->QF = 1;  
1261 - T0 = 0xffffffff;  
1262 - T1 = 0xffffffff;  
1263 - } else {  
1264 - T0 = res;  
1265 - T1 = res >> 32;  
1266 - }  
1267 - FORCE_RET();  
1268 -}  
1269 -  
1270 -NEON_OP(subl_saturate_s64)  
1271 -{  
1272 - uint64_t src1;  
1273 - uint64_t src2;  
1274 - uint64_t res;  
1275 -  
1276 - src1 = T0 + ((uint64_t)T1 << 32);  
1277 - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);  
1278 - res = src1 - src2;  
1279 - if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {  
1280 - env->QF = 1;  
1281 - T0 = ~(int64_t)src1 >> 63;  
1282 - T1 = T0 ^ 0x80000000;  
1283 - } else {  
1284 - T0 = res;  
1285 - T1 = res >> 32;  
1286 - }  
1287 - FORCE_RET();  
1288 -}  
1289 -  
1290 -NEON_OP(subl_saturate_u64)  
1291 -{  
1292 - uint64_t src1;  
1293 - uint64_t src2;  
1294 - uint64_t res;  
1295 -  
1296 - src1 = T0 + ((uint64_t)T1 << 32);  
1297 - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);  
1298 - if (src1 < src2) {  
1299 - env->QF = 1;  
1300 - T0 = 0;  
1301 - T1 = 0;  
1302 - } else {  
1303 - res = src1 - src2;  
1304 - T0 = res;  
1305 - T1 = res >> 32;  
1306 - }  
1307 - FORCE_RET();  
1308 -}  
1309 -  
1310 -NEON_OP(negl_u16)  
1311 -{  
1312 - uint32_t tmp;  
1313 - tmp = T0 >> 16;  
1314 - tmp = -tmp;  
1315 - T0 = (-T0 & 0xffff) | (tmp << 16);  
1316 - tmp = T1 >> 16;  
1317 - tmp = -tmp;  
1318 - T1 = (-T1 & 0xffff) | (tmp << 16);  
1319 - FORCE_RET();  
1320 -}  
1321 -  
1322 -NEON_OP(negl_u32)  
1323 -{  
1324 - T0 = -T0;  
1325 - T1 = -T1;  
1326 - FORCE_RET();  
1327 -}  
1328 -  
1329 -NEON_OP(negl_u64)  
1330 -{  
1331 - uint64_t val;  
1332 -  
1333 - val = T0 | ((uint64_t)T1 << 32);  
1334 - val = -val;  
1335 - T0 = val;  
1336 - T1 = val >> 32;  
1337 - FORCE_RET();  
1338 -}  
1339 -  
1340 -/* Scalar operations. */  
1341 -NEON_OP(dup_low16)  
1342 -{  
1343 - T0 = (T0 & 0xffff) | (T0 << 16);  
1344 - FORCE_RET();  
1345 -}  
1346 -  
1347 -NEON_OP(dup_high16)  
1348 -{  
1349 - T0 = (T0 >> 16) | (T0 & 0xffff0000);  
1350 - FORCE_RET();  
1351 -}  
1352 -  
1353 -/* Helper for VEXT */  
1354 -NEON_OP(extract)  
1355 -{  
1356 - int shift = PARAM1;  
1357 - T0 = (T0 >> shift) | (T1 << (32 - shift));  
1358 - FORCE_RET();  
1359 -}  
1360 -  
1361 -/* Pairwise add long. Named type is source type. */  
1362 -NEON_OP(paddl_s8)  
1363 -{  
1364 - int8_t src1;  
1365 - int8_t src2;  
1366 - uint16_t result;  
1367 - src1 = T0 >> 24;  
1368 - src2 = T0 >> 16;  
1369 - result = (uint16_t)src1 + src2;  
1370 - src1 = T0 >> 8;  
1371 - src2 = T0;  
1372 - T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);  
1373 - FORCE_RET();  
1374 -}  
1375 -  
1376 -NEON_OP(paddl_u8)  
1377 -{  
1378 - uint8_t src1;  
1379 - uint8_t src2;  
1380 - uint16_t result;  
1381 - src1 = T0 >> 24;  
1382 - src2 = T0 >> 16;  
1383 - result = (uint16_t)src1 + src2;  
1384 - src1 = T0 >> 8;  
1385 - src2 = T0;  
1386 - T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);  
1387 - FORCE_RET();  
1388 -}  
1389 -  
1390 -NEON_OP(paddl_s16)  
1391 -{  
1392 - T0 = (uint32_t)(int16_t)T0 + (uint32_t)(int16_t)(T0 >> 16);  
1393 - FORCE_RET();  
1394 -}  
1395 -  
1396 -NEON_OP(paddl_u16)  
1397 -{  
1398 - T0 = (uint32_t)(uint16_t)T0 + (uint32_t)(uint16_t)(T0 >> 16);  
1399 - FORCE_RET();  
1400 -}  
1401 -  
1402 -NEON_OP(paddl_s32)  
1403 -{  
1404 - int64_t tmp;  
1405 - tmp = (int64_t)(int32_t)T0 + (int64_t)(int32_t)T1;  
1406 - T0 = tmp;  
1407 - T1 = tmp >> 32;  
1408 - FORCE_RET();  
1409 -}  
1410 -  
1411 -NEON_OP(paddl_u32)  
1412 -{  
1413 - uint64_t tmp;  
1414 - tmp = (uint64_t)T0 + (uint64_t)T1;  
1415 - T0 = tmp;  
1416 - T1 = tmp >> 32;  
1417 - FORCE_RET();  
1418 -}  
1419 -  
1420 -/* Count Leading Sign/Zero Bits. */  
1421 -static inline int do_clz8(uint8_t x)  
1422 -{  
1423 - int n;  
1424 - for (n = 8; x; n--)  
1425 - x >>= 1;  
1426 - return n;  
1427 -}  
1428 -  
1429 -static inline int do_clz16(uint16_t x)  
1430 -{  
1431 - int n;  
1432 - for (n = 16; x; n--)  
1433 - x >>= 1;  
1434 - return n;  
1435 -}  
1436 -  
1437 -NEON_OP(clz_u8)  
1438 -{  
1439 - uint32_t result;  
1440 - uint32_t tmp;  
1441 -  
1442 - tmp = T0;  
1443 - result = do_clz8(tmp);  
1444 - result |= do_clz8(tmp >> 8) << 8;  
1445 - result |= do_clz8(tmp >> 16) << 16;  
1446 - result |= do_clz8(tmp >> 24) << 24;  
1447 - T0 = result;  
1448 - FORCE_RET();  
1449 -}  
1450 -  
1451 -NEON_OP(clz_u16)  
1452 -{  
1453 - uint32_t result;  
1454 - uint32_t tmp;  
1455 - tmp = T0;  
1456 - result = do_clz16(tmp);  
1457 - result |= do_clz16(tmp >> 16) << 16;  
1458 - T0 = result;  
1459 - FORCE_RET();  
1460 -}  
1461 -  
1462 -NEON_OP(cls_s8)  
1463 -{  
1464 - uint32_t result;  
1465 - int8_t tmp;  
1466 - tmp = T0;  
1467 - result = do_clz8((tmp < 0) ? ~tmp : tmp) - 1;  
1468 - tmp = T0 >> 8;  
1469 - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 8;  
1470 - tmp = T0 >> 16;  
1471 - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 16;  
1472 - tmp = T0 >> 24;  
1473 - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 24;  
1474 - T0 = result;  
1475 - FORCE_RET();  
1476 -}  
1477 -  
1478 -NEON_OP(cls_s16)  
1479 -{  
1480 - uint32_t result;  
1481 - int16_t tmp;  
1482 - tmp = T0;  
1483 - result = do_clz16((tmp < 0) ? ~tmp : tmp) - 1;  
1484 - tmp = T0 >> 16;  
1485 - result |= (do_clz16((tmp < 0) ? ~tmp : tmp) - 1) << 16;  
1486 - T0 = result;  
1487 - FORCE_RET();  
1488 -}  
1489 -  
1490 -NEON_OP(cls_s32)  
1491 -{  
1492 - int count;  
1493 - if ((int32_t)T0 < 0)  
1494 - T0 = ~T0;  
1495 - for (count = 32; T0 > 0; count--)  
1496 - T0 = T0 >> 1;  
1497 - T0 = count - 1;  
1498 - FORCE_RET();  
1499 -}  
1500 -  
1501 -/* Bit count. */  
1502 -NEON_OP(cnt_u8)  
1503 -{  
1504 - T0 = (T0 & 0x55555555) + ((T0 >> 1) & 0x55555555);  
1505 - T0 = (T0 & 0x33333333) + ((T0 >> 2) & 0x33333333);  
1506 - T0 = (T0 & 0x0f0f0f0f) + ((T0 >> 4) & 0x0f0f0f0f);  
1507 - FORCE_RET();  
1508 -}  
1509 -  
1510 -/* Saturnating negation. */  
1511 -/* ??? Make these use NEON_VOP1 */  
1512 -#define DO_QABS8(x) do { \  
1513 - if (x == (int8_t)0x80) { \  
1514 - x = 0x7f; \  
1515 - env->QF = 1; \  
1516 - } else if (x < 0) { \  
1517 - x = -x; \  
1518 - }} while (0)  
1519 -NEON_OP(qabs_s8)  
1520 -{  
1521 - neon_s8 vec;  
1522 - NEON_UNPACK(neon_s8, vec, T0);  
1523 - DO_QABS8(vec.v1);  
1524 - DO_QABS8(vec.v2);  
1525 - DO_QABS8(vec.v3);  
1526 - DO_QABS8(vec.v4);  
1527 - NEON_PACK(neon_s8, T0, vec);  
1528 - FORCE_RET();  
1529 -}  
1530 -#undef DO_QABS8  
1531 -  
1532 -#define DO_QNEG8(x) do { \  
1533 - if (x == (int8_t)0x80) { \  
1534 - x = 0x7f; \  
1535 - env->QF = 1; \  
1536 - } else { \  
1537 - x = -x; \  
1538 - }} while (0)  
1539 -NEON_OP(qneg_s8)  
1540 -{  
1541 - neon_s8 vec;  
1542 - NEON_UNPACK(neon_s8, vec, T0);  
1543 - DO_QNEG8(vec.v1);  
1544 - DO_QNEG8(vec.v2);  
1545 - DO_QNEG8(vec.v3);  
1546 - DO_QNEG8(vec.v4);  
1547 - NEON_PACK(neon_s8, T0, vec);  
1548 - FORCE_RET();  
1549 -}  
1550 -#undef DO_QNEG8  
1551 -  
1552 -#define DO_QABS16(x) do { \  
1553 - if (x == (int16_t)0x8000) { \  
1554 - x = 0x7fff; \  
1555 - env->QF = 1; \  
1556 - } else if (x < 0) { \  
1557 - x = -x; \  
1558 - }} while (0)  
1559 -NEON_OP(qabs_s16)  
1560 -{  
1561 - neon_s16 vec;  
1562 - NEON_UNPACK(neon_s16, vec, T0);  
1563 - DO_QABS16(vec.v1);  
1564 - DO_QABS16(vec.v2);  
1565 - NEON_PACK(neon_s16, T0, vec);  
1566 - FORCE_RET();  
1567 -}  
1568 -#undef DO_QABS16  
1569 -  
1570 -#define DO_QNEG16(x) do { \  
1571 - if (x == (int16_t)0x8000) { \  
1572 - x = 0x7fff; \  
1573 - env->QF = 1; \  
1574 - } else { \  
1575 - x = -x; \  
1576 - }} while (0)  
1577 -NEON_OP(qneg_s16)  
1578 -{  
1579 - neon_s16 vec;  
1580 - NEON_UNPACK(neon_s16, vec, T0);  
1581 - DO_QNEG16(vec.v1);  
1582 - DO_QNEG16(vec.v2);  
1583 - NEON_PACK(neon_s16, T0, vec);  
1584 - FORCE_RET();  
1585 -}  
1586 -#undef DO_QNEG16  
1587 -  
1588 -NEON_OP(qabs_s32)  
1589 -{  
1590 - if (T0 == 0x80000000) {  
1591 - T0 = 0x7fffffff;  
1592 - env->QF = 1;  
1593 - } else if ((int32_t)T0 < 0) {  
1594 - T0 = -T0;  
1595 - }  
1596 - FORCE_RET();  
1597 -}  
1598 -  
1599 -NEON_OP(qneg_s32)  
1600 -{  
1601 - if (T0 == 0x80000000) {  
1602 - T0 = 0x7fffffff;  
1603 - env->QF = 1;  
1604 - } else {  
1605 - T0 = -T0;  
1606 - }  
1607 - FORCE_RET();  
1608 -}  
1609 -  
1610 -/* Unary opperations */  
1611 -#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src  
1612 -NEON_VOP1(abs_s8, neon_s8, 4)  
1613 -NEON_VOP1(abs_s16, neon_s16, 2)  
1614 -NEON_OP(abs_s32)  
1615 -{  
1616 - if ((int32_t)T0 < 0)  
1617 - T0 = -T0;  
1618 - FORCE_RET();  
1619 -}  
1620 -#undef NEON_FN  
1621 -  
1622 -/* Transpose. Argument order is rather strange to avoid special casing  
1623 - the tranlation code.  
1624 - On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */  
1625 -NEON_OP(trn_u8)  
1626 -{  
1627 - uint32_t rd;  
1628 - uint32_t rm;  
1629 - rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);  
1630 - rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);  
1631 - T0 = rd;  
1632 - T1 = rm;  
1633 - FORCE_RET();  
1634 -}  
1635 -  
1636 -NEON_OP(trn_u16)  
1637 -{  
1638 - uint32_t rd;  
1639 - uint32_t rm;  
1640 - rd = (T0 << 16) | (T1 & 0xffff);  
1641 - rm = (T1 >> 16) | (T0 & 0xffff0000);  
1642 - T0 = rd;  
1643 - T1 = rm;  
1644 - FORCE_RET();  
1645 -}  
1646 -  
1647 -/* Worker routines for zip and unzip. */  
1648 -NEON_OP(unzip_u8)  
1649 -{  
1650 - uint32_t rd;  
1651 - uint32_t rm;  
1652 - rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)  
1653 - | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);  
1654 - rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)  
1655 - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);  
1656 - T0 = rd;  
1657 - T1 = rm;  
1658 - FORCE_RET();  
1659 -}  
1660 -  
1661 -NEON_OP(zip_u8)  
1662 -{  
1663 - uint32_t rd;  
1664 - uint32_t rm;  
1665 - rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)  
1666 - | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);  
1667 - rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)  
1668 - | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);  
1669 - T0 = rd;  
1670 - T1 = rm;  
1671 - FORCE_RET();  
1672 -}  
1673 -  
1674 -NEON_OP(zip_u16)  
1675 -{  
1676 - uint32_t tmp;  
1677 -  
1678 - tmp = (T0 & 0xffff) | (T1 << 16);  
1679 - T1 = (T1 & 0xffff0000) | (T0 >> 16);  
1680 - T0 = tmp;  
1681 - FORCE_RET();  
1682 -}  
1683 -  
1684 -NEON_OP(dup_u8)  
1685 -{  
1686 - T0 = (T0 >> PARAM1) & 0xff;  
1687 - T0 |= T0 << 8;  
1688 - T0 |= T0 << 16;  
1689 - FORCE_RET();  
1690 -}  
target-arm/translate.c
@@ -77,6 +77,9 @@ extern FILE *logfile; @@ -77,6 +77,9 @@ extern FILE *logfile;
77 extern int loglevel; 77 extern int loglevel;
78 78
79 static TCGv cpu_env; 79 static TCGv cpu_env;
  80 +/* We reuse the same 64-bit temporaries for efficiency. */
  81 +static TCGv cpu_V0, cpu_V1;
  82 +
80 /* FIXME: These should be removed. */ 83 /* FIXME: These should be removed. */
81 static TCGv cpu_T[2]; 84 static TCGv cpu_T[2];
82 static TCGv cpu_F0s, cpu_F1s, cpu_F0d, cpu_F1d; 85 static TCGv cpu_F0s, cpu_F1s, cpu_F0d, cpu_F1d;
@@ -469,6 +472,9 @@ static inline void gen_op_bicl_T0_T1(void) @@ -469,6 +472,9 @@ static inline void gen_op_bicl_T0_T1(void)
469 } 472 }
470 473
471 /* FIXME: Implement this natively. */ 474 /* FIXME: Implement this natively. */
  475 +#define tcg_gen_abs_i32(t0, t1) gen_helper_abs(t0, t1)
  476 +
  477 +/* FIXME: Implement this natively. */
472 static void tcg_gen_rori_i32(TCGv t0, TCGv t1, int i) 478 static void tcg_gen_rori_i32(TCGv t0, TCGv t1, int i)
473 { 479 {
474 TCGv tmp; 480 TCGv tmp;
@@ -1166,8 +1172,13 @@ neon_reg_offset (int reg, int n) @@ -1166,8 +1172,13 @@ neon_reg_offset (int reg, int n)
1166 return vfp_reg_offset(0, sreg); 1172 return vfp_reg_offset(0, sreg);
1167 } 1173 }
1168 1174
1169 -#define NEON_GET_REG(T, reg, n) gen_op_neon_getreg_##T(neon_reg_offset(reg, n))  
1170 -#define NEON_SET_REG(T, reg, n) gen_op_neon_setreg_##T(neon_reg_offset(reg, n)) 1175 +/* FIXME: Remove these. */
  1176 +#define neon_T0 cpu_T[0]
  1177 +#define neon_T1 cpu_T[1]
  1178 +#define NEON_GET_REG(T, reg, n) \
  1179 + tcg_gen_ld_i32(neon_##T, cpu_env, neon_reg_offset(reg, n))
  1180 +#define NEON_SET_REG(T, reg, n) \
  1181 + tcg_gen_st_i32(neon_##T, cpu_env, neon_reg_offset(reg, n))
1171 1182
1172 static TCGv neon_load_reg(int reg, int pass) 1183 static TCGv neon_load_reg(int reg, int pass)
1173 { 1184 {
@@ -1182,6 +1193,16 @@ static void neon_store_reg(int reg, int pass, TCGv var) @@ -1182,6 +1193,16 @@ static void neon_store_reg(int reg, int pass, TCGv var)
1182 dead_tmp(var); 1193 dead_tmp(var);
1183 } 1194 }
1184 1195
  1196 +static inline void neon_load_reg64(TCGv var, int reg)
  1197 +{
  1198 + tcg_gen_ld_i64(var, cpu_env, vfp_reg_offset(1, reg));
  1199 +}
  1200 +
  1201 +static inline void neon_store_reg64(TCGv var, int reg)
  1202 +{
  1203 + tcg_gen_st_i64(var, cpu_env, vfp_reg_offset(1, reg));
  1204 +}
  1205 +
1185 #define tcg_gen_ld_f32 tcg_gen_ld_i32 1206 #define tcg_gen_ld_f32 tcg_gen_ld_i32
1186 #define tcg_gen_ld_f64 tcg_gen_ld_i64 1207 #define tcg_gen_ld_f64 tcg_gen_ld_i64
1187 #define tcg_gen_st_f32 tcg_gen_st_i32 1208 #define tcg_gen_st_f32 tcg_gen_st_i32
@@ -2418,6 +2439,37 @@ vfp_enabled(CPUState * env) @@ -2418,6 +2439,37 @@ vfp_enabled(CPUState * env)
2418 return ((env->vfp.xregs[ARM_VFP_FPEXC] & (1 << 30)) != 0); 2439 return ((env->vfp.xregs[ARM_VFP_FPEXC] & (1 << 30)) != 0);
2419 } 2440 }
2420 2441
  2442 +static void gen_neon_dup_u8(TCGv var, int shift)
  2443 +{
  2444 + TCGv tmp = new_tmp();
  2445 + if (shift)
  2446 + tcg_gen_shri_i32(var, var, shift);
  2447 + tcg_gen_andi_i32(var, var, 0xff);
  2448 + tcg_gen_shli_i32(tmp, var, 8);
  2449 + tcg_gen_or_i32(var, var, tmp);
  2450 + tcg_gen_shli_i32(tmp, var, 16);
  2451 + tcg_gen_or_i32(var, var, tmp);
  2452 + dead_tmp(tmp);
  2453 +}
  2454 +
  2455 +static void gen_neon_dup_low16(TCGv var)
  2456 +{
  2457 + TCGv tmp = new_tmp();
  2458 + tcg_gen_andi_i32(var, var, 0xffff);
  2459 + tcg_gen_shli_i32(tmp, var, 16);
  2460 + tcg_gen_or_i32(var, var, tmp);
  2461 + dead_tmp(tmp);
  2462 +}
  2463 +
  2464 +static void gen_neon_dup_high16(TCGv var)
  2465 +{
  2466 + TCGv tmp = new_tmp();
  2467 + tcg_gen_andi_i32(var, var, 0xffff0000);
  2468 + tcg_gen_shri_i32(tmp, var, 16);
  2469 + tcg_gen_or_i32(var, var, tmp);
  2470 + dead_tmp(tmp);
  2471 +}
  2472 +
2421 /* Disassemble a VFP instruction. Returns nonzero if an error occured 2473 /* Disassemble a VFP instruction. Returns nonzero if an error occured
2422 (ie. an undefined instruction). */ 2474 (ie. an undefined instruction). */
2423 static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn) 2475 static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn)
@@ -2425,6 +2477,7 @@ static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -2425,6 +2477,7 @@ static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn)
2425 uint32_t rd, rn, rm, op, i, n, offset, delta_d, delta_m, bank_mask; 2477 uint32_t rd, rn, rm, op, i, n, offset, delta_d, delta_m, bank_mask;
2426 int dp, veclen; 2478 int dp, veclen;
2427 TCGv tmp; 2479 TCGv tmp;
  2480 + TCGv tmp2;
2428 2481
2429 if (!arm_feature(env, ARM_FEATURE_VFP)) 2482 if (!arm_feature(env, ARM_FEATURE_VFP))
2430 return 1; 2483 return 1;
@@ -2468,66 +2521,66 @@ static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -2468,66 +2521,66 @@ static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn)
2468 } 2521 }
2469 if (insn & ARM_CP_RW_BIT) { 2522 if (insn & ARM_CP_RW_BIT) {
2470 /* vfp->arm */ 2523 /* vfp->arm */
  2524 + tmp = neon_load_reg(rn, pass);
2471 switch (size) { 2525 switch (size) {
2472 case 0: 2526 case 0:
2473 - NEON_GET_REG(T1, rn, pass);  
2474 if (offset) 2527 if (offset)
2475 - gen_op_shrl_T1_im(offset); 2528 + tcg_gen_shri_i32(tmp, tmp, offset);
2476 if (insn & (1 << 23)) 2529 if (insn & (1 << 23))
2477 - gen_uxtb(cpu_T[1]); 2530 + gen_uxtb(tmp);
2478 else 2531 else
2479 - gen_sxtb(cpu_T[1]); 2532 + gen_sxtb(tmp);
2480 break; 2533 break;
2481 case 1: 2534 case 1:
2482 - NEON_GET_REG(T1, rn, pass);  
2483 if (insn & (1 << 23)) { 2535 if (insn & (1 << 23)) {
2484 if (offset) { 2536 if (offset) {
2485 - gen_op_shrl_T1_im(16); 2537 + tcg_gen_shri_i32(tmp, tmp, 16);
2486 } else { 2538 } else {
2487 - gen_uxth(cpu_T[1]); 2539 + gen_uxth(tmp);
2488 } 2540 }
2489 } else { 2541 } else {
2490 if (offset) { 2542 if (offset) {
2491 - gen_op_sarl_T1_im(16); 2543 + tcg_gen_sari_i32(tmp, tmp, 16);
2492 } else { 2544 } else {
2493 - gen_sxth(cpu_T[1]); 2545 + gen_sxth(tmp);
2494 } 2546 }
2495 } 2547 }
2496 break; 2548 break;
2497 case 2: 2549 case 2:
2498 - NEON_GET_REG(T1, rn, pass);  
2499 break; 2550 break;
2500 } 2551 }
2501 - gen_movl_reg_T1(s, rd); 2552 + store_reg(s, rd, tmp);
2502 } else { 2553 } else {
2503 /* arm->vfp */ 2554 /* arm->vfp */
2504 - gen_movl_T0_reg(s, rd); 2555 + tmp = load_reg(s, rd);
2505 if (insn & (1 << 23)) { 2556 if (insn & (1 << 23)) {
2506 /* VDUP */ 2557 /* VDUP */
2507 if (size == 0) { 2558 if (size == 0) {
2508 - gen_op_neon_dup_u8(0); 2559 + gen_neon_dup_u8(tmp, 0);
2509 } else if (size == 1) { 2560 } else if (size == 1) {
2510 - gen_op_neon_dup_low16(); 2561 + gen_neon_dup_low16(tmp);
2511 } 2562 }
2512 - NEON_SET_REG(T0, rn, 0);  
2513 - NEON_SET_REG(T0, rn, 1); 2563 + tmp2 = new_tmp();
  2564 + tcg_gen_mov_i32(tmp2, tmp);
  2565 + neon_store_reg(rn, 0, tmp2);
  2566 + neon_store_reg(rn, 0, tmp);
2514 } else { 2567 } else {
2515 /* VMOV */ 2568 /* VMOV */
2516 switch (size) { 2569 switch (size) {
2517 case 0: 2570 case 0:
2518 - tmp = neon_load_reg(rn, pass);  
2519 - gen_bfi(tmp, tmp, cpu_T[0], offset, 0xff);  
2520 - neon_store_reg(rn, pass, tmp); 2571 + tmp2 = neon_load_reg(rn, pass);
  2572 + gen_bfi(tmp, tmp2, tmp, offset, 0xff);
  2573 + dead_tmp(tmp2);
2521 break; 2574 break;
2522 case 1: 2575 case 1:
2523 - tmp = neon_load_reg(rn, pass);  
2524 - gen_bfi(tmp, tmp, cpu_T[0], offset, 0xffff);  
2525 - neon_store_reg(rn, pass, tmp); 2576 + tmp2 = neon_load_reg(rn, pass);
  2577 + gen_bfi(tmp, tmp2, tmp, offset, 0xffff);
  2578 + dead_tmp(tmp2);
2526 break; 2579 break;
2527 case 2: 2580 case 2:
2528 - NEON_SET_REG(T0, rn, pass);  
2529 break; 2581 break;
2530 } 2582 }
  2583 + neon_store_reg(rn, pass, tmp);
2531 } 2584 }
2532 } 2585 }
2533 } else { /* !dp */ 2586 } else { /* !dp */
@@ -3210,179 +3263,90 @@ static void gen_nop_hint(DisasContext *s, int val) @@ -3210,179 +3263,90 @@ static void gen_nop_hint(DisasContext *s, int val)
3210 } 3263 }
3211 } 3264 }
3212 3265
3213 -/* Neon shift by constant. The actual ops are the same as used for variable  
3214 - shifts. [OP][U][SIZE] */  
3215 -static GenOpFunc *gen_neon_shift_im[8][2][4] = {  
3216 - { /* 0 */ /* VSHR */  
3217 - {  
3218 - gen_op_neon_shl_u8,  
3219 - gen_op_neon_shl_u16,  
3220 - gen_op_neon_shl_u32,  
3221 - gen_op_neon_shl_u64  
3222 - }, {  
3223 - gen_op_neon_shl_s8,  
3224 - gen_op_neon_shl_s16,  
3225 - gen_op_neon_shl_s32,  
3226 - gen_op_neon_shl_s64  
3227 - }  
3228 - }, { /* 1 */ /* VSRA */  
3229 - {  
3230 - gen_op_neon_shl_u8,  
3231 - gen_op_neon_shl_u16,  
3232 - gen_op_neon_shl_u32,  
3233 - gen_op_neon_shl_u64  
3234 - }, {  
3235 - gen_op_neon_shl_s8,  
3236 - gen_op_neon_shl_s16,  
3237 - gen_op_neon_shl_s32,  
3238 - gen_op_neon_shl_s64  
3239 - }  
3240 - }, { /* 2 */ /* VRSHR */  
3241 - {  
3242 - gen_op_neon_rshl_u8,  
3243 - gen_op_neon_rshl_u16,  
3244 - gen_op_neon_rshl_u32,  
3245 - gen_op_neon_rshl_u64  
3246 - }, {  
3247 - gen_op_neon_rshl_s8,  
3248 - gen_op_neon_rshl_s16,  
3249 - gen_op_neon_rshl_s32,  
3250 - gen_op_neon_rshl_s64  
3251 - }  
3252 - }, { /* 3 */ /* VRSRA */  
3253 - {  
3254 - gen_op_neon_rshl_u8,  
3255 - gen_op_neon_rshl_u16,  
3256 - gen_op_neon_rshl_u32,  
3257 - gen_op_neon_rshl_u64  
3258 - }, {  
3259 - gen_op_neon_rshl_s8,  
3260 - gen_op_neon_rshl_s16,  
3261 - gen_op_neon_rshl_s32,  
3262 - gen_op_neon_rshl_s64  
3263 - }  
3264 - }, { /* 4 */  
3265 - {  
3266 - NULL, NULL, NULL, NULL  
3267 - }, { /* VSRI */  
3268 - gen_op_neon_shl_u8,  
3269 - gen_op_neon_shl_u16,  
3270 - gen_op_neon_shl_u32,  
3271 - gen_op_neon_shl_u64,  
3272 - }  
3273 - }, { /* 5 */  
3274 - { /* VSHL */  
3275 - gen_op_neon_shl_u8,  
3276 - gen_op_neon_shl_u16,  
3277 - gen_op_neon_shl_u32,  
3278 - gen_op_neon_shl_u64,  
3279 - }, { /* VSLI */  
3280 - gen_op_neon_shl_u8,  
3281 - gen_op_neon_shl_u16,  
3282 - gen_op_neon_shl_u32,  
3283 - gen_op_neon_shl_u64,  
3284 - }  
3285 - }, { /* 6 */ /* VQSHL */  
3286 - {  
3287 - gen_op_neon_qshl_u8,  
3288 - gen_op_neon_qshl_u16,  
3289 - gen_op_neon_qshl_u32,  
3290 - gen_op_neon_qshl_u64  
3291 - }, {  
3292 - gen_op_neon_qshl_s8,  
3293 - gen_op_neon_qshl_s16,  
3294 - gen_op_neon_qshl_s32,  
3295 - gen_op_neon_qshl_s64  
3296 - }  
3297 - }, { /* 7 */ /* VQSHLU */  
3298 - {  
3299 - gen_op_neon_qshl_u8,  
3300 - gen_op_neon_qshl_u16,  
3301 - gen_op_neon_qshl_u32,  
3302 - gen_op_neon_qshl_u64  
3303 - }, {  
3304 - gen_op_neon_qshl_u8,  
3305 - gen_op_neon_qshl_u16,  
3306 - gen_op_neon_qshl_u32,  
3307 - gen_op_neon_qshl_u64  
3308 - }  
3309 - }  
3310 -}; 3266 +/* These macros help make the code more readable when migrating from the
  3267 + old dyngen helpers. They should probably be removed when
  3268 + T0/T1 are removed. */
  3269 +#define CPU_T001 cpu_T[0], cpu_T[0], cpu_T[1]
  3270 +#define CPU_T0E01 cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]
3311 3271
3312 -/* [R][U][size - 1] */  
3313 -static GenOpFunc *gen_neon_shift_im_narrow[2][2][3] = {  
3314 - {  
3315 - {  
3316 - gen_op_neon_shl_u16,  
3317 - gen_op_neon_shl_u32,  
3318 - gen_op_neon_shl_u64  
3319 - }, {  
3320 - gen_op_neon_shl_s16,  
3321 - gen_op_neon_shl_s32,  
3322 - gen_op_neon_shl_s64  
3323 - }  
3324 - }, {  
3325 - {  
3326 - gen_op_neon_rshl_u16,  
3327 - gen_op_neon_rshl_u32,  
3328 - gen_op_neon_rshl_u64  
3329 - }, {  
3330 - gen_op_neon_rshl_s16,  
3331 - gen_op_neon_rshl_s32,  
3332 - gen_op_neon_rshl_s64  
3333 - }  
3334 - }  
3335 -};  
3336 -  
3337 -static inline void  
3338 -gen_op_neon_narrow_u32 ()  
3339 -{  
3340 - /* No-op. */  
3341 -}  
3342 -  
3343 -static GenOpFunc *gen_neon_narrow[3] = {  
3344 - gen_op_neon_narrow_u8,  
3345 - gen_op_neon_narrow_u16,  
3346 - gen_op_neon_narrow_u32  
3347 -};  
3348 -  
3349 -static GenOpFunc *gen_neon_narrow_satu[3] = {  
3350 - gen_op_neon_narrow_sat_u8,  
3351 - gen_op_neon_narrow_sat_u16,  
3352 - gen_op_neon_narrow_sat_u32  
3353 -};  
3354 -  
3355 -static GenOpFunc *gen_neon_narrow_sats[3] = {  
3356 - gen_op_neon_narrow_sat_s8,  
3357 - gen_op_neon_narrow_sat_s16,  
3358 - gen_op_neon_narrow_sat_s32  
3359 -}; 3272 +#define CPU_V001 cpu_V0, cpu_V0, cpu_V1
3360 3273
3361 static inline int gen_neon_add(int size) 3274 static inline int gen_neon_add(int size)
3362 { 3275 {
3363 switch (size) { 3276 switch (size) {
3364 - case 0: gen_op_neon_add_u8(); break;  
3365 - case 1: gen_op_neon_add_u16(); break; 3277 + case 0: gen_helper_neon_add_u8(CPU_T001); break;
  3278 + case 1: gen_helper_neon_add_u16(CPU_T001); break;
3366 case 2: gen_op_addl_T0_T1(); break; 3279 case 2: gen_op_addl_T0_T1(); break;
3367 default: return 1; 3280 default: return 1;
3368 } 3281 }
3369 return 0; 3282 return 0;
3370 } 3283 }
3371 3284
3372 -/* 32-bit pairwise ops end up the same as the elementsise versions. */  
3373 -#define gen_op_neon_pmax_s32 gen_op_neon_max_s32  
3374 -#define gen_op_neon_pmax_u32 gen_op_neon_max_u32  
3375 -#define gen_op_neon_pmin_s32 gen_op_neon_min_s32  
3376 -#define gen_op_neon_pmin_u32 gen_op_neon_min_u32 3285 +static inline void gen_neon_rsb(int size)
  3286 +{
  3287 + switch (size) {
  3288 + case 0: gen_helper_neon_sub_u8(cpu_T[0], cpu_T[1], cpu_T[0]); break;
  3289 + case 1: gen_helper_neon_sub_u16(cpu_T[0], cpu_T[1], cpu_T[0]); break;
  3290 + case 2: gen_op_rsbl_T0_T1(); break;
  3291 + default: return;
  3292 + }
  3293 +}
  3294 +
  3295 +/* 32-bit pairwise ops end up the same as the elementwise versions. */
  3296 +#define gen_helper_neon_pmax_s32 gen_helper_neon_max_s32
  3297 +#define gen_helper_neon_pmax_u32 gen_helper_neon_max_u32
  3298 +#define gen_helper_neon_pmin_s32 gen_helper_neon_min_s32
  3299 +#define gen_helper_neon_pmin_u32 gen_helper_neon_min_u32
  3300 +
  3301 +/* FIXME: This is wrong. They set the wrong overflow bit. */
  3302 +#define gen_helper_neon_qadd_s32(a, e, b, c) gen_helper_add_saturate(a, b, c)
  3303 +#define gen_helper_neon_qadd_u32(a, e, b, c) gen_helper_add_usaturate(a, b, c)
  3304 +#define gen_helper_neon_qsub_s32(a, e, b, c) gen_helper_sub_saturate(a, b, c)
  3305 +#define gen_helper_neon_qsub_u32(a, e, b, c) gen_helper_sub_usaturate(a, b, c)
  3306 +
  3307 +#define GEN_NEON_INTEGER_OP_ENV(name) do { \
  3308 + switch ((size << 1) | u) { \
  3309 + case 0: \
  3310 + gen_helper_neon_##name##_s8(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]); \
  3311 + break; \
  3312 + case 1: \
  3313 + gen_helper_neon_##name##_u8(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]); \
  3314 + break; \
  3315 + case 2: \
  3316 + gen_helper_neon_##name##_s16(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]); \
  3317 + break; \
  3318 + case 3: \
  3319 + gen_helper_neon_##name##_u16(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]); \
  3320 + break; \
  3321 + case 4: \
  3322 + gen_helper_neon_##name##_s32(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]); \
  3323 + break; \
  3324 + case 5: \
  3325 + gen_helper_neon_##name##_u32(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]); \
  3326 + break; \
  3327 + default: return 1; \
  3328 + }} while (0)
3377 3329
3378 #define GEN_NEON_INTEGER_OP(name) do { \ 3330 #define GEN_NEON_INTEGER_OP(name) do { \
3379 switch ((size << 1) | u) { \ 3331 switch ((size << 1) | u) { \
3380 - case 0: gen_op_neon_##name##_s8(); break; \  
3381 - case 1: gen_op_neon_##name##_u8(); break; \  
3382 - case 2: gen_op_neon_##name##_s16(); break; \  
3383 - case 3: gen_op_neon_##name##_u16(); break; \  
3384 - case 4: gen_op_neon_##name##_s32(); break; \  
3385 - case 5: gen_op_neon_##name##_u32(); break; \ 3332 + case 0: \
  3333 + gen_helper_neon_##name##_s8(cpu_T[0], cpu_T[0], cpu_T[1]); \
  3334 + break; \
  3335 + case 1: \
  3336 + gen_helper_neon_##name##_u8(cpu_T[0], cpu_T[0], cpu_T[1]); \
  3337 + break; \
  3338 + case 2: \
  3339 + gen_helper_neon_##name##_s16(cpu_T[0], cpu_T[0], cpu_T[1]); \
  3340 + break; \
  3341 + case 3: \
  3342 + gen_helper_neon_##name##_u16(cpu_T[0], cpu_T[0], cpu_T[1]); \
  3343 + break; \
  3344 + case 4: \
  3345 + gen_helper_neon_##name##_s32(cpu_T[0], cpu_T[0], cpu_T[1]); \
  3346 + break; \
  3347 + case 5: \
  3348 + gen_helper_neon_##name##_u32(cpu_T[0], cpu_T[0], cpu_T[1]); \
  3349 + break; \
3386 default: return 1; \ 3350 default: return 1; \
3387 }} while (0) 3351 }} while (0)
3388 3352
@@ -3392,7 +3356,7 @@ gen_neon_movl_scratch_T0(int scratch) @@ -3392,7 +3356,7 @@ gen_neon_movl_scratch_T0(int scratch)
3392 uint32_t offset; 3356 uint32_t offset;
3393 3357
3394 offset = offsetof(CPUARMState, vfp.scratch[scratch]); 3358 offset = offsetof(CPUARMState, vfp.scratch[scratch]);
3395 - gen_op_neon_setreg_T0(offset); 3359 + tcg_gen_st_i32(cpu_T[0], cpu_env, offset);
3396 } 3360 }
3397 3361
3398 static inline void 3362 static inline void
@@ -3401,7 +3365,7 @@ gen_neon_movl_scratch_T1(int scratch) @@ -3401,7 +3365,7 @@ gen_neon_movl_scratch_T1(int scratch)
3401 uint32_t offset; 3365 uint32_t offset;
3402 3366
3403 offset = offsetof(CPUARMState, vfp.scratch[scratch]); 3367 offset = offsetof(CPUARMState, vfp.scratch[scratch]);
3404 - gen_op_neon_setreg_T1(offset); 3368 + tcg_gen_st_i32(cpu_T[1], cpu_env, offset);
3405 } 3369 }
3406 3370
3407 static inline void 3371 static inline void
@@ -3410,7 +3374,7 @@ gen_neon_movl_T0_scratch(int scratch) @@ -3410,7 +3374,7 @@ gen_neon_movl_T0_scratch(int scratch)
3410 uint32_t offset; 3374 uint32_t offset;
3411 3375
3412 offset = offsetof(CPUARMState, vfp.scratch[scratch]); 3376 offset = offsetof(CPUARMState, vfp.scratch[scratch]);
3413 - gen_op_neon_getreg_T0(offset); 3377 + tcg_gen_ld_i32(cpu_T[0], cpu_env, offset);
3414 } 3378 }
3415 3379
3416 static inline void 3380 static inline void
@@ -3419,12 +3383,7 @@ gen_neon_movl_T1_scratch(int scratch) @@ -3419,12 +3383,7 @@ gen_neon_movl_T1_scratch(int scratch)
3419 uint32_t offset; 3383 uint32_t offset;
3420 3384
3421 offset = offsetof(CPUARMState, vfp.scratch[scratch]); 3385 offset = offsetof(CPUARMState, vfp.scratch[scratch]);
3422 - gen_op_neon_getreg_T1(offset);  
3423 -}  
3424 -  
3425 -static inline void gen_op_neon_widen_u32(void)  
3426 -{  
3427 - gen_op_movl_T1_im(0); 3386 + tcg_gen_ld_i32(cpu_T[1], cpu_env, offset);
3428 } 3387 }
3429 3388
3430 static inline void gen_neon_get_scalar(int size, int reg) 3389 static inline void gen_neon_get_scalar(int size, int reg)
@@ -3434,9 +3393,9 @@ static inline void gen_neon_get_scalar(int size, int reg) @@ -3434,9 +3393,9 @@ static inline void gen_neon_get_scalar(int size, int reg)
3434 } else { 3393 } else {
3435 NEON_GET_REG(T0, reg >> 2, (reg >> 1) & 1); 3394 NEON_GET_REG(T0, reg >> 2, (reg >> 1) & 1);
3436 if (reg & 1) 3395 if (reg & 1)
3437 - gen_op_neon_dup_low16(); 3396 + gen_neon_dup_low16(cpu_T[0]);
3438 else 3397 else
3439 - gen_op_neon_dup_high16(); 3398 + gen_neon_dup_high16(cpu_T[0]);
3440 } 3399 }
3441 } 3400 }
3442 3401
@@ -3448,8 +3407,8 @@ static void gen_neon_unzip(int reg, int q, int tmp, int size) @@ -3448,8 +3407,8 @@ static void gen_neon_unzip(int reg, int q, int tmp, int size)
3448 NEON_GET_REG(T0, reg, n); 3407 NEON_GET_REG(T0, reg, n);
3449 NEON_GET_REG(T0, reg, n + n); 3408 NEON_GET_REG(T0, reg, n + n);
3450 switch (size) { 3409 switch (size) {
3451 - case 0: gen_op_neon_unzip_u8(); break;  
3452 - case 1: gen_op_neon_zip_u16(); break; /* zip and unzip are the same. */ 3410 + case 0: gen_helper_neon_unzip_u8(); break;
  3411 + case 1: gen_helper_neon_zip_u16(); break; /* zip and unzip are the same. */
3453 case 2: /* no-op */; break; 3412 case 2: /* no-op */; break;
3454 default: abort(); 3413 default: abort();
3455 } 3414 }
@@ -3522,13 +3481,9 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -3522,13 +3481,9 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
3522 if (size == 2) { 3481 if (size == 2) {
3523 if (load) { 3482 if (load) {
3524 tmp = gen_ld32(cpu_T[1], IS_USER(s)); 3483 tmp = gen_ld32(cpu_T[1], IS_USER(s));
3525 - tcg_gen_mov_i32(cpu_T[0], tmp);  
3526 - dead_tmp(tmp);  
3527 - NEON_SET_REG(T0, rd, pass); 3484 + neon_store_reg(rd, pass, tmp);
3528 } else { 3485 } else {
3529 - NEON_GET_REG(T0, rd, pass);  
3530 - tmp = new_tmp();  
3531 - tcg_gen_mov_i32(tmp, cpu_T[0]); 3486 + tmp = neon_load_reg(rd, pass);
3532 gen_st32(tmp, cpu_T[1], IS_USER(s)); 3487 gen_st32(tmp, cpu_T[1], IS_USER(s));
3533 } 3488 }
3534 gen_op_addl_T1_im(stride); 3489 gen_op_addl_T1_im(stride);
@@ -3596,27 +3551,23 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -3596,27 +3551,23 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn)
3596 switch (size) { 3551 switch (size) {
3597 case 0: 3552 case 0:
3598 tmp = gen_ld8u(cpu_T[1], IS_USER(s)); 3553 tmp = gen_ld8u(cpu_T[1], IS_USER(s));
3599 - tcg_gen_mov_i32(cpu_T[0], tmp);  
3600 - dead_tmp(tmp);  
3601 - gen_op_neon_dup_u8(0); 3554 + gen_neon_dup_u8(tmp, 0);
3602 break; 3555 break;
3603 case 1: 3556 case 1:
3604 tmp = gen_ld16u(cpu_T[1], IS_USER(s)); 3557 tmp = gen_ld16u(cpu_T[1], IS_USER(s));
3605 - tcg_gen_mov_i32(cpu_T[0], tmp);  
3606 - dead_tmp(tmp);  
3607 - gen_op_neon_dup_low16(); 3558 + gen_neon_dup_low16(tmp);
3608 break; 3559 break;
3609 case 2: 3560 case 2:
3610 tmp = gen_ld32(cpu_T[0], IS_USER(s)); 3561 tmp = gen_ld32(cpu_T[0], IS_USER(s));
3611 - tcg_gen_mov_i32(cpu_T[0], tmp);  
3612 - dead_tmp(tmp);  
3613 break; 3562 break;
3614 case 3: 3563 case 3:
3615 return 1; 3564 return 1;
3616 } 3565 }
3617 gen_op_addl_T1_im(1 << size); 3566 gen_op_addl_T1_im(1 << size);
3618 - NEON_SET_REG(T0, rd, 0);  
3619 - NEON_SET_REG(T0, rd, 1); 3567 + tmp2 = new_tmp();
  3568 + tcg_gen_mov_i32(tmp2, tmp);
  3569 + neon_store_reg(rd, 0, tmp2);
  3570 + neon_store_reg(rd, 0, tmp);
3620 rd += stride; 3571 rd += stride;
3621 } 3572 }
3622 stride = (1 << size) * nregs; 3573 stride = (1 << size) * nregs;
@@ -3707,12 +3658,158 @@ static void gen_neon_bsl(TCGv dest, TCGv t, TCGv f, TCGv c) @@ -3707,12 +3658,158 @@ static void gen_neon_bsl(TCGv dest, TCGv t, TCGv f, TCGv c)
3707 tcg_gen_or_i32(dest, t, f); 3658 tcg_gen_or_i32(dest, t, f);
3708 } 3659 }
3709 3660
  3661 +static inline void gen_neon_narrow(int size, TCGv dest, TCGv src)
  3662 +{
  3663 + switch (size) {
  3664 + case 0: gen_helper_neon_narrow_u8(dest, src); break;
  3665 + case 1: gen_helper_neon_narrow_u16(dest, src); break;
  3666 + case 2: tcg_gen_trunc_i64_i32(dest, src); break;
  3667 + default: abort();
  3668 + }
  3669 +}
  3670 +
  3671 +static inline void gen_neon_narrow_sats(int size, TCGv dest, TCGv src)
  3672 +{
  3673 + switch (size) {
  3674 + case 0: gen_helper_neon_narrow_sat_s8(dest, cpu_env, src); break;
  3675 + case 1: gen_helper_neon_narrow_sat_s16(dest, cpu_env, src); break;
  3676 + case 2: gen_helper_neon_narrow_sat_s32(dest, cpu_env, src); break;
  3677 + default: abort();
  3678 + }
  3679 +}
  3680 +
  3681 +static inline void gen_neon_narrow_satu(int size, TCGv dest, TCGv src)
  3682 +{
  3683 + switch (size) {
  3684 + case 0: gen_helper_neon_narrow_sat_u8(dest, cpu_env, src); break;
  3685 + case 1: gen_helper_neon_narrow_sat_u16(dest, cpu_env, src); break;
  3686 + case 2: gen_helper_neon_narrow_sat_u32(dest, cpu_env, src); break;
  3687 + default: abort();
  3688 + }
  3689 +}
  3690 +
  3691 +static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift,
  3692 + int q, int u)
  3693 +{
  3694 + if (q) {
  3695 + if (u) {
  3696 + switch (size) {
  3697 + case 1: gen_helper_neon_rshl_u16(var, var, shift); break;
  3698 + case 2: gen_helper_neon_rshl_u32(var, var, shift); break;
  3699 + default: abort();
  3700 + }
  3701 + } else {
  3702 + switch (size) {
  3703 + case 1: gen_helper_neon_rshl_s16(var, var, shift); break;
  3704 + case 2: gen_helper_neon_rshl_s32(var, var, shift); break;
  3705 + default: abort();
  3706 + }
  3707 + }
  3708 + } else {
  3709 + if (u) {
  3710 + switch (size) {
  3711 + case 1: gen_helper_neon_rshl_u16(var, var, shift); break;
  3712 + case 2: gen_helper_neon_rshl_u32(var, var, shift); break;
  3713 + default: abort();
  3714 + }
  3715 + } else {
  3716 + switch (size) {
  3717 + case 1: gen_helper_neon_shl_s16(var, var, shift); break;
  3718 + case 2: gen_helper_neon_shl_s32(var, var, shift); break;
  3719 + default: abort();
  3720 + }
  3721 + }
  3722 + }
  3723 +}
  3724 +
  3725 +static inline void gen_neon_widen(TCGv dest, TCGv src, int size, int u)
  3726 +{
  3727 + if (u) {
  3728 + switch (size) {
  3729 + case 0: gen_helper_neon_widen_u8(dest, src); break;
  3730 + case 1: gen_helper_neon_widen_u16(dest, src); break;
  3731 + case 2: tcg_gen_extu_i32_i64(dest, src); break;
  3732 + default: abort();
  3733 + }
  3734 + } else {
  3735 + switch (size) {
  3736 + case 0: gen_helper_neon_widen_s8(dest, src); break;
  3737 + case 1: gen_helper_neon_widen_s16(dest, src); break;
  3738 + case 2: tcg_gen_ext_i32_i64(dest, src); break;
  3739 + default: abort();
  3740 + }
  3741 + }
  3742 + dead_tmp(src);
  3743 +}
  3744 +
  3745 +static inline void gen_neon_addl(int size)
  3746 +{
  3747 + switch (size) {
  3748 + case 0: gen_helper_neon_addl_u16(CPU_V001); break;
  3749 + case 1: gen_helper_neon_addl_u32(CPU_V001); break;
  3750 + case 2: tcg_gen_add_i64(CPU_V001); break;
  3751 + default: abort();
  3752 + }
  3753 +}
  3754 +
  3755 +static inline void gen_neon_subl(int size)
  3756 +{
  3757 + switch (size) {
  3758 + case 0: gen_helper_neon_subl_u16(CPU_V001); break;
  3759 + case 1: gen_helper_neon_subl_u32(CPU_V001); break;
  3760 + case 2: tcg_gen_sub_i64(CPU_V001); break;
  3761 + default: abort();
  3762 + }
  3763 +}
  3764 +
  3765 +static inline void gen_neon_negl(TCGv var, int size)
  3766 +{
  3767 + switch (size) {
  3768 + case 0: gen_helper_neon_negl_u16(var, var); break;
  3769 + case 1: gen_helper_neon_negl_u32(var, var); break;
  3770 + case 2: gen_helper_neon_negl_u64(var, var); break;
  3771 + default: abort();
  3772 + }
  3773 +}
  3774 +
  3775 +static inline void gen_neon_addl_saturate(TCGv op0, TCGv op1, int size)
  3776 +{
  3777 + switch (size) {
  3778 + case 1: gen_helper_neon_addl_saturate_s32(op0, cpu_env, op0, op1); break;
  3779 + case 2: gen_helper_neon_addl_saturate_s64(op0, cpu_env, op0, op1); break;
  3780 + default: abort();
  3781 + }
  3782 +}
  3783 +
  3784 +static inline void gen_neon_mull(TCGv dest, TCGv a, TCGv b, int size, int u)
  3785 +{
  3786 + TCGv tmp;
  3787 +
  3788 + switch ((size << 1) | u) {
  3789 + case 0: gen_helper_neon_mull_s8(dest, a, b); break;
  3790 + case 1: gen_helper_neon_mull_u8(dest, a, b); break;
  3791 + case 2: gen_helper_neon_mull_s16(dest, a, b); break;
  3792 + case 3: gen_helper_neon_mull_u16(dest, a, b); break;
  3793 + case 4:
  3794 + tmp = gen_muls_i64_i32(a, b);
  3795 + tcg_gen_mov_i64(dest, tmp);
  3796 + break;
  3797 + case 5:
  3798 + tmp = gen_mulu_i64_i32(a, b);
  3799 + tcg_gen_mov_i64(dest, tmp);
  3800 + break;
  3801 + default: abort();
  3802 + }
  3803 + if (size < 2) {
  3804 + dead_tmp(b);
  3805 + dead_tmp(a);
  3806 + }
  3807 +}
  3808 +
3710 /* Translate a NEON data processing instruction. Return nonzero if the 3809 /* Translate a NEON data processing instruction. Return nonzero if the
3711 instruction is invalid. 3810 instruction is invalid.
3712 - In general we process vectors in 32-bit chunks. This means we can reuse  
3713 - some of the scalar ops, and hopefully the code generated for 32-bit  
3714 - hosts won't be too awful. The downside is that the few 64-bit operations  
3715 - (mainly shifts) get complicated. */ 3811 + We process data in a mixture of 32-bit and 64-bit chunks.
  3812 + Mostly we use 32-bit chunks so we can use normal scalar instructions. */
3716 3813
3717 static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) 3814 static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
3718 { 3815 {
@@ -3742,41 +3839,70 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -3742,41 +3839,70 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
3742 if ((insn & (1 << 23)) == 0) { 3839 if ((insn & (1 << 23)) == 0) {
3743 /* Three register same length. */ 3840 /* Three register same length. */
3744 op = ((insn >> 7) & 0x1e) | ((insn >> 4) & 1); 3841 op = ((insn >> 7) & 0x1e) | ((insn >> 4) & 1);
3745 - if (size == 3 && (op == 1 || op == 5 || op == 16)) { 3842 + if (size == 3 && (op == 1 || op == 5 || op == 8 || op == 9
  3843 + || op == 10 || op == 11 || op == 16)) {
  3844 + /* 64-bit element instructions. */
3746 for (pass = 0; pass < (q ? 2 : 1); pass++) { 3845 for (pass = 0; pass < (q ? 2 : 1); pass++) {
3747 - NEON_GET_REG(T0, rm, pass * 2);  
3748 - NEON_GET_REG(T1, rm, pass * 2 + 1);  
3749 - gen_neon_movl_scratch_T0(0);  
3750 - gen_neon_movl_scratch_T1(1);  
3751 - NEON_GET_REG(T0, rn, pass * 2);  
3752 - NEON_GET_REG(T1, rn, pass * 2 + 1); 3846 + neon_load_reg64(cpu_V0, rn + pass);
  3847 + neon_load_reg64(cpu_V1, rm + pass);
3753 switch (op) { 3848 switch (op) {
3754 case 1: /* VQADD */ 3849 case 1: /* VQADD */
3755 if (u) { 3850 if (u) {
3756 - gen_op_neon_addl_saturate_u64(); 3851 + gen_helper_neon_add_saturate_u64(CPU_V001);
3757 } else { 3852 } else {
3758 - gen_op_neon_addl_saturate_s64(); 3853 + gen_helper_neon_add_saturate_s64(CPU_V001);
3759 } 3854 }
3760 break; 3855 break;
3761 case 5: /* VQSUB */ 3856 case 5: /* VQSUB */
3762 if (u) { 3857 if (u) {
3763 - gen_op_neon_subl_saturate_u64(); 3858 + gen_helper_neon_sub_saturate_u64(CPU_V001);
3764 } else { 3859 } else {
3765 - gen_op_neon_subl_saturate_s64(); 3860 + gen_helper_neon_sub_saturate_s64(CPU_V001);
  3861 + }
  3862 + break;
  3863 + case 8: /* VSHL */
  3864 + if (u) {
  3865 + gen_helper_neon_shl_u64(cpu_V0, cpu_V1, cpu_V0);
  3866 + } else {
  3867 + gen_helper_neon_shl_s64(cpu_V0, cpu_V1, cpu_V0);
  3868 + }
  3869 + break;
  3870 + case 9: /* VQSHL */
  3871 + if (u) {
  3872 + gen_helper_neon_qshl_u64(cpu_V0, cpu_env,
  3873 + cpu_V0, cpu_V0);
  3874 + } else {
  3875 + gen_helper_neon_qshl_s64(cpu_V1, cpu_env,
  3876 + cpu_V1, cpu_V0);
  3877 + }
  3878 + break;
  3879 + case 10: /* VRSHL */
  3880 + if (u) {
  3881 + gen_helper_neon_rshl_u64(cpu_V0, cpu_V1, cpu_V0);
  3882 + } else {
  3883 + gen_helper_neon_rshl_s64(cpu_V0, cpu_V1, cpu_V0);
  3884 + }
  3885 + break;
  3886 + case 11: /* VQRSHL */
  3887 + if (u) {
  3888 + gen_helper_neon_qrshl_u64(cpu_V0, cpu_env,
  3889 + cpu_V1, cpu_V0);
  3890 + } else {
  3891 + gen_helper_neon_qrshl_s64(cpu_V0, cpu_env,
  3892 + cpu_V1, cpu_V0);
3766 } 3893 }
3767 break; 3894 break;
3768 case 16: 3895 case 16:
3769 if (u) { 3896 if (u) {
3770 - gen_op_neon_subl_u64(); 3897 + tcg_gen_sub_i64(CPU_V001);
3771 } else { 3898 } else {
3772 - gen_op_neon_addl_u64(); 3899 + tcg_gen_add_i64(CPU_V001);
3773 } 3900 }
3774 break; 3901 break;
3775 default: 3902 default:
3776 abort(); 3903 abort();
3777 } 3904 }
3778 - NEON_SET_REG(T0, rd, pass * 2);  
3779 - NEON_SET_REG(T1, rd, pass * 2 + 1); 3905 + neon_store_reg64(cpu_V0, rd + pass);
3780 } 3906 }
3781 return 0; 3907 return 0;
3782 } 3908 }
@@ -3784,13 +3910,13 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -3784,13 +3910,13 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
3784 case 8: /* VSHL */ 3910 case 8: /* VSHL */
3785 case 9: /* VQSHL */ 3911 case 9: /* VQSHL */
3786 case 10: /* VRSHL */ 3912 case 10: /* VRSHL */
3787 - case 11: /* VQSHL */  
3788 - /* Shift operations have Rn and Rm reversed. */ 3913 + case 11: /* VQRSHL */
3789 { 3914 {
3790 - int tmp;  
3791 - tmp = rn; 3915 + int rtmp;
  3916 + /* Shift instruction operands are reversed. */
  3917 + rtmp = rn;
3792 rn = rm; 3918 rn = rm;
3793 - rm = tmp; 3919 + rm = rtmp;
3794 pairwise = 0; 3920 pairwise = 0;
3795 } 3921 }
3796 break; 3922 break;
@@ -3834,19 +3960,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -3834,19 +3960,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
3834 GEN_NEON_INTEGER_OP(hadd); 3960 GEN_NEON_INTEGER_OP(hadd);
3835 break; 3961 break;
3836 case 1: /* VQADD */ 3962 case 1: /* VQADD */
3837 - switch (size << 1| u) {  
3838 - case 0: gen_op_neon_qadd_s8(); break;  
3839 - case 1: gen_op_neon_qadd_u8(); break;  
3840 - case 2: gen_op_neon_qadd_s16(); break;  
3841 - case 3: gen_op_neon_qadd_u16(); break;  
3842 - case 4:  
3843 - gen_helper_add_saturate(cpu_T[0], cpu_T[0], cpu_T[1]);  
3844 - break;  
3845 - case 5:  
3846 - gen_helper_add_usaturate(cpu_T[0], cpu_T[0], cpu_T[1]);  
3847 - break;  
3848 - default: abort();  
3849 - } 3963 + GEN_NEON_INTEGER_OP_ENV(qadd);
3850 break; 3964 break;
3851 case 2: /* VRHADD */ 3965 case 2: /* VRHADD */
3852 GEN_NEON_INTEGER_OP(rhadd); 3966 GEN_NEON_INTEGER_OP(rhadd);
@@ -3890,19 +4004,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -3890,19 +4004,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
3890 GEN_NEON_INTEGER_OP(hsub); 4004 GEN_NEON_INTEGER_OP(hsub);
3891 break; 4005 break;
3892 case 5: /* VQSUB */ 4006 case 5: /* VQSUB */
3893 - switch ((size << 1) | u) {  
3894 - case 0: gen_op_neon_qsub_s8(); break;  
3895 - case 1: gen_op_neon_qsub_u8(); break;  
3896 - case 2: gen_op_neon_qsub_s16(); break;  
3897 - case 3: gen_op_neon_qsub_u16(); break;  
3898 - case 4:  
3899 - gen_helper_sub_saturate(cpu_T[0], cpu_T[0], cpu_T[1]);  
3900 - break;  
3901 - case 5:  
3902 - gen_helper_sub_usaturate(cpu_T[0], cpu_T[0], cpu_T[1]);  
3903 - break;  
3904 - default: abort();  
3905 - } 4007 + GEN_NEON_INTEGER_OP_ENV(qsub);
3906 break; 4008 break;
3907 case 6: /* VCGT */ 4009 case 6: /* VCGT */
3908 GEN_NEON_INTEGER_OP(cgt); 4010 GEN_NEON_INTEGER_OP(cgt);
@@ -3911,76 +4013,16 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -3911,76 +4013,16 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
3911 GEN_NEON_INTEGER_OP(cge); 4013 GEN_NEON_INTEGER_OP(cge);
3912 break; 4014 break;
3913 case 8: /* VSHL */ 4015 case 8: /* VSHL */
3914 - switch ((size << 1) | u) {  
3915 - case 0: gen_op_neon_shl_s8(); break;  
3916 - case 1: gen_op_neon_shl_u8(); break;  
3917 - case 2: gen_op_neon_shl_s16(); break;  
3918 - case 3: gen_op_neon_shl_u16(); break;  
3919 - case 4: gen_op_neon_shl_s32(); break;  
3920 - case 5: gen_op_neon_shl_u32(); break;  
3921 -#if 0  
3922 - /* ??? Implementing these is tricky because the vector ops work  
3923 - on 32-bit pieces. */  
3924 - case 6: gen_op_neon_shl_s64(); break;  
3925 - case 7: gen_op_neon_shl_u64(); break;  
3926 -#else  
3927 - case 6: case 7: cpu_abort(env, "VSHL.64 not implemented");  
3928 -#endif  
3929 - } 4016 + GEN_NEON_INTEGER_OP(shl);
3930 break; 4017 break;
3931 case 9: /* VQSHL */ 4018 case 9: /* VQSHL */
3932 - switch ((size << 1) | u) {  
3933 - case 0: gen_op_neon_qshl_s8(); break;  
3934 - case 1: gen_op_neon_qshl_u8(); break;  
3935 - case 2: gen_op_neon_qshl_s16(); break;  
3936 - case 3: gen_op_neon_qshl_u16(); break;  
3937 - case 4: gen_op_neon_qshl_s32(); break;  
3938 - case 5: gen_op_neon_qshl_u32(); break;  
3939 -#if 0  
3940 - /* ??? Implementing these is tricky because the vector ops work  
3941 - on 32-bit pieces. */  
3942 - case 6: gen_op_neon_qshl_s64(); break;  
3943 - case 7: gen_op_neon_qshl_u64(); break;  
3944 -#else  
3945 - case 6: case 7: cpu_abort(env, "VQSHL.64 not implemented");  
3946 -#endif  
3947 - } 4019 + GEN_NEON_INTEGER_OP_ENV(qshl);
3948 break; 4020 break;
3949 case 10: /* VRSHL */ 4021 case 10: /* VRSHL */
3950 - switch ((size << 1) | u) {  
3951 - case 0: gen_op_neon_rshl_s8(); break;  
3952 - case 1: gen_op_neon_rshl_u8(); break;  
3953 - case 2: gen_op_neon_rshl_s16(); break;  
3954 - case 3: gen_op_neon_rshl_u16(); break;  
3955 - case 4: gen_op_neon_rshl_s32(); break;  
3956 - case 5: gen_op_neon_rshl_u32(); break;  
3957 -#if 0  
3958 - /* ??? Implementing these is tricky because the vector ops work  
3959 - on 32-bit pieces. */  
3960 - case 6: gen_op_neon_rshl_s64(); break;  
3961 - case 7: gen_op_neon_rshl_u64(); break;  
3962 -#else  
3963 - case 6: case 7: cpu_abort(env, "VRSHL.64 not implemented");  
3964 -#endif  
3965 - } 4022 + GEN_NEON_INTEGER_OP(rshl);
3966 break; 4023 break;
3967 case 11: /* VQRSHL */ 4024 case 11: /* VQRSHL */
3968 - switch ((size << 1) | u) {  
3969 - case 0: gen_op_neon_qrshl_s8(); break;  
3970 - case 1: gen_op_neon_qrshl_u8(); break;  
3971 - case 2: gen_op_neon_qrshl_s16(); break;  
3972 - case 3: gen_op_neon_qrshl_u16(); break;  
3973 - case 4: gen_op_neon_qrshl_s32(); break;  
3974 - case 5: gen_op_neon_qrshl_u32(); break;  
3975 -#if 0  
3976 - /* ??? Implementing these is tricky because the vector ops work  
3977 - on 32-bit pieces. */  
3978 - case 6: gen_op_neon_qrshl_s64(); break;  
3979 - case 7: gen_op_neon_qrshl_u64(); break;  
3980 -#else  
3981 - case 6: case 7: cpu_abort(env, "VQRSHL.64 not implemented");  
3982 -#endif  
3983 - } 4025 + GEN_NEON_INTEGER_OP_ENV(qrshl);
3984 break; 4026 break;
3985 case 12: /* VMAX */ 4027 case 12: /* VMAX */
3986 GEN_NEON_INTEGER_OP(max); 4028 GEN_NEON_INTEGER_OP(max);
@@ -4002,8 +4044,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4002,8 +4044,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4002 return 1; 4044 return 1;
4003 } else { /* VSUB */ 4045 } else { /* VSUB */
4004 switch (size) { 4046 switch (size) {
4005 - case 0: gen_op_neon_sub_u8(); break;  
4006 - case 1: gen_op_neon_sub_u16(); break; 4047 + case 0: gen_helper_neon_sub_u8(CPU_T001); break;
  4048 + case 1: gen_helper_neon_sub_u16(CPU_T001); break;
4007 case 2: gen_op_subl_T0_T1(); break; 4049 case 2: gen_op_subl_T0_T1(); break;
4008 default: return 1; 4050 default: return 1;
4009 } 4051 }
@@ -4012,46 +4054,41 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4012,46 +4054,41 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4012 case 17: 4054 case 17:
4013 if (!u) { /* VTST */ 4055 if (!u) { /* VTST */
4014 switch (size) { 4056 switch (size) {
4015 - case 0: gen_op_neon_tst_u8(); break;  
4016 - case 1: gen_op_neon_tst_u16(); break;  
4017 - case 2: gen_op_neon_tst_u32(); break; 4057 + case 0: gen_helper_neon_tst_u8(CPU_T001); break;
  4058 + case 1: gen_helper_neon_tst_u16(CPU_T001); break;
  4059 + case 2: gen_helper_neon_tst_u32(CPU_T001); break;
4018 default: return 1; 4060 default: return 1;
4019 } 4061 }
4020 } else { /* VCEQ */ 4062 } else { /* VCEQ */
4021 switch (size) { 4063 switch (size) {
4022 - case 0: gen_op_neon_ceq_u8(); break;  
4023 - case 1: gen_op_neon_ceq_u16(); break;  
4024 - case 2: gen_op_neon_ceq_u32(); break; 4064 + case 0: gen_helper_neon_ceq_u8(CPU_T001); break;
  4065 + case 1: gen_helper_neon_ceq_u16(CPU_T001); break;
  4066 + case 2: gen_helper_neon_ceq_u32(CPU_T001); break;
4025 default: return 1; 4067 default: return 1;
4026 } 4068 }
4027 } 4069 }
4028 break; 4070 break;
4029 case 18: /* Multiply. */ 4071 case 18: /* Multiply. */
4030 switch (size) { 4072 switch (size) {
4031 - case 0: gen_op_neon_mul_u8(); break;  
4032 - case 1: gen_op_neon_mul_u16(); break; 4073 + case 0: gen_helper_neon_mul_u8(CPU_T001); break;
  4074 + case 1: gen_helper_neon_mul_u16(CPU_T001); break;
4033 case 2: gen_op_mul_T0_T1(); break; 4075 case 2: gen_op_mul_T0_T1(); break;
4034 default: return 1; 4076 default: return 1;
4035 } 4077 }
4036 NEON_GET_REG(T1, rd, pass); 4078 NEON_GET_REG(T1, rd, pass);
4037 if (u) { /* VMLS */ 4079 if (u) { /* VMLS */
4038 - switch (size) {  
4039 - case 0: gen_op_neon_rsb_u8(); break;  
4040 - case 1: gen_op_neon_rsb_u16(); break;  
4041 - case 2: gen_op_rsbl_T0_T1(); break;  
4042 - default: return 1;  
4043 - } 4080 + gen_neon_rsb(size);
4044 } else { /* VMLA */ 4081 } else { /* VMLA */
4045 gen_neon_add(size); 4082 gen_neon_add(size);
4046 } 4083 }
4047 break; 4084 break;
4048 case 19: /* VMUL */ 4085 case 19: /* VMUL */
4049 if (u) { /* polynomial */ 4086 if (u) { /* polynomial */
4050 - gen_op_neon_mul_p8(); 4087 + gen_helper_neon_mul_p8(CPU_T001);
4051 } else { /* Integer */ 4088 } else { /* Integer */
4052 switch (size) { 4089 switch (size) {
4053 - case 0: gen_op_neon_mul_u8(); break;  
4054 - case 1: gen_op_neon_mul_u16(); break; 4090 + case 0: gen_helper_neon_mul_u8(CPU_T001); break;
  4091 + case 1: gen_helper_neon_mul_u16(CPU_T001); break;
4055 case 2: gen_op_mul_T0_T1(); break; 4092 case 2: gen_op_mul_T0_T1(); break;
4056 default: return 1; 4093 default: return 1;
4057 } 4094 }
@@ -4066,14 +4103,14 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4066,14 +4103,14 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4066 case 22: /* Hultiply high. */ 4103 case 22: /* Hultiply high. */
4067 if (!u) { /* VQDMULH */ 4104 if (!u) { /* VQDMULH */
4068 switch (size) { 4105 switch (size) {
4069 - case 1: gen_op_neon_qdmulh_s16(); break;  
4070 - case 2: gen_op_neon_qdmulh_s32(); break; 4106 + case 1: gen_helper_neon_qdmulh_s16(CPU_T0E01); break;
  4107 + case 2: gen_helper_neon_qdmulh_s32(CPU_T0E01); break;
4071 default: return 1; 4108 default: return 1;
4072 } 4109 }
4073 } else { /* VQRDHMUL */ 4110 } else { /* VQRDHMUL */
4074 switch (size) { 4111 switch (size) {
4075 - case 1: gen_op_neon_qrdmulh_s16(); break;  
4076 - case 2: gen_op_neon_qrdmulh_s32(); break; 4112 + case 1: gen_helper_neon_qrdmulh_s16(CPU_T0E01); break;
  4113 + case 2: gen_helper_neon_qrdmulh_s32(CPU_T0E01); break;
4077 default: return 1; 4114 default: return 1;
4078 } 4115 }
4079 } 4116 }
@@ -4082,8 +4119,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4082,8 +4119,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4082 if (u) 4119 if (u)
4083 return 1; 4120 return 1;
4084 switch (size) { 4121 switch (size) {
4085 - case 0: gen_op_neon_padd_u8(); break;  
4086 - case 1: gen_op_neon_padd_u16(); break; 4122 + case 0: gen_helper_neon_padd_u8(CPU_T001); break;
  4123 + case 1: gen_helper_neon_padd_u16(CPU_T001); break;
4087 case 2: gen_op_addl_T0_T1(); break; 4124 case 2: gen_op_addl_T0_T1(); break;
4088 default: return 1; 4125 default: return 1;
4089 } 4126 }
@@ -4091,55 +4128,55 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4091,55 +4128,55 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4091 case 26: /* Floating point arithnetic. */ 4128 case 26: /* Floating point arithnetic. */
4092 switch ((u << 2) | size) { 4129 switch ((u << 2) | size) {
4093 case 0: /* VADD */ 4130 case 0: /* VADD */
4094 - gen_op_neon_add_f32(); 4131 + gen_helper_neon_add_f32(CPU_T001);
4095 break; 4132 break;
4096 case 2: /* VSUB */ 4133 case 2: /* VSUB */
4097 - gen_op_neon_sub_f32(); 4134 + gen_helper_neon_sub_f32(CPU_T001);
4098 break; 4135 break;
4099 case 4: /* VPADD */ 4136 case 4: /* VPADD */
4100 - gen_op_neon_add_f32(); 4137 + gen_helper_neon_add_f32(CPU_T001);
4101 break; 4138 break;
4102 case 6: /* VABD */ 4139 case 6: /* VABD */
4103 - gen_op_neon_abd_f32(); 4140 + gen_helper_neon_abd_f32(CPU_T001);
4104 break; 4141 break;
4105 default: 4142 default:
4106 return 1; 4143 return 1;
4107 } 4144 }
4108 break; 4145 break;
4109 case 27: /* Float multiply. */ 4146 case 27: /* Float multiply. */
4110 - gen_op_neon_mul_f32(); 4147 + gen_helper_neon_mul_f32(CPU_T001);
4111 if (!u) { 4148 if (!u) {
4112 NEON_GET_REG(T1, rd, pass); 4149 NEON_GET_REG(T1, rd, pass);
4113 if (size == 0) { 4150 if (size == 0) {
4114 - gen_op_neon_add_f32(); 4151 + gen_helper_neon_add_f32(CPU_T001);
4115 } else { 4152 } else {
4116 - gen_op_neon_rsb_f32(); 4153 + gen_helper_neon_sub_f32(cpu_T[0], cpu_T[1], cpu_T[0]);
4117 } 4154 }
4118 } 4155 }
4119 break; 4156 break;
4120 case 28: /* Float compare. */ 4157 case 28: /* Float compare. */
4121 if (!u) { 4158 if (!u) {
4122 - gen_op_neon_ceq_f32(); 4159 + gen_helper_neon_ceq_f32(CPU_T001);
4123 } else { 4160 } else {
4124 if (size == 0) 4161 if (size == 0)
4125 - gen_op_neon_cge_f32(); 4162 + gen_helper_neon_cge_f32(CPU_T001);
4126 else 4163 else
4127 - gen_op_neon_cgt_f32(); 4164 + gen_helper_neon_cgt_f32(CPU_T001);
4128 } 4165 }
4129 break; 4166 break;
4130 case 29: /* Float compare absolute. */ 4167 case 29: /* Float compare absolute. */
4131 if (!u) 4168 if (!u)
4132 return 1; 4169 return 1;
4133 if (size == 0) 4170 if (size == 0)
4134 - gen_op_neon_acge_f32(); 4171 + gen_helper_neon_acge_f32(CPU_T001);
4135 else 4172 else
4136 - gen_op_neon_acgt_f32(); 4173 + gen_helper_neon_acgt_f32(CPU_T001);
4137 break; 4174 break;
4138 case 30: /* Float min/max. */ 4175 case 30: /* Float min/max. */
4139 if (size == 0) 4176 if (size == 0)
4140 - gen_op_neon_max_f32(); 4177 + gen_helper_neon_max_f32(CPU_T001);
4141 else 4178 else
4142 - gen_op_neon_min_f32(); 4179 + gen_helper_neon_min_f32(CPU_T001);
4143 break; 4180 break;
4144 case 31: 4181 case 31:
4145 if (size == 0) 4182 if (size == 0)
@@ -4166,6 +4203,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4166,6 +4203,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4166 NEON_SET_REG(T0, rd, pass); 4203 NEON_SET_REG(T0, rd, pass);
4167 } 4204 }
4168 } 4205 }
  4206 + /* End of 3 register same size operations. */
4169 } else if (insn & (1 << 4)) { 4207 } else if (insn & (1 << 4)) {
4170 if ((insn & 0x00380080) != 0) { 4208 if ((insn & 0x00380080) != 0) {
4171 /* Two registers and shift. */ 4209 /* Two registers and shift. */
@@ -4212,181 +4250,221 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4212,181 +4250,221 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4212 } 4250 }
4213 4251
4214 for (pass = 0; pass < count; pass++) { 4252 for (pass = 0; pass < count; pass++) {
4215 - if (size < 3) {  
4216 - /* Operands in T0 and T1. */  
4217 - gen_op_movl_T1_im(imm);  
4218 - NEON_GET_REG(T0, rm, pass);  
4219 - } else {  
4220 - /* Operands in {T0, T1} and env->vfp.scratch. */  
4221 - gen_op_movl_T0_im(imm);  
4222 - gen_neon_movl_scratch_T0(0);  
4223 - gen_op_movl_T0_im((int32_t)imm >> 31);  
4224 - gen_neon_movl_scratch_T0(1);  
4225 - NEON_GET_REG(T0, rm, pass * 2);  
4226 - NEON_GET_REG(T1, rm, pass * 2 + 1);  
4227 - }  
4228 -  
4229 - if (gen_neon_shift_im[op][u][size] == NULL)  
4230 - return 1;  
4231 - gen_neon_shift_im[op][u][size]();  
4232 -  
4233 - if (op == 1 || op == 3) {  
4234 - /* Accumulate. */  
4235 - if (size == 3) {  
4236 - gen_neon_movl_scratch_T0(0);  
4237 - gen_neon_movl_scratch_T1(1);  
4238 - NEON_GET_REG(T0, rd, pass * 2);  
4239 - NEON_GET_REG(T1, rd, pass * 2 + 1);  
4240 - gen_op_neon_addl_u64();  
4241 - } else {  
4242 - NEON_GET_REG(T1, rd, pass);  
4243 - gen_neon_add(size);  
4244 - }  
4245 - } else if (op == 4 || (op == 5 && u)) {  
4246 - /* Insert */  
4247 - if (size == 3) {  
4248 - cpu_abort(env, "VS[LR]I.64 not implemented");  
4249 - }  
4250 - switch (size) {  
4251 - case 0:  
4252 - if (op == 4)  
4253 - imm = 0xff >> -shift; 4253 + if (size == 3) {
  4254 + neon_load_reg64(cpu_V0, rm + pass);
  4255 + tcg_gen_movi_i64(cpu_V1, imm);
  4256 + switch (op) {
  4257 + case 0: /* VSHR */
  4258 + case 1: /* VSRA */
  4259 + if (u)
  4260 + gen_helper_neon_shl_u64(cpu_V0, cpu_V0, cpu_V1);
4254 else 4261 else
4255 - imm = (uint8_t)(0xff << shift);  
4256 - imm |= imm << 8;  
4257 - imm |= imm << 16; 4262 + gen_helper_neon_shl_s64(cpu_V0, cpu_V0, cpu_V1);
4258 break; 4263 break;
4259 - case 1:  
4260 - if (op == 4)  
4261 - imm = 0xffff >> -shift; 4264 + case 2: /* VRSHR */
  4265 + case 3: /* VRSRA */
  4266 + if (u)
  4267 + gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, cpu_V1);
4262 else 4268 else
4263 - imm = (uint16_t)(0xffff << shift);  
4264 - imm |= imm << 16; 4269 + gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, cpu_V1);
4265 break; 4270 break;
4266 - case 2:  
4267 - if (op == 4)  
4268 - imm = 0xffffffffu >> -shift; 4271 + case 4: /* VSRI */
  4272 + if (!u)
  4273 + return 1;
  4274 + gen_helper_neon_shl_u64(cpu_V0, cpu_V0, cpu_V1);
  4275 + break;
  4276 + case 5: /* VSHL, VSLI */
  4277 + gen_helper_neon_shl_u64(cpu_V0, cpu_V0, cpu_V1);
  4278 + break;
  4279 + case 6: /* VQSHL */
  4280 + if (u)
  4281 + gen_helper_neon_qshl_u64(cpu_V0, cpu_env, cpu_V0, cpu_V1);
4269 else 4282 else
4270 - imm = 0xffffffffu << shift; 4283 + gen_helper_neon_qshl_s64(cpu_V0, cpu_env, cpu_V0, cpu_V1);
  4284 + break;
  4285 + case 7: /* VQSHLU */
  4286 + gen_helper_neon_qshl_u64(cpu_V0, cpu_env, cpu_V0, cpu_V1);
4271 break; 4287 break;
4272 - default:  
4273 - abort();  
4274 } 4288 }
4275 - tmp = neon_load_reg(rd, pass);  
4276 - tcg_gen_andi_i32(cpu_T[0], cpu_T[0], imm);  
4277 - tcg_gen_andi_i32(tmp, tmp, ~imm);  
4278 - tcg_gen_or_i32(cpu_T[0], cpu_T[0], tmp);  
4279 - }  
4280 - if (size == 3) {  
4281 - NEON_SET_REG(T0, rd, pass * 2);  
4282 - NEON_SET_REG(T1, rd, pass * 2 + 1);  
4283 - } else { 4289 + if (op == 1 || op == 3) {
  4290 + /* Accumulate. */
  4291 + neon_load_reg64(cpu_V0, rd + pass);
  4292 + tcg_gen_add_i64(cpu_V0, cpu_V0, cpu_V1);
  4293 + } else if (op == 4 || (op == 5 && u)) {
  4294 + /* Insert */
  4295 + cpu_abort(env, "VS[LR]I.64 not implemented");
  4296 + }
  4297 + neon_store_reg64(cpu_V0, rd + pass);
  4298 + } else { /* size < 3 */
  4299 + /* Operands in T0 and T1. */
  4300 + gen_op_movl_T1_im(imm);
  4301 + NEON_GET_REG(T0, rm, pass);
  4302 + switch (op) {
  4303 + case 0: /* VSHR */
  4304 + case 1: /* VSRA */
  4305 + GEN_NEON_INTEGER_OP(shl);
  4306 + break;
  4307 + case 2: /* VRSHR */
  4308 + case 3: /* VRSRA */
  4309 + GEN_NEON_INTEGER_OP(rshl);
  4310 + break;
  4311 + case 4: /* VSRI */
  4312 + if (!u)
  4313 + return 1;
  4314 + GEN_NEON_INTEGER_OP(shl);
  4315 + break;
  4316 + case 5: /* VSHL, VSLI */
  4317 + switch (size) {
  4318 + case 0: gen_helper_neon_shl_u8(CPU_T001); break;
  4319 + case 1: gen_helper_neon_shl_u16(CPU_T001); break;
  4320 + case 2: gen_helper_neon_shl_u32(CPU_T001); break;
  4321 + default: return 1;
  4322 + }
  4323 + break;
  4324 + case 6: /* VQSHL */
  4325 + GEN_NEON_INTEGER_OP_ENV(qshl);
  4326 + break;
  4327 + case 7: /* VQSHLU */
  4328 + switch (size) {
  4329 + case 0: gen_helper_neon_qshl_u8(CPU_T0E01); break;
  4330 + case 1: gen_helper_neon_qshl_u16(CPU_T0E01); break;
  4331 + case 2: gen_helper_neon_qshl_u32(CPU_T0E01); break;
  4332 + default: return 1;
  4333 + }
  4334 + break;
  4335 + }
  4336 +
  4337 + if (op == 1 || op == 3) {
  4338 + /* Accumulate. */
  4339 + NEON_GET_REG(T1, rd, pass);
  4340 + gen_neon_add(size);
  4341 + } else if (op == 4 || (op == 5 && u)) {
  4342 + /* Insert */
  4343 + switch (size) {
  4344 + case 0:
  4345 + if (op == 4)
  4346 + imm = 0xff >> -shift;
  4347 + else
  4348 + imm = (uint8_t)(0xff << shift);
  4349 + imm |= imm << 8;
  4350 + imm |= imm << 16;
  4351 + break;
  4352 + case 1:
  4353 + if (op == 4)
  4354 + imm = 0xffff >> -shift;
  4355 + else
  4356 + imm = (uint16_t)(0xffff << shift);
  4357 + imm |= imm << 16;
  4358 + break;
  4359 + case 2:
  4360 + if (op == 4)
  4361 + imm = 0xffffffffu >> -shift;
  4362 + else
  4363 + imm = 0xffffffffu << shift;
  4364 + break;
  4365 + default:
  4366 + abort();
  4367 + }
  4368 + tmp = neon_load_reg(rd, pass);
  4369 + tcg_gen_andi_i32(cpu_T[0], cpu_T[0], imm);
  4370 + tcg_gen_andi_i32(tmp, tmp, ~imm);
  4371 + tcg_gen_or_i32(cpu_T[0], cpu_T[0], tmp);
  4372 + }
4284 NEON_SET_REG(T0, rd, pass); 4373 NEON_SET_REG(T0, rd, pass);
4285 } 4374 }
4286 } /* for pass */ 4375 } /* for pass */
4287 } else if (op < 10) { 4376 } else if (op < 10) {
4288 - /* Shift by immedaiate and narrow: 4377 + /* Shift by immediate and narrow:
4289 VSHRN, VRSHRN, VQSHRN, VQRSHRN. */ 4378 VSHRN, VRSHRN, VQSHRN, VQRSHRN. */
4290 shift = shift - (1 << (size + 3)); 4379 shift = shift - (1 << (size + 3));
4291 size++; 4380 size++;
4292 - if (size == 3) {  
4293 - count = q + 1;  
4294 - } else {  
4295 - count = q ? 4: 2;  
4296 - }  
4297 switch (size) { 4381 switch (size) {
4298 case 1: 4382 case 1:
4299 - imm = (uint16_t) shift; 4383 + imm = (uint16_t)shift;
4300 imm |= imm << 16; 4384 imm |= imm << 16;
  4385 + tmp2 = tcg_const_i32(imm);
4301 break; 4386 break;
4302 case 2: 4387 case 2:
  4388 + imm = (uint32_t)shift;
  4389 + tmp2 = tcg_const_i32(imm);
4303 case 3: 4390 case 3:
4304 - imm = shift; 4391 + tmp2 = tcg_const_i64(shift);
4305 break; 4392 break;
4306 default: 4393 default:
4307 abort(); 4394 abort();
4308 } 4395 }
4309 4396
4310 - /* Processing MSB first means we need to do less shuffling at  
4311 - the end. */  
4312 - for (pass = count - 1; pass >= 0; pass--) {  
4313 - /* Avoid clobbering the second operand before it has been  
4314 - written. */  
4315 - n = pass;  
4316 - if (rd == rm)  
4317 - n ^= (count - 1);  
4318 - else  
4319 - n = pass;  
4320 -  
4321 - if (size < 3) {  
4322 - /* Operands in T0 and T1. */  
4323 - gen_op_movl_T1_im(imm);  
4324 - NEON_GET_REG(T0, rm, n); 4397 + for (pass = 0; pass < 2; pass++) {
  4398 + if (size == 3) {
  4399 + neon_load_reg64(cpu_V0, rm + pass);
  4400 + if (q) {
  4401 + if (u)
  4402 + gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, tmp2);
  4403 + else
  4404 + gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, tmp2);
  4405 + } else {
  4406 + if (u)
  4407 + gen_helper_neon_shl_u64(cpu_V0, cpu_V0, tmp2);
  4408 + else
  4409 + gen_helper_neon_shl_s64(cpu_V0, cpu_V0, tmp2);
  4410 + }
4325 } else { 4411 } else {
4326 - /* Operands in {T0, T1} and env->vfp.scratch. */  
4327 - gen_op_movl_T0_im(imm);  
4328 - gen_neon_movl_scratch_T0(0);  
4329 - gen_op_movl_T0_im((int32_t)imm >> 31);  
4330 - gen_neon_movl_scratch_T0(1);  
4331 - NEON_GET_REG(T0, rm, n * 2);  
4332 - NEON_GET_REG(T0, rm, n * 2 + 1); 4412 + tmp = neon_load_reg(rm + pass, 0);
  4413 + gen_neon_shift_narrow(size, tmp, tmp2, q, u);
  4414 + tcg_gen_extu_i32_i64(cpu_V0, tmp);
  4415 + dead_tmp(tmp);
  4416 + tmp = neon_load_reg(rm + pass, 1);
  4417 + gen_neon_shift_narrow(size, tmp, tmp2, q, u);
  4418 + tcg_gen_extu_i32_i64(cpu_V1, tmp);
  4419 + dead_tmp(tmp);
  4420 + tcg_gen_shli_i64(cpu_V1, cpu_V1, 32);
  4421 + tcg_gen_or_i64(cpu_V0, cpu_V0, cpu_V1);
4333 } 4422 }
4334 -  
4335 - gen_neon_shift_im_narrow[q][u][size - 1]();  
4336 -  
4337 - if (size < 3 && (pass & 1) == 0) {  
4338 - gen_neon_movl_scratch_T0(0); 4423 + tmp = new_tmp();
  4424 + if (op == 8 && !u) {
  4425 + gen_neon_narrow(size - 1, tmp, cpu_V0);
4339 } else { 4426 } else {
4340 - uint32_t offset;  
4341 -  
4342 - if (size < 3)  
4343 - gen_neon_movl_T1_scratch(0);  
4344 -  
4345 - if (op == 8 && !u) {  
4346 - gen_neon_narrow[size - 1]();  
4347 - } else {  
4348 - if (op == 8)  
4349 - gen_neon_narrow_sats[size - 2]();  
4350 - else  
4351 - gen_neon_narrow_satu[size - 1]();  
4352 - }  
4353 - if (size == 3)  
4354 - offset = neon_reg_offset(rd, n); 4427 + if (op == 8)
  4428 + gen_neon_narrow_sats(size - 1, tmp, cpu_V0);
4355 else 4429 else
4356 - offset = neon_reg_offset(rd, n >> 1);  
4357 - gen_op_neon_setreg_T0(offset); 4430 + gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
  4431 + }
  4432 + if (pass == 0) {
  4433 + tmp2 = tmp;
  4434 + } else {
  4435 + neon_store_reg(rd, 0, tmp2);
  4436 + neon_store_reg(rd, 1, tmp);
4358 } 4437 }
4359 } /* for pass */ 4438 } /* for pass */
4360 } else if (op == 10) { 4439 } else if (op == 10) {
4361 /* VSHLL */ 4440 /* VSHLL */
4362 - if (q) 4441 + if (q || size == 3)
4363 return 1; 4442 return 1;
  4443 + tmp = neon_load_reg(rm, 0);
  4444 + tmp2 = neon_load_reg(rm, 1);
4364 for (pass = 0; pass < 2; pass++) { 4445 for (pass = 0; pass < 2; pass++) {
4365 - /* Avoid clobbering the input operand. */  
4366 - if (rd == rm)  
4367 - n = 1 - pass;  
4368 - else  
4369 - n = pass; 4446 + if (pass == 1)
  4447 + tmp = tmp2;
  4448 +
  4449 + gen_neon_widen(cpu_V0, tmp, size, u);
4370 4450
4371 - NEON_GET_REG(T0, rm, n);  
4372 - GEN_NEON_INTEGER_OP(widen);  
4373 if (shift != 0) { 4451 if (shift != 0) {
4374 /* The shift is less than the width of the source 4452 /* The shift is less than the width of the source
4375 - type, so in some cases we can just  
4376 - shift the whole register. */  
4377 - if (size == 1 || (size == 0 && u)) {  
4378 - gen_op_shll_T0_im(shift);  
4379 - gen_op_shll_T1_im(shift);  
4380 - } else {  
4381 - switch (size) {  
4382 - case 0: gen_op_neon_shll_u16(shift); break;  
4383 - case 2: gen_op_neon_shll_u64(shift); break;  
4384 - default: abort(); 4453 + type, so we can just shift the whole register. */
  4454 + tcg_gen_shli_i64(cpu_V0, cpu_V0, shift);
  4455 + if (size < 2 || !u) {
  4456 + uint64_t imm64;
  4457 + if (size == 0) {
  4458 + imm = (0xffu >> (8 - shift));
  4459 + imm |= imm << 16;
  4460 + } else {
  4461 + imm = 0xffff >> (16 - shift);
4385 } 4462 }
  4463 + imm64 = imm | (((uint64_t)imm) << 32);
  4464 + tcg_gen_andi_i64(cpu_V0, cpu_V0, imm64);
4386 } 4465 }
4387 } 4466 }
4388 - NEON_SET_REG(T0, rd, n * 2);  
4389 - NEON_SET_REG(T1, rd, n * 2 + 1); 4467 + neon_store_reg64(cpu_V0, rd + pass);
4390 } 4468 }
4391 } else if (op == 15 || op == 16) { 4469 } else if (op == 15 || op == 16) {
4392 /* VCVT fixed-point. */ 4470 /* VCVT fixed-point. */
@@ -4458,28 +4536,30 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4458,28 +4536,30 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4458 4536
4459 for (pass = 0; pass < (q ? 4 : 2); pass++) { 4537 for (pass = 0; pass < (q ? 4 : 2); pass++) {
4460 if (op & 1 && op < 12) { 4538 if (op & 1 && op < 12) {
4461 - NEON_GET_REG(T0, rd, pass); 4539 + tmp = neon_load_reg(rd, pass);
4462 if (invert) { 4540 if (invert) {
4463 /* The immediate value has already been inverted, so 4541 /* The immediate value has already been inverted, so
4464 BIC becomes AND. */ 4542 BIC becomes AND. */
4465 - gen_op_andl_T0_T1(); 4543 + tcg_gen_andi_i32(tmp, tmp, imm);
4466 } else { 4544 } else {
4467 - gen_op_orl_T0_T1(); 4545 + tcg_gen_ori_i32(tmp, tmp, imm);
4468 } 4546 }
4469 - NEON_SET_REG(T0, rd, pass);  
4470 } else { 4547 } else {
  4548 + /* VMOV, VMVN. */
  4549 + tmp = new_tmp();
4471 if (op == 14 && invert) { 4550 if (op == 14 && invert) {
4472 - uint32_t tmp;  
4473 - tmp = 0; 4551 + uint32_t val;
  4552 + val = 0;
4474 for (n = 0; n < 4; n++) { 4553 for (n = 0; n < 4; n++) {
4475 if (imm & (1 << (n + (pass & 1) * 4))) 4554 if (imm & (1 << (n + (pass & 1) * 4)))
4476 - tmp |= 0xff << (n * 8); 4555 + val |= 0xff << (n * 8);
4477 } 4556 }
4478 - gen_op_movl_T1_im(tmp); 4557 + tcg_gen_movi_i32(tmp, val);
  4558 + } else {
  4559 + tcg_gen_movi_i32(tmp, imm);
4479 } 4560 }
4480 - /* VMOV, VMVN. */  
4481 - NEON_SET_REG(T1, rd, pass);  
4482 } 4561 }
  4562 + neon_store_reg(rd, pass, tmp);
4483 } 4563 }
4484 } 4564 }
4485 } else { /* (insn & 0x00800010 == 0x00800010) */ 4565 } else { /* (insn & 0x00800010 == 0x00800010) */
@@ -4513,6 +4593,9 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4513,6 +4593,9 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4513 src1_wide = neon_3reg_wide[op][1]; 4593 src1_wide = neon_3reg_wide[op][1];
4514 src2_wide = neon_3reg_wide[op][2]; 4594 src2_wide = neon_3reg_wide[op][2];
4515 4595
  4596 + if (size == 0 && (op == 9 || op == 11 || op == 13))
  4597 + return 1;
  4598 +
4516 /* Avoid overlapping operands. Wide source operands are 4599 /* Avoid overlapping operands. Wide source operands are
4517 always aligned so will never overlap with wide 4600 always aligned so will never overlap with wide
4518 destinations in problematic ways. */ 4601 destinations in problematic ways. */
@@ -4524,87 +4607,69 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4524,87 +4607,69 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4524 gen_neon_movl_scratch_T0(2); 4607 gen_neon_movl_scratch_T0(2);
4525 } 4608 }
4526 for (pass = 0; pass < 2; pass++) { 4609 for (pass = 0; pass < 2; pass++) {
4527 - /* Load the second operand into env->vfp.scratch.  
4528 - Also widen narrow operands. */  
4529 - if (src2_wide) {  
4530 - NEON_GET_REG(T0, rm, pass * 2);  
4531 - NEON_GET_REG(T1, rm, pass * 2 + 1); 4610 + if (src1_wide) {
  4611 + neon_load_reg64(cpu_V0, rn + pass);
4532 } else { 4612 } else {
4533 - if (pass == 1 && rd == rm) {  
4534 - if (prewiden) {  
4535 - gen_neon_movl_T0_scratch(2);  
4536 - } else {  
4537 - gen_neon_movl_T1_scratch(2);  
4538 - } 4613 + if (pass == 1 && rd == rn) {
  4614 + gen_neon_movl_T0_scratch(2);
  4615 + tmp = new_tmp();
  4616 + tcg_gen_mov_i32(tmp, cpu_T[0]);
4539 } else { 4617 } else {
4540 - if (prewiden) {  
4541 - NEON_GET_REG(T0, rm, pass);  
4542 - } else {  
4543 - NEON_GET_REG(T1, rm, pass);  
4544 - } 4618 + tmp = neon_load_reg(rn, pass);
  4619 + }
  4620 + if (prewiden) {
  4621 + gen_neon_widen(cpu_V0, tmp, size, u);
4545 } 4622 }
4546 } 4623 }
4547 - if (prewiden && !src2_wide) {  
4548 - GEN_NEON_INTEGER_OP(widen);  
4549 - }  
4550 - if (prewiden || src2_wide) {  
4551 - gen_neon_movl_scratch_T0(0);  
4552 - gen_neon_movl_scratch_T1(1);  
4553 - }  
4554 -  
4555 - /* Load the first operand. */  
4556 - if (src1_wide) {  
4557 - NEON_GET_REG(T0, rn, pass * 2);  
4558 - NEON_GET_REG(T1, rn, pass * 2 + 1); 4624 + if (src2_wide) {
  4625 + neon_load_reg64(cpu_V1, rm + pass);
4559 } else { 4626 } else {
4560 - if (pass == 1 && rd == rn) { 4627 + if (pass == 1 && rd == rm) {
4561 gen_neon_movl_T0_scratch(2); 4628 gen_neon_movl_T0_scratch(2);
  4629 + tmp2 = new_tmp();
  4630 + tcg_gen_mov_i32(tmp2, cpu_T[0]);
4562 } else { 4631 } else {
4563 - NEON_GET_REG(T0, rn, pass); 4632 + tmp2 = neon_load_reg(rm, pass);
  4633 + }
  4634 + if (prewiden) {
  4635 + gen_neon_widen(cpu_V1, tmp2, size, u);
4564 } 4636 }
4565 - }  
4566 - if (prewiden && !src1_wide) {  
4567 - GEN_NEON_INTEGER_OP(widen);  
4568 } 4637 }
4569 switch (op) { 4638 switch (op) {
4570 case 0: case 1: case 4: /* VADDL, VADDW, VADDHN, VRADDHN */ 4639 case 0: case 1: case 4: /* VADDL, VADDW, VADDHN, VRADDHN */
4571 - switch (size) {  
4572 - case 0: gen_op_neon_addl_u16(); break;  
4573 - case 1: gen_op_neon_addl_u32(); break;  
4574 - case 2: gen_op_neon_addl_u64(); break;  
4575 - default: abort();  
4576 - } 4640 + gen_neon_addl(size);
4577 break; 4641 break;
4578 case 2: case 3: case 6: /* VSUBL, VSUBW, VSUBHL, VRSUBHL */ 4642 case 2: case 3: case 6: /* VSUBL, VSUBW, VSUBHL, VRSUBHL */
4579 - switch (size) {  
4580 - case 0: gen_op_neon_subl_u16(); break;  
4581 - case 1: gen_op_neon_subl_u32(); break;  
4582 - case 2: gen_op_neon_subl_u64(); break;  
4583 - default: abort();  
4584 - } 4643 + gen_neon_subl(size);
4585 break; 4644 break;
4586 case 5: case 7: /* VABAL, VABDL */ 4645 case 5: case 7: /* VABAL, VABDL */
4587 switch ((size << 1) | u) { 4646 switch ((size << 1) | u) {
4588 - case 0: gen_op_neon_abdl_s16(); break;  
4589 - case 1: gen_op_neon_abdl_u16(); break;  
4590 - case 2: gen_op_neon_abdl_s32(); break;  
4591 - case 3: gen_op_neon_abdl_u32(); break;  
4592 - case 4: gen_op_neon_abdl_s64(); break;  
4593 - case 5: gen_op_neon_abdl_u64(); break; 4647 + case 0:
  4648 + gen_helper_neon_abdl_s16(cpu_V0, tmp, tmp2);
  4649 + break;
  4650 + case 1:
  4651 + gen_helper_neon_abdl_u16(cpu_V0, tmp, tmp2);
  4652 + break;
  4653 + case 2:
  4654 + gen_helper_neon_abdl_s32(cpu_V0, tmp, tmp2);
  4655 + break;
  4656 + case 3:
  4657 + gen_helper_neon_abdl_u32(cpu_V0, tmp, tmp2);
  4658 + break;
  4659 + case 4:
  4660 + gen_helper_neon_abdl_s64(cpu_V0, tmp, tmp2);
  4661 + break;
  4662 + case 5:
  4663 + gen_helper_neon_abdl_u64(cpu_V0, tmp, tmp2);
  4664 + break;
4594 default: abort(); 4665 default: abort();
4595 } 4666 }
  4667 + dead_tmp(tmp2);
  4668 + dead_tmp(tmp);
4596 break; 4669 break;
4597 case 8: case 9: case 10: case 11: case 12: case 13: 4670 case 8: case 9: case 10: case 11: case 12: case 13:
4598 /* VMLAL, VQDMLAL, VMLSL, VQDMLSL, VMULL, VQDMULL */ 4671 /* VMLAL, VQDMLAL, VMLSL, VQDMLSL, VMULL, VQDMULL */
4599 - switch ((size << 1) | u) {  
4600 - case 0: gen_op_neon_mull_s8(); break;  
4601 - case 1: gen_op_neon_mull_u8(); break;  
4602 - case 2: gen_op_neon_mull_s16(); break;  
4603 - case 3: gen_op_neon_mull_u16(); break;  
4604 - case 4: gen_op_imull_T0_T1(); break;  
4605 - case 5: gen_op_mull_T0_T1(); break;  
4606 - default: abort();  
4607 - } 4672 + gen_neon_mull(cpu_V0, tmp, tmp2, size, u);
4608 break; 4673 break;
4609 case 14: /* Polynomial VMULL */ 4674 case 14: /* Polynomial VMULL */
4610 cpu_abort(env, "Polynomial VMULL not implemented"); 4675 cpu_abort(env, "Polynomial VMULL not implemented");
@@ -4615,72 +4680,71 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4615,72 +4680,71 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4615 if (op == 5 || op == 13 || (op >= 8 && op <= 11)) { 4680 if (op == 5 || op == 13 || (op >= 8 && op <= 11)) {
4616 /* Accumulate. */ 4681 /* Accumulate. */
4617 if (op == 10 || op == 11) { 4682 if (op == 10 || op == 11) {
4618 - switch (size) {  
4619 - case 0: gen_op_neon_negl_u16(); break;  
4620 - case 1: gen_op_neon_negl_u32(); break;  
4621 - case 2: gen_op_neon_negl_u64(); break;  
4622 - default: abort();  
4623 - } 4683 + gen_neon_negl(cpu_V0, size);
4624 } 4684 }
4625 4685
4626 - gen_neon_movl_scratch_T0(0);  
4627 - gen_neon_movl_scratch_T1(1);  
4628 -  
4629 if (op != 13) { 4686 if (op != 13) {
4630 - NEON_GET_REG(T0, rd, pass * 2);  
4631 - NEON_GET_REG(T1, rd, pass * 2 + 1); 4687 + neon_load_reg64(cpu_V1, rd + pass);
4632 } 4688 }
4633 4689
4634 switch (op) { 4690 switch (op) {
4635 case 5: case 8: case 10: /* VABAL, VMLAL, VMLSL */ 4691 case 5: case 8: case 10: /* VABAL, VMLAL, VMLSL */
4636 - switch (size) {  
4637 - case 0: gen_op_neon_addl_u16(); break;  
4638 - case 1: gen_op_neon_addl_u32(); break;  
4639 - case 2: gen_op_neon_addl_u64(); break;  
4640 - default: abort();  
4641 - } 4692 + gen_neon_addl(size);
4642 break; 4693 break;
4643 case 9: case 11: /* VQDMLAL, VQDMLSL */ 4694 case 9: case 11: /* VQDMLAL, VQDMLSL */
4644 - switch (size) {  
4645 - case 1: gen_op_neon_addl_saturate_s32(); break;  
4646 - case 2: gen_op_neon_addl_saturate_s64(); break;  
4647 - default: abort();  
4648 - } 4695 + gen_neon_addl_saturate(cpu_V0, cpu_V0, size);
  4696 + gen_neon_addl_saturate(cpu_V0, cpu_V1, size);
  4697 + break;
4649 /* Fall through. */ 4698 /* Fall through. */
4650 case 13: /* VQDMULL */ 4699 case 13: /* VQDMULL */
4651 - switch (size) {  
4652 - case 1: gen_op_neon_addl_saturate_s32(); break;  
4653 - case 2: gen_op_neon_addl_saturate_s64(); break;  
4654 - default: abort();  
4655 - } 4700 + gen_neon_addl_saturate(cpu_V0, cpu_V0, size);
4656 break; 4701 break;
4657 default: 4702 default:
4658 abort(); 4703 abort();
4659 } 4704 }
4660 - NEON_SET_REG(T0, rd, pass * 2);  
4661 - NEON_SET_REG(T1, rd, pass * 2 + 1); 4705 + neon_store_reg64(cpu_V0, rd + pass);
4662 } else if (op == 4 || op == 6) { 4706 } else if (op == 4 || op == 6) {
4663 /* Narrowing operation. */ 4707 /* Narrowing operation. */
  4708 + tmp = new_tmp();
4664 if (u) { 4709 if (u) {
4665 switch (size) { 4710 switch (size) {
4666 - case 0: gen_op_neon_narrow_high_u8(); break;  
4667 - case 1: gen_op_neon_narrow_high_u16(); break;  
4668 - case 2: gen_op_movl_T0_T1(); break; 4711 + case 0:
  4712 + gen_helper_neon_narrow_high_u8(tmp, cpu_V0);
  4713 + break;
  4714 + case 1:
  4715 + gen_helper_neon_narrow_high_u16(tmp, cpu_V0);
  4716 + break;
  4717 + case 2:
  4718 + tcg_gen_shri_i64(cpu_V0, cpu_V0, 32);
  4719 + tcg_gen_trunc_i64_i32(tmp, cpu_V0);
  4720 + break;
4669 default: abort(); 4721 default: abort();
4670 } 4722 }
4671 } else { 4723 } else {
4672 switch (size) { 4724 switch (size) {
4673 - case 0: gen_op_neon_narrow_high_round_u8(); break;  
4674 - case 1: gen_op_neon_narrow_high_round_u16(); break;  
4675 - case 2: gen_op_neon_narrow_high_round_u32(); break; 4725 + case 0:
  4726 + gen_helper_neon_narrow_round_high_u8(tmp, cpu_V0);
  4727 + break;
  4728 + case 1:
  4729 + gen_helper_neon_narrow_round_high_u16(tmp, cpu_V0);
  4730 + break;
  4731 + case 2:
  4732 + tcg_gen_addi_i64(cpu_V0, cpu_V0, 1u << 31);
  4733 + tcg_gen_shri_i64(cpu_V0, cpu_V0, 32);
  4734 + tcg_gen_trunc_i64_i32(tmp, cpu_V0);
  4735 + break;
4676 default: abort(); 4736 default: abort();
4677 } 4737 }
4678 } 4738 }
4679 - NEON_SET_REG(T0, rd, pass); 4739 + if (pass == 0) {
  4740 + tmp3 = tmp;
  4741 + } else {
  4742 + neon_store_reg(rd, 0, tmp3);
  4743 + neon_store_reg(rd, 1, tmp);
  4744 + }
4680 } else { 4745 } else {
4681 /* Write back the result. */ 4746 /* Write back the result. */
4682 - NEON_SET_REG(T0, rd, pass * 2);  
4683 - NEON_SET_REG(T1, rd, pass * 2 + 1); 4747 + neon_store_reg64(cpu_V0, rd + pass);
4684 } 4748 }
4685 } 4749 }
4686 } else { 4750 } else {
@@ -4702,22 +4766,22 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4702,22 +4766,22 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4702 NEON_GET_REG(T1, rn, pass); 4766 NEON_GET_REG(T1, rn, pass);
4703 if (op == 12) { 4767 if (op == 12) {
4704 if (size == 1) { 4768 if (size == 1) {
4705 - gen_op_neon_qdmulh_s16(); 4769 + gen_helper_neon_qdmulh_s16(CPU_T0E01);
4706 } else { 4770 } else {
4707 - gen_op_neon_qdmulh_s32(); 4771 + gen_helper_neon_qdmulh_s32(CPU_T0E01);
4708 } 4772 }
4709 } else if (op == 13) { 4773 } else if (op == 13) {
4710 if (size == 1) { 4774 if (size == 1) {
4711 - gen_op_neon_qrdmulh_s16(); 4775 + gen_helper_neon_qrdmulh_s16(CPU_T0E01);
4712 } else { 4776 } else {
4713 - gen_op_neon_qrdmulh_s32(); 4777 + gen_helper_neon_qrdmulh_s32(CPU_T0E01);
4714 } 4778 }
4715 } else if (op & 1) { 4779 } else if (op & 1) {
4716 - gen_op_neon_mul_f32(); 4780 + gen_helper_neon_mul_f32(CPU_T001);
4717 } else { 4781 } else {
4718 switch (size) { 4782 switch (size) {
4719 - case 0: gen_op_neon_mul_u8(); break;  
4720 - case 1: gen_op_neon_mul_u16(); break; 4783 + case 0: gen_helper_neon_mul_u8(CPU_T001); break;
  4784 + case 1: gen_helper_neon_mul_u16(CPU_T001); break;
4721 case 2: gen_op_mul_T0_T1(); break; 4785 case 2: gen_op_mul_T0_T1(); break;
4722 default: return 1; 4786 default: return 1;
4723 } 4787 }
@@ -4730,18 +4794,13 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4730,18 +4794,13 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4730 gen_neon_add(size); 4794 gen_neon_add(size);
4731 break; 4795 break;
4732 case 1: 4796 case 1:
4733 - gen_op_neon_add_f32(); 4797 + gen_helper_neon_add_f32(CPU_T001);
4734 break; 4798 break;
4735 case 4: 4799 case 4:
4736 - switch (size) {  
4737 - case 0: gen_op_neon_rsb_u8(); break;  
4738 - case 1: gen_op_neon_rsb_u16(); break;  
4739 - case 2: gen_op_rsbl_T0_T1(); break;  
4740 - default: return 1;  
4741 - } 4800 + gen_neon_rsb(size);
4742 break; 4801 break;
4743 case 5: 4802 case 5:
4744 - gen_op_neon_rsb_f32(); 4803 + gen_helper_neon_sub_f32(cpu_T[0], cpu_T[1], cpu_T[0]);
4745 break; 4804 break;
4746 default: 4805 default:
4747 abort(); 4806 abort();
@@ -4756,81 +4815,46 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4756,81 +4815,46 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4756 case 7: /* VQDMLSL scalar */ 4815 case 7: /* VQDMLSL scalar */
4757 case 10: /* VMULL scalar */ 4816 case 10: /* VMULL scalar */
4758 case 11: /* VQDMULL scalar */ 4817 case 11: /* VQDMULL scalar */
4759 - if (rd == rn) {  
4760 - /* Save overlapping operands before they are  
4761 - clobbered. */  
4762 - NEON_GET_REG(T0, rn, 1);  
4763 - gen_neon_movl_scratch_T0(2);  
4764 - } 4818 + if (size == 0 && (op == 3 || op == 7 || op == 11))
  4819 + return 1;
  4820 +
4765 gen_neon_get_scalar(size, rm); 4821 gen_neon_get_scalar(size, rm);
4766 - gen_neon_movl_scratch_T0(3); 4822 + NEON_GET_REG(T1, rn, 1);
  4823 +
4767 for (pass = 0; pass < 2; pass++) { 4824 for (pass = 0; pass < 2; pass++) {
4768 - if (pass != 0) {  
4769 - gen_neon_movl_T0_scratch(3);  
4770 - }  
4771 - if (pass != 0 && rd == rn) {  
4772 - gen_neon_movl_T1_scratch(2); 4825 + if (pass == 0) {
  4826 + tmp = neon_load_reg(rn, 0);
4773 } else { 4827 } else {
4774 - NEON_GET_REG(T1, rn, pass);  
4775 - }  
4776 - switch ((size << 1) | u) {  
4777 - case 0: gen_op_neon_mull_s8(); break;  
4778 - case 1: gen_op_neon_mull_u8(); break;  
4779 - case 2: gen_op_neon_mull_s16(); break;  
4780 - case 3: gen_op_neon_mull_u16(); break;  
4781 - case 4: gen_op_imull_T0_T1(); break;  
4782 - case 5: gen_op_mull_T0_T1(); break;  
4783 - default: abort(); 4828 + tmp = new_tmp();
  4829 + tcg_gen_mov_i32(tmp, cpu_T[1]);
4784 } 4830 }
  4831 + tmp2 = new_tmp();
  4832 + tcg_gen_mov_i32(tmp2, cpu_T[0]);
  4833 + gen_neon_mull(cpu_V0, tmp, tmp2, size, u);
4785 if (op == 6 || op == 7) { 4834 if (op == 6 || op == 7) {
4786 - switch (size) {  
4787 - case 0: gen_op_neon_negl_u16(); break;  
4788 - case 1: gen_op_neon_negl_u32(); break;  
4789 - case 2: gen_op_neon_negl_u64(); break;  
4790 - default: abort();  
4791 - } 4835 + gen_neon_negl(cpu_V0, size);
  4836 + }
  4837 + if (op != 11) {
  4838 + neon_load_reg64(cpu_V1, rd + pass);
4792 } 4839 }
4793 - gen_neon_movl_scratch_T0(0);  
4794 - gen_neon_movl_scratch_T1(1);  
4795 - NEON_GET_REG(T0, rd, pass * 2);  
4796 - NEON_GET_REG(T1, rd, pass * 2 + 1);  
4797 switch (op) { 4840 switch (op) {
4798 case 2: case 6: 4841 case 2: case 6:
4799 - switch (size) {  
4800 - case 0: gen_op_neon_addl_u16(); break;  
4801 - case 1: gen_op_neon_addl_u32(); break;  
4802 - case 2: gen_op_neon_addl_u64(); break;  
4803 - default: abort();  
4804 - } 4842 + gen_neon_addl(size);
4805 break; 4843 break;
4806 case 3: case 7: 4844 case 3: case 7:
4807 - switch (size) {  
4808 - case 1:  
4809 - gen_op_neon_addl_saturate_s32();  
4810 - gen_op_neon_addl_saturate_s32();  
4811 - break;  
4812 - case 2:  
4813 - gen_op_neon_addl_saturate_s64();  
4814 - gen_op_neon_addl_saturate_s64();  
4815 - break;  
4816 - default: abort();  
4817 - } 4845 + gen_neon_addl_saturate(cpu_V0, cpu_V0, size);
  4846 + gen_neon_addl_saturate(cpu_V0, cpu_V1, size);
4818 break; 4847 break;
4819 case 10: 4848 case 10:
4820 /* no-op */ 4849 /* no-op */
4821 break; 4850 break;
4822 case 11: 4851 case 11:
4823 - switch (size) {  
4824 - case 1: gen_op_neon_addl_saturate_s32(); break;  
4825 - case 2: gen_op_neon_addl_saturate_s64(); break;  
4826 - default: abort();  
4827 - } 4852 + gen_neon_addl_saturate(cpu_V0, cpu_V0, size);
4828 break; 4853 break;
4829 default: 4854 default:
4830 abort(); 4855 abort();
4831 } 4856 }
4832 - NEON_SET_REG(T0, rd, pass * 2);  
4833 - NEON_SET_REG(T1, rd, pass * 2 + 1); 4857 + neon_store_reg64(cpu_V0, rd + pass);
4834 } 4858 }
4835 break; 4859 break;
4836 default: /* 14 and 15 are RESERVED */ 4860 default: /* 14 and 15 are RESERVED */
@@ -4840,29 +4864,53 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4840,29 +4864,53 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4840 } else { /* size == 3 */ 4864 } else { /* size == 3 */
4841 if (!u) { 4865 if (!u) {
4842 /* Extract. */ 4866 /* Extract. */
4843 - int reg;  
4844 imm = (insn >> 8) & 0xf; 4867 imm = (insn >> 8) & 0xf;
4845 - reg = rn;  
4846 - count = q ? 4 : 2;  
4847 - n = imm >> 2;  
4848 - NEON_GET_REG(T0, reg, n);  
4849 - for (pass = 0; pass < count; pass++) {  
4850 - n++;  
4851 - if (n > count) {  
4852 - reg = rm;  
4853 - n -= count; 4868 + count = q + 1;
  4869 +
  4870 + if (imm > 7 && !q)
  4871 + return 1;
  4872 +
  4873 + if (imm == 0) {
  4874 + neon_load_reg64(cpu_V0, rn);
  4875 + if (q) {
  4876 + neon_load_reg64(cpu_V1, rn + 1);
4854 } 4877 }
4855 - if (imm & 3) {  
4856 - NEON_GET_REG(T1, reg, n);  
4857 - gen_op_neon_extract((insn << 3) & 0x1f); 4878 + } else if (imm == 8) {
  4879 + neon_load_reg64(cpu_V0, rn + 1);
  4880 + if (q) {
  4881 + neon_load_reg64(cpu_V1, rm);
4858 } 4882 }
4859 - /* ??? This is broken if rd and rm overlap */  
4860 - NEON_SET_REG(T0, rd, pass);  
4861 - if (imm & 3) {  
4862 - gen_op_movl_T0_T1(); 4883 + } else if (q) {
  4884 + tmp = tcg_temp_new(TCG_TYPE_I64);
  4885 + if (imm < 8) {
  4886 + neon_load_reg64(cpu_V0, rn);
  4887 + neon_load_reg64(tmp, rn + 1);
  4888 + } else {
  4889 + neon_load_reg64(cpu_V0, rn + 1);
  4890 + neon_load_reg64(tmp, rm);
  4891 + }
  4892 + tcg_gen_shri_i64(cpu_V0, cpu_V0, (imm & 7) * 8);
  4893 + tcg_gen_shli_i64(cpu_V1, tmp, 64 - ((imm & 7) * 8));
  4894 + tcg_gen_or_i64(cpu_V0, cpu_V0, cpu_V1);
  4895 + if (imm < 8) {
  4896 + neon_load_reg64(cpu_V1, rm);
4863 } else { 4897 } else {
4864 - NEON_GET_REG(T0, reg, n); 4898 + neon_load_reg64(cpu_V1, rm + 1);
  4899 + imm -= 8;
4865 } 4900 }
  4901 + tcg_gen_shli_i64(cpu_V1, cpu_V1, 64 - (imm * 8));
  4902 + tcg_gen_shri_i64(tmp, tmp, imm * 8);
  4903 + tcg_gen_or_i64(cpu_V1, cpu_V1, tmp);
  4904 + } else {
  4905 + neon_load_reg64(cpu_V0, rn);
  4906 + tcg_gen_shri_i32(cpu_V0, cpu_V0, imm * 8);
  4907 + neon_load_reg64(cpu_V1, rm);
  4908 + tcg_gen_shli_i32(cpu_V1, cpu_V1, 64 - (imm * 8));
  4909 + tcg_gen_or_i64(cpu_V0, cpu_V0, cpu_V1);
  4910 + }
  4911 + neon_store_reg64(cpu_V0, rd);
  4912 + if (q) {
  4913 + neon_store_reg64(cpu_V1, rd + 1);
4866 } 4914 }
4867 } else if ((insn & (1 << 11)) == 0) { 4915 } else if ((insn & (1 << 11)) == 0) {
4868 /* Two register misc. */ 4916 /* Two register misc. */
@@ -4897,28 +4945,25 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4897,28 +4945,25 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4897 break; 4945 break;
4898 case 4: case 5: /* VPADDL */ 4946 case 4: case 5: /* VPADDL */
4899 case 12: case 13: /* VPADAL */ 4947 case 12: case 13: /* VPADAL */
4900 - if (size < 2)  
4901 - goto elementwise;  
4902 if (size == 3) 4948 if (size == 3)
4903 return 1; 4949 return 1;
4904 - for (pass = 0; pass < (q ? 2 : 1); pass++) {  
4905 - NEON_GET_REG(T0, rm, pass * 2);  
4906 - NEON_GET_REG(T1, rm, pass * 2 + 1);  
4907 - if (op & 1)  
4908 - gen_op_neon_paddl_u32();  
4909 - else  
4910 - gen_op_neon_paddl_s32(); 4950 + for (pass = 0; pass < q + 1; pass++) {
  4951 + tmp = neon_load_reg(rm, pass * 2);
  4952 + gen_neon_widen(cpu_V0, tmp, size, op & 1);
  4953 + tmp = neon_load_reg(rm, pass * 2 + 1);
  4954 + gen_neon_widen(cpu_V1, tmp, size, op & 1);
  4955 + switch (size) {
  4956 + case 0: gen_helper_neon_paddl_u16(CPU_V001); break;
  4957 + case 1: gen_helper_neon_paddl_u32(CPU_V001); break;
  4958 + case 2: tcg_gen_add_i64(CPU_V001); break;
  4959 + default: abort();
  4960 + }
4911 if (op >= 12) { 4961 if (op >= 12) {
4912 /* Accumulate. */ 4962 /* Accumulate. */
4913 - gen_neon_movl_scratch_T0(0);  
4914 - gen_neon_movl_scratch_T1(1);  
4915 -  
4916 - NEON_GET_REG(T0, rd, pass * 2);  
4917 - NEON_GET_REG(T1, rd, pass * 2 + 1);  
4918 - gen_op_neon_addl_u64(); 4963 + neon_load_reg64(cpu_V1, rd + pass);
  4964 + gen_neon_addl(size);
4919 } 4965 }
4920 - NEON_SET_REG(T0, rd, pass * 2);  
4921 - NEON_SET_REG(T1, rd, pass * 2 + 1); 4966 + neon_store_reg64(cpu_V0, rd + pass);
4922 } 4967 }
4923 break; 4968 break;
4924 case 33: /* VTRN */ 4969 case 33: /* VTRN */
@@ -4972,8 +5017,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4972,8 +5017,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4972 NEON_GET_REG(T0, rd, n); 5017 NEON_GET_REG(T0, rd, n);
4973 NEON_GET_REG(T1, rd, n); 5018 NEON_GET_REG(T1, rd, n);
4974 switch (size) { 5019 switch (size) {
4975 - case 0: gen_op_neon_zip_u8(); break;  
4976 - case 1: gen_op_neon_zip_u16(); break; 5020 + case 0: gen_helper_neon_zip_u8(); break;
  5021 + case 1: gen_helper_neon_zip_u16(); break;
4977 case 2: /* no-op */; break; 5022 case 2: /* no-op */; break;
4978 default: abort(); 5023 default: abort();
4979 } 5024 }
@@ -4987,63 +5032,36 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -4987,63 +5032,36 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
4987 } 5032 }
4988 break; 5033 break;
4989 case 36: case 37: /* VMOVN, VQMOVUN, VQMOVN */ 5034 case 36: case 37: /* VMOVN, VQMOVUN, VQMOVN */
  5035 + if (size == 3)
  5036 + return 1;
4990 for (pass = 0; pass < 2; pass++) { 5037 for (pass = 0; pass < 2; pass++) {
4991 - if (rd == rm + 1) {  
4992 - n = 1 - pass;  
4993 - } else {  
4994 - n = pass;  
4995 - }  
4996 - NEON_GET_REG(T0, rm, n * 2);  
4997 - NEON_GET_REG(T1, rm, n * 2 + 1); 5038 + neon_load_reg64(cpu_V0, rm + pass);
  5039 + tmp = new_tmp();
4998 if (op == 36 && q == 0) { 5040 if (op == 36 && q == 0) {
4999 - switch (size) {  
5000 - case 0: gen_op_neon_narrow_u8(); break;  
5001 - case 1: gen_op_neon_narrow_u16(); break;  
5002 - case 2: /* no-op */ break;  
5003 - default: return 1;  
5004 - } 5041 + gen_neon_narrow(size, tmp, cpu_V0);
5005 } else if (q) { 5042 } else if (q) {
5006 - switch (size) {  
5007 - case 0: gen_op_neon_narrow_sat_u8(); break;  
5008 - case 1: gen_op_neon_narrow_sat_u16(); break;  
5009 - case 2: gen_op_neon_narrow_sat_u32(); break;  
5010 - default: return 1;  
5011 - } 5043 + gen_neon_narrow_satu(size, tmp, cpu_V0);
5012 } else { 5044 } else {
5013 - switch (size) {  
5014 - case 0: gen_op_neon_narrow_sat_s8(); break;  
5015 - case 1: gen_op_neon_narrow_sat_s16(); break;  
5016 - case 2: gen_op_neon_narrow_sat_s32(); break;  
5017 - default: return 1;  
5018 - } 5045 + gen_neon_narrow_sats(size, tmp, cpu_V0);
  5046 + }
  5047 + if (pass == 0) {
  5048 + tmp2 = tmp;
  5049 + } else {
  5050 + neon_store_reg(rd, 0, tmp2);
  5051 + neon_store_reg(rd, 1, tmp);
5019 } 5052 }
5020 - NEON_SET_REG(T0, rd, n);  
5021 } 5053 }
5022 break; 5054 break;
5023 case 38: /* VSHLL */ 5055 case 38: /* VSHLL */
5024 - if (q) 5056 + if (q || size == 3)
5025 return 1; 5057 return 1;
5026 - if (rm == rd) {  
5027 - NEON_GET_REG(T0, rm, 1);  
5028 - gen_neon_movl_scratch_T0(0);  
5029 - } 5058 + tmp = neon_load_reg(rm, 0);
  5059 + tmp2 = neon_load_reg(rm, 1);
5030 for (pass = 0; pass < 2; pass++) { 5060 for (pass = 0; pass < 2; pass++) {
5031 - if (pass == 1 && rm == rd) {  
5032 - gen_neon_movl_T0_scratch(0);  
5033 - } else {  
5034 - NEON_GET_REG(T0, rm, pass);  
5035 - }  
5036 - switch (size) {  
5037 - case 0: gen_op_neon_widen_high_u8(); break;  
5038 - case 1: gen_op_neon_widen_high_u16(); break;  
5039 - case 2:  
5040 - gen_op_movl_T1_T0();  
5041 - gen_op_movl_T0_im(0);  
5042 - break;  
5043 - default: return 1;  
5044 - }  
5045 - NEON_SET_REG(T0, rd, pass * 2);  
5046 - NEON_SET_REG(T1, rd, pass * 2 + 1); 5061 + if (pass == 1)
  5062 + tmp = tmp2;
  5063 + gen_neon_widen(cpu_V0, tmp, size, 1);
  5064 + neon_store_reg64(cpu_V0, rd + pass);
5047 } 5065 }
5048 break; 5066 break;
5049 default: 5067 default:
@@ -5068,37 +5086,18 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -5068,37 +5086,18 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5068 return 1; 5086 return 1;
5069 gen_rev16(cpu_T[0]); 5087 gen_rev16(cpu_T[0]);
5070 break; 5088 break;
5071 - case 4: case 5: /* VPADDL */  
5072 - case 12: case 13: /* VPADAL */  
5073 - switch ((size << 1) | (op & 1)) {  
5074 - case 0: gen_op_neon_paddl_s8(); break;  
5075 - case 1: gen_op_neon_paddl_u8(); break;  
5076 - case 2: gen_op_neon_paddl_s16(); break;  
5077 - case 3: gen_op_neon_paddl_u16(); break;  
5078 - default: abort();  
5079 - }  
5080 - if (op >= 12) {  
5081 - /* Accumulate */  
5082 - NEON_GET_REG(T1, rd, pass);  
5083 - switch (size) {  
5084 - case 0: gen_op_neon_add_u16(); break;  
5085 - case 1: gen_op_addl_T0_T1(); break;  
5086 - default: abort();  
5087 - }  
5088 - }  
5089 - break;  
5090 case 8: /* CLS */ 5089 case 8: /* CLS */
5091 switch (size) { 5090 switch (size) {
5092 - case 0: gen_op_neon_cls_s8(); break;  
5093 - case 1: gen_op_neon_cls_s16(); break;  
5094 - case 2: gen_op_neon_cls_s32(); break; 5091 + case 0: gen_helper_neon_cls_s8(cpu_T[0], cpu_T[0]); break;
  5092 + case 1: gen_helper_neon_cls_s16(cpu_T[0], cpu_T[0]); break;
  5093 + case 2: gen_helper_neon_cls_s32(cpu_T[0], cpu_T[0]); break;
5095 default: return 1; 5094 default: return 1;
5096 } 5095 }
5097 break; 5096 break;
5098 case 9: /* CLZ */ 5097 case 9: /* CLZ */
5099 switch (size) { 5098 switch (size) {
5100 - case 0: gen_op_neon_clz_u8(); break;  
5101 - case 1: gen_op_neon_clz_u16(); break; 5099 + case 0: gen_helper_neon_clz_u8(cpu_T[0], cpu_T[0]); break;
  5100 + case 1: gen_helper_neon_clz_u16(cpu_T[0], cpu_T[0]); break;
5102 case 2: gen_helper_clz(cpu_T[0], cpu_T[0]); break; 5101 case 2: gen_helper_clz(cpu_T[0], cpu_T[0]); break;
5103 default: return 1; 5102 default: return 1;
5104 } 5103 }
@@ -5106,7 +5105,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -5106,7 +5105,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5106 case 10: /* CNT */ 5105 case 10: /* CNT */
5107 if (size != 0) 5106 if (size != 0)
5108 return 1; 5107 return 1;
5109 - gen_op_neon_cnt_u8(); 5108 + gen_helper_neon_cnt_u8(cpu_T[0], cpu_T[0]);
5110 break; 5109 break;
5111 case 11: /* VNOT */ 5110 case 11: /* VNOT */
5112 if (size != 0) 5111 if (size != 0)
@@ -5115,26 +5114,26 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -5115,26 +5114,26 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5115 break; 5114 break;
5116 case 14: /* VQABS */ 5115 case 14: /* VQABS */
5117 switch (size) { 5116 switch (size) {
5118 - case 0: gen_op_neon_qabs_s8(); break;  
5119 - case 1: gen_op_neon_qabs_s16(); break;  
5120 - case 2: gen_op_neon_qabs_s32(); break; 5117 + case 0: gen_helper_neon_qabs_s8(cpu_T[0], cpu_env, cpu_T[0]); break;
  5118 + case 1: gen_helper_neon_qabs_s16(cpu_T[0], cpu_env, cpu_T[0]); break;
  5119 + case 2: gen_helper_neon_qabs_s32(cpu_T[0], cpu_env, cpu_T[0]); break;
5121 default: return 1; 5120 default: return 1;
5122 } 5121 }
5123 break; 5122 break;
5124 case 15: /* VQNEG */ 5123 case 15: /* VQNEG */
5125 switch (size) { 5124 switch (size) {
5126 - case 0: gen_op_neon_qneg_s8(); break;  
5127 - case 1: gen_op_neon_qneg_s16(); break;  
5128 - case 2: gen_op_neon_qneg_s32(); break; 5125 + case 0: gen_helper_neon_qneg_s8(cpu_T[0], cpu_env, cpu_T[0]); break;
  5126 + case 1: gen_helper_neon_qneg_s16(cpu_T[0], cpu_env, cpu_T[0]); break;
  5127 + case 2: gen_helper_neon_qneg_s32(cpu_T[0], cpu_env, cpu_T[0]); break;
5129 default: return 1; 5128 default: return 1;
5130 } 5129 }
5131 break; 5130 break;
5132 case 16: case 19: /* VCGT #0, VCLE #0 */ 5131 case 16: case 19: /* VCGT #0, VCLE #0 */
5133 gen_op_movl_T1_im(0); 5132 gen_op_movl_T1_im(0);
5134 switch(size) { 5133 switch(size) {
5135 - case 0: gen_op_neon_cgt_s8(); break;  
5136 - case 1: gen_op_neon_cgt_s16(); break;  
5137 - case 2: gen_op_neon_cgt_s32(); break; 5134 + case 0: gen_helper_neon_cgt_s8(CPU_T001); break;
  5135 + case 1: gen_helper_neon_cgt_s16(CPU_T001); break;
  5136 + case 2: gen_helper_neon_cgt_s32(CPU_T001); break;
5138 default: return 1; 5137 default: return 1;
5139 } 5138 }
5140 if (op == 19) 5139 if (op == 19)
@@ -5143,9 +5142,9 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -5143,9 +5142,9 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5143 case 17: case 20: /* VCGE #0, VCLT #0 */ 5142 case 17: case 20: /* VCGE #0, VCLT #0 */
5144 gen_op_movl_T1_im(0); 5143 gen_op_movl_T1_im(0);
5145 switch(size) { 5144 switch(size) {
5146 - case 0: gen_op_neon_cge_s8(); break;  
5147 - case 1: gen_op_neon_cge_s16(); break;  
5148 - case 2: gen_op_neon_cge_s32(); break; 5145 + case 0: gen_helper_neon_cge_s8(CPU_T001); break;
  5146 + case 1: gen_helper_neon_cge_s16(CPU_T001); break;
  5147 + case 2: gen_helper_neon_cge_s32(CPU_T001); break;
5149 default: return 1; 5148 default: return 1;
5150 } 5149 }
5151 if (op == 20) 5150 if (op == 20)
@@ -5154,44 +5153,41 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -5154,44 +5153,41 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5154 case 18: /* VCEQ #0 */ 5153 case 18: /* VCEQ #0 */
5155 gen_op_movl_T1_im(0); 5154 gen_op_movl_T1_im(0);
5156 switch(size) { 5155 switch(size) {
5157 - case 0: gen_op_neon_ceq_u8(); break;  
5158 - case 1: gen_op_neon_ceq_u16(); break;  
5159 - case 2: gen_op_neon_ceq_u32(); break; 5156 + case 0: gen_helper_neon_ceq_u8(CPU_T001); break;
  5157 + case 1: gen_helper_neon_ceq_u16(CPU_T001); break;
  5158 + case 2: gen_helper_neon_ceq_u32(CPU_T001); break;
5160 default: return 1; 5159 default: return 1;
5161 } 5160 }
5162 break; 5161 break;
5163 case 22: /* VABS */ 5162 case 22: /* VABS */
5164 switch(size) { 5163 switch(size) {
5165 - case 0: gen_op_neon_abs_s8(); break;  
5166 - case 1: gen_op_neon_abs_s16(); break;  
5167 - case 2: gen_op_neon_abs_s32(); break; 5164 + case 0: gen_helper_neon_abs_s8(cpu_T[0], cpu_T[0]); break;
  5165 + case 1: gen_helper_neon_abs_s16(cpu_T[0], cpu_T[0]); break;
  5166 + case 2: tcg_gen_abs_i32(cpu_T[0], cpu_T[0]); break;
5168 default: return 1; 5167 default: return 1;
5169 } 5168 }
5170 break; 5169 break;
5171 case 23: /* VNEG */ 5170 case 23: /* VNEG */
5172 gen_op_movl_T1_im(0); 5171 gen_op_movl_T1_im(0);
5173 - switch(size) {  
5174 - case 0: gen_op_neon_rsb_u8(); break;  
5175 - case 1: gen_op_neon_rsb_u16(); break;  
5176 - case 2: gen_op_rsbl_T0_T1(); break;  
5177 - default: return 1;  
5178 - } 5172 + if (size == 3)
  5173 + return 1;
  5174 + gen_neon_rsb(size);
5179 break; 5175 break;
5180 case 24: case 27: /* Float VCGT #0, Float VCLE #0 */ 5176 case 24: case 27: /* Float VCGT #0, Float VCLE #0 */
5181 gen_op_movl_T1_im(0); 5177 gen_op_movl_T1_im(0);
5182 - gen_op_neon_cgt_f32(); 5178 + gen_helper_neon_cgt_f32(CPU_T001);
5183 if (op == 27) 5179 if (op == 27)
5184 gen_op_notl_T0(); 5180 gen_op_notl_T0();
5185 break; 5181 break;
5186 case 25: case 28: /* Float VCGE #0, Float VCLT #0 */ 5182 case 25: case 28: /* Float VCGE #0, Float VCLT #0 */
5187 gen_op_movl_T1_im(0); 5183 gen_op_movl_T1_im(0);
5188 - gen_op_neon_cge_f32(); 5184 + gen_helper_neon_cge_f32(CPU_T001);
5189 if (op == 28) 5185 if (op == 28)
5190 gen_op_notl_T0(); 5186 gen_op_notl_T0();
5191 break; 5187 break;
5192 case 26: /* Float VCEQ #0 */ 5188 case 26: /* Float VCEQ #0 */
5193 gen_op_movl_T1_im(0); 5189 gen_op_movl_T1_im(0);
5194 - gen_op_neon_ceq_f32(); 5190 + gen_helper_neon_ceq_f32(CPU_T001);
5195 break; 5191 break;
5196 case 30: /* Float VABS */ 5192 case 30: /* Float VABS */
5197 gen_vfp_abs(0); 5193 gen_vfp_abs(0);
@@ -5206,8 +5202,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -5206,8 +5202,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5206 case 33: /* VTRN */ 5202 case 33: /* VTRN */
5207 NEON_GET_REG(T1, rd, pass); 5203 NEON_GET_REG(T1, rd, pass);
5208 switch (size) { 5204 switch (size) {
5209 - case 0: gen_op_neon_trn_u8(); break;  
5210 - case 1: gen_op_neon_trn_u16(); break; 5205 + case 0: gen_helper_neon_trn_u8(); break;
  5206 + case 1: gen_helper_neon_trn_u16(); break;
5211 case 2: abort(); 5207 case 2: abort();
5212 default: return 1; 5208 default: return 1;
5213 } 5209 }
@@ -5281,12 +5277,12 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -5281,12 +5277,12 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5281 NEON_SET_REG(T0, rm, 0); 5277 NEON_SET_REG(T0, rm, 0);
5282 } 5278 }
5283 if (insn & (1 << 16)) { 5279 if (insn & (1 << 16)) {
5284 - gen_op_neon_dup_u8(((insn >> 17) & 3) * 8); 5280 + gen_neon_dup_u8(cpu_T[0], ((insn >> 17) & 3) * 8);
5285 } else if (insn & (1 << 17)) { 5281 } else if (insn & (1 << 17)) {
5286 if ((insn >> 18) & 1) 5282 if ((insn >> 18) & 1)
5287 - gen_op_neon_dup_high16(); 5283 + gen_neon_dup_high16(cpu_T[0]);
5288 else 5284 else
5289 - gen_op_neon_dup_low16(); 5285 + gen_neon_dup_low16(cpu_T[0]);
5290 } 5286 }
5291 for (pass = 0; pass < (q ? 4 : 2); pass++) { 5287 for (pass = 0; pass < (q ? 4 : 2); pass++) {
5292 NEON_SET_REG(T0, rd, pass); 5288 NEON_SET_REG(T0, rd, pass);
@@ -8324,6 +8320,8 @@ static inline int gen_intermediate_code_internal(CPUState *env, @@ -8324,6 +8320,8 @@ static inline int gen_intermediate_code_internal(CPUState *env,
8324 cpu_F1s = tcg_temp_new(TCG_TYPE_I32); 8320 cpu_F1s = tcg_temp_new(TCG_TYPE_I32);
8325 cpu_F0d = tcg_temp_new(TCG_TYPE_I64); 8321 cpu_F0d = tcg_temp_new(TCG_TYPE_I64);
8326 cpu_F1d = tcg_temp_new(TCG_TYPE_I64); 8322 cpu_F1d = tcg_temp_new(TCG_TYPE_I64);
  8323 + cpu_V0 = cpu_F0d;
  8324 + cpu_V1 = cpu_F1d;
8327 next_page_start = (pc_start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE; 8325 next_page_start = (pc_start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
8328 lj = -1; 8326 lj = -1;
8329 /* Reset the conditional execution bits immediately. This avoids 8327 /* Reset the conditional execution bits immediately. This avoids