Commit ad69471ce5e1284e1cacd053bb0fe8d6175a2f9e

Authored by pbrook
1 parent 8f8e3aa4

ARM TCG conversion 14/16.

git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@4151 c046a42c-6fe2-441c-8c8c-71466251a162

Too many changes to show.

To preserve performance only 7 of 8 files are displayed.

Makefile.target
@@ -211,7 +211,7 @@ LIBOBJS+= op_helper.o helper.o @@ -211,7 +211,7 @@ LIBOBJS+= op_helper.o helper.o
211 endif 211 endif
212 212
213 ifeq ($(TARGET_BASE_ARCH), arm) 213 ifeq ($(TARGET_BASE_ARCH), arm)
214 -LIBOBJS+= op_helper.o helper.o 214 +LIBOBJS+= op_helper.o helper.o neon_helper.o
215 endif 215 endif
216 216
217 ifeq ($(TARGET_BASE_ARCH), sh4) 217 ifeq ($(TARGET_BASE_ARCH), sh4)
target-arm/helper.c
@@ -256,30 +256,6 @@ void cpu_arm_close(CPUARMState *env) @@ -256,30 +256,6 @@ void cpu_arm_close(CPUARMState *env)
256 free(env); 256 free(env);
257 } 257 }
258 258
259 -/* Polynomial multiplication is like integer multiplication except the  
260 - partial products are XORed, not added. */  
261 -uint32_t helper_neon_mul_p8(uint32_t op1, uint32_t op2)  
262 -{  
263 - uint32_t mask;  
264 - uint32_t result;  
265 - result = 0;  
266 - while (op1) {  
267 - mask = 0;  
268 - if (op1 & 1)  
269 - mask |= 0xff;  
270 - if (op1 & (1 << 8))  
271 - mask |= (0xff << 8);  
272 - if (op1 & (1 << 16))  
273 - mask |= (0xff << 16);  
274 - if (op1 & (1 << 24))  
275 - mask |= (0xff << 24);  
276 - result ^= op2 & mask;  
277 - op1 = (op1 >> 1) & 0x7f7f7f7f;  
278 - op2 = (op2 << 1) & 0xfefefefe;  
279 - }  
280 - return result;  
281 -}  
282 -  
283 uint32_t cpsr_read(CPUARMState *env) 259 uint32_t cpsr_read(CPUARMState *env)
284 { 260 {
285 int ZF; 261 int ZF;
@@ -376,6 +352,11 @@ uint32_t HELPER(rbit)(uint32_t x) @@ -376,6 +352,11 @@ uint32_t HELPER(rbit)(uint32_t x)
376 return x; 352 return x;
377 } 353 }
378 354
/* Absolute value of a 32-bit operand interpreted as signed.
   Note INT32_MIN (0x80000000) negates to itself.  */
uint32_t HELPER(abs)(uint32_t x)
{
    if ((int32_t)x < 0) {
        return -x;
    }
    return x;
}
  359 +
379 #if defined(CONFIG_USER_ONLY) 360 #if defined(CONFIG_USER_ONLY)
380 361
381 void do_interrupt (CPUState *env) 362 void do_interrupt (CPUState *env)
target-arm/helpers.h
@@ -84,6 +84,7 @@ DEF_HELPER_1_1(double_saturate, uint32_t, (int32_t)) @@ -84,6 +84,7 @@ DEF_HELPER_1_1(double_saturate, uint32_t, (int32_t))
84 DEF_HELPER_1_2(sdiv, int32_t, (int32_t, int32_t)) 84 DEF_HELPER_1_2(sdiv, int32_t, (int32_t, int32_t))
85 DEF_HELPER_1_2(udiv, uint32_t, (uint32_t, uint32_t)) 85 DEF_HELPER_1_2(udiv, uint32_t, (uint32_t, uint32_t))
86 DEF_HELPER_1_1(rbit, uint32_t, (uint32_t)) 86 DEF_HELPER_1_1(rbit, uint32_t, (uint32_t))
  87 +DEF_HELPER_1_1(abs, uint32_t, (uint32_t))
87 88
88 #define PAS_OP(pfx) \ 89 #define PAS_OP(pfx) \
89 DEF_HELPER_1_3(pfx ## add8, uint32_t, (uint32_t, uint32_t, uint32_t *)) \ 90 DEF_HELPER_1_3(pfx ## add8, uint32_t, (uint32_t, uint32_t, uint32_t *)) \
@@ -208,6 +209,10 @@ DEF_HELPER_1_2(rsqrte_f32, float32, (float32, CPUState *)) @@ -208,6 +209,10 @@ DEF_HELPER_1_2(rsqrte_f32, float32, (float32, CPUState *))
208 DEF_HELPER_1_2(recpe_u32, uint32_t, (uint32_t, CPUState *)) 209 DEF_HELPER_1_2(recpe_u32, uint32_t, (uint32_t, CPUState *))
209 DEF_HELPER_1_2(rsqrte_u32, uint32_t, (uint32_t, CPUState *)) 210 DEF_HELPER_1_2(rsqrte_u32, uint32_t, (uint32_t, CPUState *))
210 DEF_HELPER_1_4(neon_tbl, uint32_t, (uint32_t, uint32_t, uint32_t, uint32_t)) 211 DEF_HELPER_1_4(neon_tbl, uint32_t, (uint32_t, uint32_t, uint32_t, uint32_t))
  212 +DEF_HELPER_1_2(neon_add_saturate_u64, uint64_t, (uint64_t, uint64_t))
  213 +DEF_HELPER_1_2(neon_add_saturate_s64, uint64_t, (uint64_t, uint64_t))
  214 +DEF_HELPER_1_2(neon_sub_saturate_u64, uint64_t, (uint64_t, uint64_t))
  215 +DEF_HELPER_1_2(neon_sub_saturate_s64, uint64_t, (uint64_t, uint64_t))
211 216
212 DEF_HELPER_1_2(add_cc, uint32_t, (uint32_t, uint32_t)) 217 DEF_HELPER_1_2(add_cc, uint32_t, (uint32_t, uint32_t))
213 DEF_HELPER_1_2(adc_cc, uint32_t, (uint32_t, uint32_t)) 218 DEF_HELPER_1_2(adc_cc, uint32_t, (uint32_t, uint32_t))
@@ -223,6 +228,209 @@ DEF_HELPER_1_2(shr_cc, uint32_t, (uint32_t, uint32_t)) @@ -223,6 +228,209 @@ DEF_HELPER_1_2(shr_cc, uint32_t, (uint32_t, uint32_t))
223 DEF_HELPER_1_2(sar_cc, uint32_t, (uint32_t, uint32_t)) 228 DEF_HELPER_1_2(sar_cc, uint32_t, (uint32_t, uint32_t))
224 DEF_HELPER_1_2(ror_cc, uint32_t, (uint32_t, uint32_t)) 229 DEF_HELPER_1_2(ror_cc, uint32_t, (uint32_t, uint32_t))
225 230
  231 +/* neon_helper.c */
  232 +DEF_HELPER_1_3(neon_qadd_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  233 +DEF_HELPER_1_3(neon_qadd_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  234 +DEF_HELPER_1_3(neon_qadd_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  235 +DEF_HELPER_1_3(neon_qadd_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  236 +DEF_HELPER_1_3(neon_qsub_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  237 +DEF_HELPER_1_3(neon_qsub_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  238 +DEF_HELPER_1_3(neon_qsub_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  239 +DEF_HELPER_1_3(neon_qsub_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  240 +
  241 +DEF_HELPER_1_2(neon_hadd_s8, uint32_t, (uint32_t, uint32_t))
  242 +DEF_HELPER_1_2(neon_hadd_u8, uint32_t, (uint32_t, uint32_t))
  243 +DEF_HELPER_1_2(neon_hadd_s16, uint32_t, (uint32_t, uint32_t))
  244 +DEF_HELPER_1_2(neon_hadd_u16, uint32_t, (uint32_t, uint32_t))
  245 +DEF_HELPER_1_2(neon_hadd_s32, int32_t, (int32_t, int32_t))
  246 +DEF_HELPER_1_2(neon_hadd_u32, uint32_t, (uint32_t, uint32_t))
  247 +DEF_HELPER_1_2(neon_rhadd_s8, uint32_t, (uint32_t, uint32_t))
  248 +DEF_HELPER_1_2(neon_rhadd_u8, uint32_t, (uint32_t, uint32_t))
  249 +DEF_HELPER_1_2(neon_rhadd_s16, uint32_t, (uint32_t, uint32_t))
  250 +DEF_HELPER_1_2(neon_rhadd_u16, uint32_t, (uint32_t, uint32_t))
  251 +DEF_HELPER_1_2(neon_rhadd_s32, int32_t, (int32_t, int32_t))
  252 +DEF_HELPER_1_2(neon_rhadd_u32, uint32_t, (uint32_t, uint32_t))
  253 +DEF_HELPER_1_2(neon_hsub_s8, uint32_t, (uint32_t, uint32_t))
  254 +DEF_HELPER_1_2(neon_hsub_u8, uint32_t, (uint32_t, uint32_t))
  255 +DEF_HELPER_1_2(neon_hsub_s16, uint32_t, (uint32_t, uint32_t))
  256 +DEF_HELPER_1_2(neon_hsub_u16, uint32_t, (uint32_t, uint32_t))
  257 +DEF_HELPER_1_2(neon_hsub_s32, int32_t, (int32_t, int32_t))
  258 +DEF_HELPER_1_2(neon_hsub_u32, uint32_t, (uint32_t, uint32_t))
  259 +
  260 +DEF_HELPER_1_2(neon_cgt_u8, uint32_t, (uint32_t, uint32_t))
  261 +DEF_HELPER_1_2(neon_cgt_s8, uint32_t, (uint32_t, uint32_t))
  262 +DEF_HELPER_1_2(neon_cgt_u16, uint32_t, (uint32_t, uint32_t))
  263 +DEF_HELPER_1_2(neon_cgt_s16, uint32_t, (uint32_t, uint32_t))
  264 +DEF_HELPER_1_2(neon_cgt_u32, uint32_t, (uint32_t, uint32_t))
  265 +DEF_HELPER_1_2(neon_cgt_s32, uint32_t, (uint32_t, uint32_t))
  266 +DEF_HELPER_1_2(neon_cge_u8, uint32_t, (uint32_t, uint32_t))
  267 +DEF_HELPER_1_2(neon_cge_s8, uint32_t, (uint32_t, uint32_t))
  268 +DEF_HELPER_1_2(neon_cge_u16, uint32_t, (uint32_t, uint32_t))
  269 +DEF_HELPER_1_2(neon_cge_s16, uint32_t, (uint32_t, uint32_t))
  270 +DEF_HELPER_1_2(neon_cge_u32, uint32_t, (uint32_t, uint32_t))
  271 +DEF_HELPER_1_2(neon_cge_s32, uint32_t, (uint32_t, uint32_t))
  272 +
  273 +DEF_HELPER_1_2(neon_min_u8, uint32_t, (uint32_t, uint32_t))
  274 +DEF_HELPER_1_2(neon_min_s8, uint32_t, (uint32_t, uint32_t))
  275 +DEF_HELPER_1_2(neon_min_u16, uint32_t, (uint32_t, uint32_t))
  276 +DEF_HELPER_1_2(neon_min_s16, uint32_t, (uint32_t, uint32_t))
  277 +DEF_HELPER_1_2(neon_min_u32, uint32_t, (uint32_t, uint32_t))
  278 +DEF_HELPER_1_2(neon_min_s32, uint32_t, (uint32_t, uint32_t))
  279 +DEF_HELPER_1_2(neon_max_u8, uint32_t, (uint32_t, uint32_t))
  280 +DEF_HELPER_1_2(neon_max_s8, uint32_t, (uint32_t, uint32_t))
  281 +DEF_HELPER_1_2(neon_max_u16, uint32_t, (uint32_t, uint32_t))
  282 +DEF_HELPER_1_2(neon_max_s16, uint32_t, (uint32_t, uint32_t))
  283 +DEF_HELPER_1_2(neon_max_u32, uint32_t, (uint32_t, uint32_t))
  284 +DEF_HELPER_1_2(neon_max_s32, uint32_t, (uint32_t, uint32_t))
  285 +DEF_HELPER_1_2(neon_pmin_u8, uint32_t, (uint32_t, uint32_t))
  286 +DEF_HELPER_1_2(neon_pmin_s8, uint32_t, (uint32_t, uint32_t))
  287 +DEF_HELPER_1_2(neon_pmin_u16, uint32_t, (uint32_t, uint32_t))
  288 +DEF_HELPER_1_2(neon_pmin_s16, uint32_t, (uint32_t, uint32_t))
  289 +DEF_HELPER_1_2(neon_pmin_u32, uint32_t, (uint32_t, uint32_t))
  290 +DEF_HELPER_1_2(neon_pmin_s32, uint32_t, (uint32_t, uint32_t))
  291 +DEF_HELPER_1_2(neon_pmax_u8, uint32_t, (uint32_t, uint32_t))
  292 +DEF_HELPER_1_2(neon_pmax_s8, uint32_t, (uint32_t, uint32_t))
  293 +DEF_HELPER_1_2(neon_pmax_u16, uint32_t, (uint32_t, uint32_t))
  294 +DEF_HELPER_1_2(neon_pmax_s16, uint32_t, (uint32_t, uint32_t))
  295 +DEF_HELPER_1_2(neon_pmax_u32, uint32_t, (uint32_t, uint32_t))
  296 +DEF_HELPER_1_2(neon_pmax_s32, uint32_t, (uint32_t, uint32_t))
  297 +
  298 +DEF_HELPER_1_2(neon_abd_u8, uint32_t, (uint32_t, uint32_t))
  299 +DEF_HELPER_1_2(neon_abd_s8, uint32_t, (uint32_t, uint32_t))
  300 +DEF_HELPER_1_2(neon_abd_u16, uint32_t, (uint32_t, uint32_t))
  301 +DEF_HELPER_1_2(neon_abd_s16, uint32_t, (uint32_t, uint32_t))
  302 +DEF_HELPER_1_2(neon_abd_u32, uint32_t, (uint32_t, uint32_t))
  303 +DEF_HELPER_1_2(neon_abd_s32, uint32_t, (uint32_t, uint32_t))
  304 +
  305 +DEF_HELPER_1_2(neon_shl_u8, uint32_t, (uint32_t, uint32_t))
  306 +DEF_HELPER_1_2(neon_shl_s8, uint32_t, (uint32_t, uint32_t))
  307 +DEF_HELPER_1_2(neon_shl_u16, uint32_t, (uint32_t, uint32_t))
  308 +DEF_HELPER_1_2(neon_shl_s16, uint32_t, (uint32_t, uint32_t))
  309 +DEF_HELPER_1_2(neon_shl_u32, uint32_t, (uint32_t, uint32_t))
  310 +DEF_HELPER_1_2(neon_shl_s32, uint32_t, (uint32_t, uint32_t))
  311 +DEF_HELPER_1_2(neon_shl_u64, uint64_t, (uint64_t, uint64_t))
  312 +DEF_HELPER_1_2(neon_shl_s64, uint64_t, (uint64_t, uint64_t))
  313 +DEF_HELPER_1_2(neon_rshl_u8, uint32_t, (uint32_t, uint32_t))
  314 +DEF_HELPER_1_2(neon_rshl_s8, uint32_t, (uint32_t, uint32_t))
  315 +DEF_HELPER_1_2(neon_rshl_u16, uint32_t, (uint32_t, uint32_t))
  316 +DEF_HELPER_1_2(neon_rshl_s16, uint32_t, (uint32_t, uint32_t))
  317 +DEF_HELPER_1_2(neon_rshl_u32, uint32_t, (uint32_t, uint32_t))
  318 +DEF_HELPER_1_2(neon_rshl_s32, uint32_t, (uint32_t, uint32_t))
  319 +DEF_HELPER_1_2(neon_rshl_u64, uint64_t, (uint64_t, uint64_t))
  320 +DEF_HELPER_1_2(neon_rshl_s64, uint64_t, (uint64_t, uint64_t))
  321 +DEF_HELPER_1_3(neon_qshl_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  322 +DEF_HELPER_1_3(neon_qshl_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  323 +DEF_HELPER_1_3(neon_qshl_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  324 +DEF_HELPER_1_3(neon_qshl_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  325 +DEF_HELPER_1_3(neon_qshl_u32, uint32_t, (CPUState *, uint32_t, uint32_t))
  326 +DEF_HELPER_1_3(neon_qshl_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  327 +DEF_HELPER_1_3(neon_qshl_u64, uint64_t, (CPUState *, uint64_t, uint64_t))
  328 +DEF_HELPER_1_3(neon_qshl_s64, uint64_t, (CPUState *, uint64_t, uint64_t))
  329 +DEF_HELPER_1_3(neon_qrshl_u8, uint32_t, (CPUState *, uint32_t, uint32_t))
  330 +DEF_HELPER_1_3(neon_qrshl_s8, uint32_t, (CPUState *, uint32_t, uint32_t))
  331 +DEF_HELPER_1_3(neon_qrshl_u16, uint32_t, (CPUState *, uint32_t, uint32_t))
  332 +DEF_HELPER_1_3(neon_qrshl_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  333 +DEF_HELPER_1_3(neon_qrshl_u32, uint32_t, (CPUState *, uint32_t, uint32_t))
  334 +DEF_HELPER_1_3(neon_qrshl_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  335 +DEF_HELPER_1_3(neon_qrshl_u64, uint64_t, (CPUState *, uint64_t, uint64_t))
  336 +DEF_HELPER_1_3(neon_qrshl_s64, uint64_t, (CPUState *, uint64_t, uint64_t))
  337 +
  338 +DEF_HELPER_1_2(neon_add_u8, uint32_t, (uint32_t, uint32_t))
  339 +DEF_HELPER_1_2(neon_add_u16, uint32_t, (uint32_t, uint32_t))
  340 +DEF_HELPER_1_2(neon_padd_u8, uint32_t, (uint32_t, uint32_t))
  341 +DEF_HELPER_1_2(neon_padd_u16, uint32_t, (uint32_t, uint32_t))
  342 +DEF_HELPER_1_2(neon_sub_u8, uint32_t, (uint32_t, uint32_t))
  343 +DEF_HELPER_1_2(neon_sub_u16, uint32_t, (uint32_t, uint32_t))
  344 +DEF_HELPER_1_2(neon_mul_u8, uint32_t, (uint32_t, uint32_t))
  345 +DEF_HELPER_1_2(neon_mul_u16, uint32_t, (uint32_t, uint32_t))
  346 +DEF_HELPER_1_2(neon_mul_p8, uint32_t, (uint32_t, uint32_t))
  347 +
  348 +DEF_HELPER_1_2(neon_tst_u8, uint32_t, (uint32_t, uint32_t))
  349 +DEF_HELPER_1_2(neon_tst_u16, uint32_t, (uint32_t, uint32_t))
  350 +DEF_HELPER_1_2(neon_tst_u32, uint32_t, (uint32_t, uint32_t))
  351 +DEF_HELPER_1_2(neon_ceq_u8, uint32_t, (uint32_t, uint32_t))
  352 +DEF_HELPER_1_2(neon_ceq_u16, uint32_t, (uint32_t, uint32_t))
  353 +DEF_HELPER_1_2(neon_ceq_u32, uint32_t, (uint32_t, uint32_t))
  354 +
  355 +DEF_HELPER_1_1(neon_abs_s8, uint32_t, (uint32_t))
  356 +DEF_HELPER_1_1(neon_abs_s16, uint32_t, (uint32_t))
  357 +DEF_HELPER_1_1(neon_clz_u8, uint32_t, (uint32_t))
  358 +DEF_HELPER_1_1(neon_clz_u16, uint32_t, (uint32_t))
  359 +DEF_HELPER_1_1(neon_cls_s8, uint32_t, (uint32_t))
  360 +DEF_HELPER_1_1(neon_cls_s16, uint32_t, (uint32_t))
  361 +DEF_HELPER_1_1(neon_cls_s32, uint32_t, (uint32_t))
  362 +DEF_HELPER_1_1(neon_cnt_u8, uint32_t, (uint32_t))
  363 +
  364 +DEF_HELPER_1_3(neon_qdmulh_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  365 +DEF_HELPER_1_3(neon_qrdmulh_s16, uint32_t, (CPUState *, uint32_t, uint32_t))
  366 +DEF_HELPER_1_3(neon_qdmulh_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  367 +DEF_HELPER_1_3(neon_qrdmulh_s32, uint32_t, (CPUState *, uint32_t, uint32_t))
  368 +
  369 +DEF_HELPER_1_1(neon_narrow_u8, uint32_t, (uint64_t))
  370 +DEF_HELPER_1_1(neon_narrow_u16, uint32_t, (uint64_t))
  371 +DEF_HELPER_1_2(neon_narrow_sat_u8, uint32_t, (CPUState *, uint64_t))
  372 +DEF_HELPER_1_2(neon_narrow_sat_s8, uint32_t, (CPUState *, uint64_t))
  373 +DEF_HELPER_1_2(neon_narrow_sat_u16, uint32_t, (CPUState *, uint64_t))
  374 +DEF_HELPER_1_2(neon_narrow_sat_s16, uint32_t, (CPUState *, uint64_t))
  375 +DEF_HELPER_1_2(neon_narrow_sat_u32, uint32_t, (CPUState *, uint64_t))
  376 +DEF_HELPER_1_2(neon_narrow_sat_s32, uint32_t, (CPUState *, uint64_t))
  377 +DEF_HELPER_1_1(neon_narrow_high_u8, uint32_t, (uint64_t))
  378 +DEF_HELPER_1_1(neon_narrow_high_u16, uint32_t, (uint64_t))
  379 +DEF_HELPER_1_1(neon_narrow_round_high_u8, uint32_t, (uint64_t))
  380 +DEF_HELPER_1_1(neon_narrow_round_high_u16, uint32_t, (uint64_t))
  381 +DEF_HELPER_1_1(neon_widen_u8, uint64_t, (uint32_t))
  382 +DEF_HELPER_1_1(neon_widen_s8, uint64_t, (uint32_t))
  383 +DEF_HELPER_1_1(neon_widen_u16, uint64_t, (uint32_t))
  384 +DEF_HELPER_1_1(neon_widen_s16, uint64_t, (uint32_t))
  385 +
  386 +DEF_HELPER_1_2(neon_addl_u16, uint64_t, (uint64_t, uint64_t))
  387 +DEF_HELPER_1_2(neon_addl_u32, uint64_t, (uint64_t, uint64_t))
  388 +DEF_HELPER_1_2(neon_paddl_u16, uint64_t, (uint64_t, uint64_t))
  389 +DEF_HELPER_1_2(neon_paddl_u32, uint64_t, (uint64_t, uint64_t))
  390 +DEF_HELPER_1_2(neon_subl_u16, uint64_t, (uint64_t, uint64_t))
  391 +DEF_HELPER_1_2(neon_subl_u32, uint64_t, (uint64_t, uint64_t))
  392 +DEF_HELPER_1_3(neon_addl_saturate_s32, uint64_t, (CPUState *, uint64_t, uint64_t))
  393 +DEF_HELPER_1_3(neon_addl_saturate_s64, uint64_t, (CPUState *, uint64_t, uint64_t))
  394 +DEF_HELPER_1_2(neon_abdl_u16, uint64_t, (uint32_t, uint32_t))
  395 +DEF_HELPER_1_2(neon_abdl_s16, uint64_t, (uint32_t, uint32_t))
  396 +DEF_HELPER_1_2(neon_abdl_u32, uint64_t, (uint32_t, uint32_t))
  397 +DEF_HELPER_1_2(neon_abdl_s32, uint64_t, (uint32_t, uint32_t))
  398 +DEF_HELPER_1_2(neon_abdl_u64, uint64_t, (uint32_t, uint32_t))
  399 +DEF_HELPER_1_2(neon_abdl_s64, uint64_t, (uint32_t, uint32_t))
  400 +DEF_HELPER_1_2(neon_mull_u8, uint64_t, (uint32_t, uint32_t))
  401 +DEF_HELPER_1_2(neon_mull_s8, uint64_t, (uint32_t, uint32_t))
  402 +DEF_HELPER_1_2(neon_mull_u16, uint64_t, (uint32_t, uint32_t))
  403 +DEF_HELPER_1_2(neon_mull_s16, uint64_t, (uint32_t, uint32_t))
  404 +
  405 +DEF_HELPER_1_1(neon_negl_u16, uint64_t, (uint64_t))
  406 +DEF_HELPER_1_1(neon_negl_u32, uint64_t, (uint64_t))
  407 +DEF_HELPER_1_1(neon_negl_u64, uint64_t, (uint64_t))
  408 +
  409 +DEF_HELPER_1_2(neon_qabs_s8, uint32_t, (CPUState *, uint32_t))
  410 +DEF_HELPER_1_2(neon_qabs_s16, uint32_t, (CPUState *, uint32_t))
  411 +DEF_HELPER_1_2(neon_qabs_s32, uint32_t, (CPUState *, uint32_t))
  412 +DEF_HELPER_1_2(neon_qneg_s8, uint32_t, (CPUState *, uint32_t))
  413 +DEF_HELPER_1_2(neon_qneg_s16, uint32_t, (CPUState *, uint32_t))
  414 +DEF_HELPER_1_2(neon_qneg_s32, uint32_t, (CPUState *, uint32_t))
  415 +
  416 +DEF_HELPER_0_0(neon_trn_u8, void, (void))
  417 +DEF_HELPER_0_0(neon_trn_u16, void, (void))
  418 +DEF_HELPER_0_0(neon_unzip_u8, void, (void))
  419 +DEF_HELPER_0_0(neon_zip_u8, void, (void))
  420 +DEF_HELPER_0_0(neon_zip_u16, void, (void))
  421 +
  422 +DEF_HELPER_1_2(neon_min_f32, uint32_t, (uint32_t, uint32_t))
  423 +DEF_HELPER_1_2(neon_max_f32, uint32_t, (uint32_t, uint32_t))
  424 +DEF_HELPER_1_2(neon_abd_f32, uint32_t, (uint32_t, uint32_t))
  425 +DEF_HELPER_1_2(neon_add_f32, uint32_t, (uint32_t, uint32_t))
  426 +DEF_HELPER_1_2(neon_sub_f32, uint32_t, (uint32_t, uint32_t))
  427 +DEF_HELPER_1_2(neon_mul_f32, uint32_t, (uint32_t, uint32_t))
  428 +DEF_HELPER_1_2(neon_ceq_f32, uint32_t, (uint32_t, uint32_t))
  429 +DEF_HELPER_1_2(neon_cge_f32, uint32_t, (uint32_t, uint32_t))
  430 +DEF_HELPER_1_2(neon_cgt_f32, uint32_t, (uint32_t, uint32_t))
  431 +DEF_HELPER_1_2(neon_acge_f32, uint32_t, (uint32_t, uint32_t))
  432 +DEF_HELPER_1_2(neon_acgt_f32, uint32_t, (uint32_t, uint32_t))
  433 +
226 #undef DEF_HELPER 434 #undef DEF_HELPER
227 #undef DEF_HELPER_0_0 435 #undef DEF_HELPER_0_0
228 #undef DEF_HELPER_0_1 436 #undef DEF_HELPER_0_1
target-arm/neon_helper.c 0 โ†’ 100644
  1 +#include <stdlib.h>
  2 +#include <stdio.h>
  3 +
  4 +#include "cpu.h"
  5 +#include "exec-all.h"
  6 +#include "helpers.h"
  7 +
  8 +#define SIGNBIT (uint32_t)0x80000000
  9 +#define SIGNBIT64 ((uint64_t)1 << 63)
  10 +
  11 +#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] = CPSR_Q
  12 +
  13 +static float_status neon_float_status;
  14 +#define NFS &neon_float_status
  15 +
  16 +/* Helper routines to perform bitwise copies between float and int. */
  17 +static inline float32 vfp_itos(uint32_t i)
  18 +{
  19 + union {
  20 + uint32_t i;
  21 + float32 s;
  22 + } v;
  23 +
  24 + v.i = i;
  25 + return v.s;
  26 +}
  27 +
  28 +static inline uint32_t vfp_stoi(float32 s)
  29 +{
  30 + union {
  31 + uint32_t i;
  32 + float32 s;
  33 + } v;
  34 +
  35 + v.s = s;
  36 + return v.i;
  37 +}
  38 +
  39 +#define NEON_TYPE1(name, type) \
  40 +typedef struct \
  41 +{ \
  42 + type v1; \
  43 +} neon_##name;
  44 +#ifdef WORDS_BIGENDIAN
  45 +#define NEON_TYPE2(name, type) \
  46 +typedef struct \
  47 +{ \
  48 + type v2; \
  49 + type v1; \
  50 +} neon_##name;
  51 +#define NEON_TYPE4(name, type) \
  52 +typedef struct \
  53 +{ \
  54 + type v4; \
  55 + type v3; \
  56 + type v2; \
  57 + type v1; \
  58 +} neon_##name;
  59 +#else
  60 +#define NEON_TYPE2(name, type) \
  61 +typedef struct \
  62 +{ \
  63 + type v1; \
  64 + type v2; \
  65 +} neon_##name;
  66 +#define NEON_TYPE4(name, type) \
  67 +typedef struct \
  68 +{ \
  69 + type v1; \
  70 + type v2; \
  71 + type v3; \
  72 + type v4; \
  73 +} neon_##name;
  74 +#endif
  75 +
  76 +NEON_TYPE4(s8, int8_t)
  77 +NEON_TYPE4(u8, uint8_t)
  78 +NEON_TYPE2(s16, int16_t)
  79 +NEON_TYPE2(u16, uint16_t)
  80 +NEON_TYPE1(s32, int32_t)
  81 +NEON_TYPE1(u32, uint32_t)
  82 +#undef NEON_TYPE4
  83 +#undef NEON_TYPE2
  84 +#undef NEON_TYPE1
  85 +
  86 +/* Copy from a uint32_t to a vector structure type. */
  87 +#define NEON_UNPACK(vtype, dest, val) do { \
  88 + union { \
  89 + vtype v; \
  90 + uint32_t i; \
  91 + } conv_u; \
  92 + conv_u.i = (val); \
  93 + dest = conv_u.v; \
  94 + } while(0)
  95 +
  96 +/* Copy from a vector structure type to a uint32_t. */
  97 +#define NEON_PACK(vtype, dest, val) do { \
  98 + union { \
  99 + vtype v; \
  100 + uint32_t i; \
  101 + } conv_u; \
  102 + conv_u.v = (val); \
  103 + dest = conv_u.i; \
  104 + } while(0)
  105 +
  106 +#define NEON_DO1 \
  107 + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
  108 +#define NEON_DO2 \
  109 + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
  110 + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
  111 +#define NEON_DO4 \
  112 + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
  113 + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
  114 + NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
  115 + NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
  116 +
  117 +#define NEON_VOP_BODY(vtype, n) \
  118 +{ \
  119 + uint32_t res; \
  120 + vtype vsrc1; \
  121 + vtype vsrc2; \
  122 + vtype vdest; \
  123 + NEON_UNPACK(vtype, vsrc1, arg1); \
  124 + NEON_UNPACK(vtype, vsrc2, arg2); \
  125 + NEON_DO##n; \
  126 + NEON_PACK(vtype, res, vdest); \
  127 + return res; \
  128 +}
  129 +
  130 +#define NEON_VOP(name, vtype, n) \
  131 +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
  132 +NEON_VOP_BODY(vtype, n)
  133 +
  134 +#define NEON_VOP_ENV(name, vtype, n) \
  135 +uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \
  136 +NEON_VOP_BODY(vtype, n)
  137 +
  138 +/* Pairwise operations. */
  139 +/* For 32-bit elements each segment only contains a single element, so
  140 + the elementwise and pairwise operations are the same. */
  141 +#define NEON_PDO2 \
  142 + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
  143 + NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
  144 +#define NEON_PDO4 \
  145 + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
  146 + NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
  147 + NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
  148 + NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
  149 +
  150 +#define NEON_POP(name, vtype, n) \
  151 +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
  152 +{ \
  153 + uint32_t res; \
  154 + vtype vsrc1; \
  155 + vtype vsrc2; \
  156 + vtype vdest; \
  157 + NEON_UNPACK(vtype, vsrc1, arg1); \
  158 + NEON_UNPACK(vtype, vsrc2, arg2); \
  159 + NEON_PDO##n; \
  160 + NEON_PACK(vtype, res, vdest); \
  161 + return res; \
  162 +}
  163 +
  164 +/* Unary operators. */
  165 +#define NEON_VOP1(name, vtype, n) \
  166 +uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
  167 +{ \
  168 + vtype vsrc1; \
  169 + vtype vdest; \
  170 + NEON_UNPACK(vtype, vsrc1, arg); \
  171 + NEON_DO##n; \
  172 + NEON_PACK(vtype, arg, vdest); \
  173 + return arg; \
  174 +}
  175 +
  176 +
  177 +#define NEON_USAT(dest, src1, src2, type) do { \
  178 + uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
  179 + if (tmp != (type)tmp) { \
  180 + SET_QC(); \
  181 + dest = ~0; \
  182 + } else { \
  183 + dest = tmp; \
  184 + }} while(0)
  185 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
  186 +NEON_VOP_ENV(qadd_u8, neon_u8, 4)
  187 +#undef NEON_FN
  188 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
  189 +NEON_VOP_ENV(qadd_u16, neon_u16, 2)
  190 +#undef NEON_FN
  191 +#undef NEON_USAT
  192 +
  193 +#define NEON_SSAT(dest, src1, src2, type) do { \
  194 + int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
  195 + if (tmp != (type)tmp) { \
  196 + SET_QC(); \
  197 + if (src2 > 0) { \
  198 + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
  199 + } else { \
  200 + tmp = 1 << (sizeof(type) * 8 - 1); \
  201 + } \
  202 + } \
  203 + dest = tmp; \
  204 + } while(0)
  205 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
  206 +NEON_VOP_ENV(qadd_s8, neon_s8, 4)
  207 +#undef NEON_FN
  208 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
  209 +NEON_VOP_ENV(qadd_s16, neon_s16, 2)
  210 +#undef NEON_FN
  211 +#undef NEON_SSAT
  212 +
  213 +#define NEON_USAT(dest, src1, src2, type) do { \
  214 + uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
  215 + if (tmp != (type)tmp) { \
  216 + SET_QC(); \
  217 + dest = 0; \
  218 + } else { \
  219 + dest = tmp; \
  220 + }} while(0)
  221 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
  222 +NEON_VOP_ENV(qsub_u8, neon_u8, 4)
  223 +#undef NEON_FN
  224 +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
  225 +NEON_VOP_ENV(qsub_u16, neon_u16, 2)
  226 +#undef NEON_FN
  227 +#undef NEON_USAT
  228 +
  229 +#define NEON_SSAT(dest, src1, src2, type) do { \
  230 + int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
  231 + if (tmp != (type)tmp) { \
  232 + SET_QC(); \
  233 + if (src2 < 0) { \
  234 + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
  235 + } else { \
  236 + tmp = 1 << (sizeof(type) * 8 - 1); \
  237 + } \
  238 + } \
  239 + dest = tmp; \
  240 + } while(0)
  241 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
  242 +NEON_VOP_ENV(qsub_s8, neon_s8, 4)
  243 +#undef NEON_FN
  244 +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
  245 +NEON_VOP_ENV(qsub_s16, neon_s16, 2)
  246 +#undef NEON_FN
  247 +#undef NEON_SSAT
  248 +
  249 +#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
  250 +NEON_VOP(hadd_s8, neon_s8, 4)
  251 +NEON_VOP(hadd_u8, neon_u8, 4)
  252 +NEON_VOP(hadd_s16, neon_s16, 2)
  253 +NEON_VOP(hadd_u16, neon_u16, 2)
  254 +#undef NEON_FN
  255 +
/* Signed halving add: (src1 + src2) >> 1 computed without overflowing
   the intermediate sum.  Halving each operand first only loses a carry
   when both are odd, so add that carry back.  */
int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t lost_carry = src1 & src2 & 1;

    return (src1 >> 1) + (src2 >> 1) + lost_carry;
}
  265 +
/* Unsigned halving add: (src1 + src2) >> 1 without intermediate
   overflow; the carry lost by halving both operands is re-added when
   both are odd.  */
uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t lost_carry = src1 & src2 & 1;

    return (src1 >> 1) + (src2 >> 1) + lost_carry;
}
  275 +
  276 +#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
  277 +NEON_VOP(rhadd_s8, neon_s8, 4)
  278 +NEON_VOP(rhadd_u8, neon_u8, 4)
  279 +NEON_VOP(rhadd_s16, neon_s16, 2)
  280 +NEON_VOP(rhadd_u16, neon_u16, 2)
  281 +#undef NEON_FN
  282 +
/* Signed rounding halving add: (src1 + src2 + 1) >> 1 without
   intermediate overflow.  Rounding bumps the result whenever either
   operand is odd.  */
int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t round_up = (src1 | src2) & 1;

    return (src1 >> 1) + (src2 >> 1) + round_up;
}
  292 +
/* Unsigned rounding halving add: (src1 + src2 + 1) >> 1 without
   intermediate overflow; rounds up when either operand is odd.  */
uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t round_up = (src1 | src2) & 1;

    return (src1 >> 1) + (src2 >> 1) + round_up;
}
  302 +
  303 +#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
  304 +NEON_VOP(hsub_s8, neon_s8, 4)
  305 +NEON_VOP(hsub_u8, neon_u8, 4)
  306 +NEON_VOP(hsub_s16, neon_s16, 2)
  307 +NEON_VOP(hsub_u16, neon_u16, 2)
  308 +#undef NEON_FN
  309 +
/* Signed halving subtract: (src1 - src2) >> 1 without intermediate
   overflow.  Halving each operand first misses a borrow exactly when
   src1 is even and src2 is odd.  */
int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t borrow = (~src1) & src2 & 1;

    return (src1 >> 1) - (src2 >> 1) - borrow;
}
  319 +
/* Unsigned halving subtract: (src1 - src2) >> 1 without intermediate
   overflow; a borrow is applied when src1 is even and src2 odd.  */
uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t borrow = (~src1) & src2 & 1;

    return (src1 >> 1) - (src2 >> 1) - borrow;
}
  329 +
  330 +#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
  331 +NEON_VOP(cgt_s8, neon_s8, 4)
  332 +NEON_VOP(cgt_u8, neon_u8, 4)
  333 +NEON_VOP(cgt_s16, neon_s16, 2)
  334 +NEON_VOP(cgt_u16, neon_u16, 2)
  335 +NEON_VOP(cgt_s32, neon_s32, 1)
  336 +NEON_VOP(cgt_u32, neon_u32, 1)
  337 +#undef NEON_FN
  338 +
  339 +#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
  340 +NEON_VOP(cge_s8, neon_s8, 4)
  341 +NEON_VOP(cge_u8, neon_u8, 4)
  342 +NEON_VOP(cge_s16, neon_s16, 2)
  343 +NEON_VOP(cge_u16, neon_u16, 2)
  344 +NEON_VOP(cge_s32, neon_s32, 1)
  345 +NEON_VOP(cge_u32, neon_u32, 1)
  346 +#undef NEON_FN
  347 +
  348 +#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
  349 +NEON_VOP(min_s8, neon_s8, 4)
  350 +NEON_VOP(min_u8, neon_u8, 4)
  351 +NEON_VOP(min_s16, neon_s16, 2)
  352 +NEON_VOP(min_u16, neon_u16, 2)
  353 +NEON_VOP(min_s32, neon_s32, 1)
  354 +NEON_VOP(min_u32, neon_u32, 1)
  355 +NEON_POP(pmin_s8, neon_s8, 4)
  356 +NEON_POP(pmin_u8, neon_u8, 4)
  357 +NEON_POP(pmin_s16, neon_s16, 2)
  358 +NEON_POP(pmin_u16, neon_u16, 2)
  359 +#undef NEON_FN
  360 +
  361 +#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
  362 +NEON_VOP(max_s8, neon_s8, 4)
  363 +NEON_VOP(max_u8, neon_u8, 4)
  364 +NEON_VOP(max_s16, neon_s16, 2)
  365 +NEON_VOP(max_u16, neon_u16, 2)
  366 +NEON_VOP(max_s32, neon_s32, 1)
  367 +NEON_VOP(max_u32, neon_u32, 1)
  368 +NEON_POP(pmax_s8, neon_s8, 4)
  369 +NEON_POP(pmax_u8, neon_u8, 4)
  370 +NEON_POP(pmax_s16, neon_s16, 2)
  371 +NEON_POP(pmax_u16, neon_u16, 2)
  372 +#undef NEON_FN
  373 +
  374 +#define NEON_FN(dest, src1, src2) \
  375 + dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
  376 +NEON_VOP(abd_s8, neon_s8, 4)
  377 +NEON_VOP(abd_u8, neon_u8, 4)
  378 +NEON_VOP(abd_s16, neon_s16, 2)
  379 +NEON_VOP(abd_u16, neon_u16, 2)
  380 +NEON_VOP(abd_s32, neon_s32, 1)
  381 +NEON_VOP(abd_u32, neon_u32, 1)
  382 +#undef NEON_FN
  383 +
  384 +#define NEON_FN(dest, src1, src2) do { \
  385 + int8_t tmp; \
  386 + tmp = (int8_t)src2; \
  387 + if (tmp >= sizeof(src1) * 8 || tmp <= -sizeof(src1) * 8) { \
  388 + dest = 0; \
  389 + } else if (tmp < 0) { \
  390 + dest = src1 >> -tmp; \
  391 + } else { \
  392 + dest = src1 << tmp; \
  393 + }} while (0)
  394 +NEON_VOP(shl_u8, neon_u8, 4)
  395 +NEON_VOP(shl_u16, neon_u16, 2)
  396 +NEON_VOP(shl_u32, neon_u32, 1)
  397 +#undef NEON_FN
  398 +
/* Variable shift for unsigned 64-bit values.  The low byte of the
   shift operand is a signed count; negative counts shift right, and
   counts with magnitude >= 64 flush the result to zero.  */
uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
{
    int8_t count = (int8_t)shiftop;

    if (count >= 64 || count <= -64) {
        return 0;
    }
    if (count < 0) {
        return val >> -count;
    }
    return val << count;
}
  411 +
  412 +#define NEON_FN(dest, src1, src2) do { \
  413 + int8_t tmp; \
  414 + tmp = (int8_t)src2; \
  415 + if (tmp >= sizeof(src1) * 8) { \
  416 + dest = 0; \
  417 + } else if (tmp <= -sizeof(src1) * 8) { \
  418 + dest = src1 >> (sizeof(src1) * 8 - 1); \
  419 + } else if (tmp < 0) { \
  420 + dest = src1 >> -tmp; \
  421 + } else { \
  422 + dest = src1 << tmp; \
  423 + }} while (0)
  424 +NEON_VOP(shl_s8, neon_s8, 4)
  425 +NEON_VOP(shl_s16, neon_s16, 2)
  426 +NEON_VOP(shl_s32, neon_s32, 1)
  427 +#undef NEON_FN
  428 +
/* Variable shift for signed 64-bit values.  The low byte of the shift
   operand is a signed count; right shifts of magnitude >= 64 saturate
   to the sign bit, left shifts of >= 64 give zero.  */
uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
{
    int64_t val = valop;
    int8_t count = (int8_t)shiftop;

    if (count >= 64) {
        return 0;
    }
    if (count <= -64) {
        return val >> 63;
    }
    if (count < 0) {
        return val >> -count;
    }
    return val << count;
}
  444 +
  445 +#define NEON_FN(dest, src1, src2) do { \
  446 + int8_t tmp; \
  447 + tmp = (int8_t)src2; \
  448 + if (tmp >= sizeof(src1) * 8) { \
  449 + dest = 0; \
  450 + } else if (tmp < -sizeof(src1) * 8) { \
  451 + dest >>= sizeof(src1) * 8 - 1; \
  452 + } else if (tmp == -sizeof(src1) * 8) { \
  453 + dest = src1 >> (tmp - 1); \
  454 + dest++; \
  455 + src2 >>= 1; \
  456 + } else if (tmp < 0) { \
  457 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  458 + } else { \
  459 + dest = src1 << tmp; \
  460 + }} while (0)
  461 +NEON_VOP(rshl_s8, neon_s8, 4)
  462 +NEON_VOP(rshl_s16, neon_s16, 2)
  463 +NEON_VOP(rshl_s32, neon_s32, 1)
  464 +#undef NEON_FN
  465 +
/* Rounding variable shift for signed 64-bit values.  The low byte of
   the shift operand is a signed count; negative counts shift right
   with round-to-nearest.
   Fixes vs. the previous version:
     - the rounding special case tested shift == -63 instead of -64,
     - per the ARM ARM, a rounded right shift by >= 64 is always 0
       ((val + (1 << 63)) >> 64 == 0 for any signed 64-bit val), not
       the sign bit,
     - adding the rounding constant 1 << (-shift - 1) directly could
       overflow; shift by one less, round, then finish the shift.  */
uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;
    if (shift >= 64 || shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= -shift - 1;
        if (val == INT64_MAX) {
            /* Incrementing would overflow; the exact result of the
               rounded shift is 1 << 62.  */
            val = 0x4000000000000000LL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        val <<= shift;
    }
    return val;
}
  485 +
  486 +#define NEON_FN(dest, src1, src2) do { \
  487 + int8_t tmp; \
  488 + tmp = (int8_t)src2; \
  489 + if (tmp >= sizeof(src1) * 8 || tmp < -sizeof(src1) * 8) { \
  490 + dest = 0; \
  491 + } else if (tmp == -sizeof(src1) * 8) { \
  492 + dest = src1 >> (tmp - 1); \
  493 + } else if (tmp < 0) { \
  494 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  495 + } else { \
  496 + dest = src1 << tmp; \
  497 + }} while (0)
  498 +NEON_VOP(rshl_u8, neon_u8, 4)
  499 +NEON_VOP(rshl_u16, neon_u16, 2)
  500 +NEON_VOP(rshl_u32, neon_u32, 1)
  501 +#undef NEON_FN
  502 +
  503 +uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
  504 +{
  505 + int8_t shift = (uint8_t)shiftop;
  506 + if (shift >= 64 || shift < 64) {
  507 + val = 0;
  508 + } else if (shift == -64) {
  509 + /* Rounding a 1-bit result just preserves that bit. */
  510 + val >>= 63;
  511 + } if (shift < 0) {
  512 + val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
  513 + val >>= -shift;
  514 + } else {
  515 + val <<= shift;
  516 + }
  517 + return val;
  518 +}
  519 +
  520 +#define NEON_FN(dest, src1, src2) do { \
  521 + int8_t tmp; \
  522 + tmp = (int8_t)src2; \
  523 + if (tmp >= sizeof(src1) * 8) { \
  524 + if (src1) { \
  525 + SET_QC(); \
  526 + dest = ~0; \
  527 + } else { \
  528 + dest = 0; \
  529 + } \
  530 + } else if (tmp <= -sizeof(src1) * 8) { \
  531 + dest = 0; \
  532 + } else if (tmp < 0) { \
  533 + dest = src1 >> -tmp; \
  534 + } else { \
  535 + dest = src1 << tmp; \
  536 + if ((dest >> tmp) != src1) { \
  537 + SET_QC(); \
  538 + dest = ~0; \
  539 + } \
  540 + }} while (0)
  541 +NEON_VOP_ENV(qshl_u8, neon_u8, 4)
  542 +NEON_VOP_ENV(qshl_u16, neon_u16, 2)
  543 +NEON_VOP_ENV(qshl_u32, neon_u32, 1)
  544 +#undef NEON_FN
  545 +
  546 +uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
  547 +{
  548 + int8_t shift = (int8_t)shiftop;
  549 + if (shift >= 64) {
  550 + if (val) {
  551 + val = ~(uint64_t)0;
  552 + SET_QC();
  553 + } else {
  554 + val = 0;
  555 + }
  556 + } else if (shift <= -64) {
  557 + val = 0;
  558 + } else if (shift < 0) {
  559 + val >>= -shift;
  560 + } else {
  561 + uint64_t tmp = val;
  562 + val <<= shift;
  563 + if ((val >> shift) != tmp) {
  564 + SET_QC();
  565 + val = ~(uint64_t)0;
  566 + }
  567 + }
  568 + return val;
  569 +}
  570 +
  571 +#define NEON_FN(dest, src1, src2) do { \
  572 + int8_t tmp; \
  573 + tmp = (int8_t)src2; \
  574 + if (tmp >= sizeof(src1) * 8) { \
  575 + if (src1) \
  576 + SET_QC(); \
  577 + dest = src1 >> 31; \
  578 + } else if (tmp <= -sizeof(src1) * 8) { \
  579 + dest = src1 >> 31; \
  580 + } else if (tmp < 0) { \
  581 + dest = src1 >> -tmp; \
  582 + } else { \
  583 + dest = src1 << tmp; \
  584 + if ((dest >> tmp) != src1) { \
  585 + SET_QC(); \
  586 + dest = src2 >> 31; \
  587 + } \
  588 + }} while (0)
  589 +NEON_VOP_ENV(qshl_s8, neon_s8, 4)
  590 +NEON_VOP_ENV(qshl_s16, neon_s16, 2)
  591 +NEON_VOP_ENV(qshl_s32, neon_s32, 1)
  592 +#undef NEON_FN
  593 +
  594 +uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
  595 +{
  596 + int8_t shift = (uint8_t)shiftop;
  597 + int64_t val = valop;
  598 + if (shift >= 64) {
  599 + if (val) {
  600 + SET_QC();
  601 + val = (val >> 63) & ~SIGNBIT64;
  602 + }
  603 + } else if (shift <= 64) {
  604 + val >>= 63;
  605 + } else if (shift < 0) {
  606 + val >>= -shift;
  607 + } else {
  608 + int64_t tmp = val;
  609 + val <<= shift;
  610 + if ((val >> shift) != tmp) {
  611 + SET_QC();
  612 + val = (tmp >> 63) ^ ~SIGNBIT64;
  613 + }
  614 + }
  615 + return val;
  616 +}
  617 +
  618 +
  619 +/* FIXME: This is wrong. */
  620 +#define NEON_FN(dest, src1, src2) do { \
  621 + int8_t tmp; \
  622 + tmp = (int8_t)src2; \
  623 + if (tmp < 0) { \
  624 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  625 + } else { \
  626 + dest = src1 << tmp; \
  627 + if ((dest >> tmp) != src1) { \
  628 + SET_QC(); \
  629 + dest = ~0; \
  630 + } \
  631 + }} while (0)
  632 +NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
  633 +NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
  634 +NEON_VOP_ENV(qrshl_u32, neon_u32, 1)
  635 +#undef NEON_FN
  636 +
  637 +uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
  638 +{
  639 + int8_t shift = (int8_t)shiftop;
  640 + if (shift < 0) {
  641 + val = (val + (1 << (-1 - shift))) >> -shift;
  642 + } else { \
  643 + uint64_t tmp = val;
  644 + val <<= shift;
  645 + if ((val >> shift) != tmp) {
  646 + SET_QC();
  647 + val = ~0;
  648 + }
  649 + }
  650 + return val;
  651 +}
  652 +
  653 +#define NEON_FN(dest, src1, src2) do { \
  654 + int8_t tmp; \
  655 + tmp = (int8_t)src2; \
  656 + if (tmp < 0) { \
  657 + dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
  658 + } else { \
  659 + dest = src1 << tmp; \
  660 + if ((dest >> tmp) != src1) { \
  661 + SET_QC(); \
  662 + dest = src1 >> 31; \
  663 + } \
  664 + }} while (0)
  665 +NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
  666 +NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
  667 +NEON_VOP_ENV(qrshl_s32, neon_s32, 1)
  668 +#undef NEON_FN
  669 +
  670 +uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
  671 +{
  672 + int8_t shift = (uint8_t)shiftop;
  673 + int64_t val = valop;
  674 +
  675 + if (shift < 0) {
  676 + val = (val + (1 << (-1 - shift))) >> -shift;
  677 + } else {
  678 + int64_t tmp = val;;
  679 + val <<= shift;
  680 + if ((val >> shift) != tmp) {
  681 + SET_QC();
  682 + val = tmp >> 31;
  683 + }
  684 + }
  685 + return val;
  686 +}
  687 +
  688 +uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
  689 +{
  690 + uint32_t mask;
  691 + mask = (a ^ b) & 0x80808080u;
  692 + a &= ~0x80808080u;
  693 + b &= ~0x80808080u;
  694 + return (a + b) ^ mask;
  695 +}
  696 +
  697 +uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
  698 +{
  699 + uint32_t mask;
  700 + mask = (a ^ b) & 0x80008000u;
  701 + a &= ~0x80008000u;
  702 + b &= ~0x80008000u;
  703 + return (a + b) ^ mask;
  704 +}
  705 +
/* Pairwise add of adjacent lanes (VPADD).  */
#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise subtract (VSUB); 32-bit lanes need no helper.  */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise multiply (VMUL), truncated to the lane width.  */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN
  720 +
  721 +/* Polynomial multiplication is like integer multiplcation except the
  722 + partial products are XORed, not added. */
  723 +uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
  724 +{
  725 + uint32_t mask;
  726 + uint32_t result;
  727 + result = 0;
  728 + while (op1) {
  729 + mask = 0;
  730 + if (op1 & 1)
  731 + mask |= 0xff;
  732 + if (op1 & (1 << 8))
  733 + mask |= (0xff << 8);
  734 + if (op1 & (1 << 16))
  735 + mask |= (0xff << 16);
  736 + if (op1 & (1 << 24))
  737 + mask |= (0xff << 24);
  738 + result ^= op2 & mask;
  739 + op1 = (op1 >> 1) & 0x7f7f7f7f;
  740 + op2 = (op2 << 1) & 0xfefefefe;
  741 + }
  742 + return result;
  743 +}
  744 +
/* VTST: all-ones if the lanes share any set bit, else zero.  */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* VCEQ: all-ones on lane equality, else zero.  */
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
NEON_VOP(ceq_u8, neon_u8, 4)
NEON_VOP(ceq_u16, neon_u16, 2)
NEON_VOP(ceq_u32, neon_u32, 1)
#undef NEON_FN

/* VABS on 8/16-bit lanes (the 32-bit case uses HELPER(abs)).  */
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
NEON_VOP1(abs_s8, neon_s8, 4)
NEON_VOP1(abs_s16, neon_s16, 2)
#undef NEON_FN
  761 +
  762 +/* Count Leading Sign/Zero Bits. */
/* Count leading zero bits of an 8-bit value; returns 8 for x == 0.  */
static inline int do_clz8(uint8_t x)
{
    int count = 8;
    while (x != 0) {
        x >>= 1;
        count--;
    }
    return count;
}
  770 +
/* Count leading zero bits of a 16-bit value; returns 16 for x == 0.  */
static inline int do_clz16(uint16_t x)
{
    int count = 16;
    while (x != 0) {
        x >>= 1;
        count--;
    }
    return count;
}
  778 +
/* VCLZ on 8-bit lanes.  */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

/* VCLZ on 16-bit lanes.  */
#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* VCLS: count leading sign bits, excluding the sign bit itself:
   CLZ of the value (complemented first if negative) minus one.  */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
  794 +
  795 +uint32_t HELPER(neon_cls_s32)(uint32_t x)
  796 +{
  797 + int count;
  798 + if ((int32_t)x < 0)
  799 + x = ~x;
  800 + for (count = 32; x; count--)
  801 + x = x >> 1;
  802 + return count - 1;
  803 +}
  804 +
  805 +/* Bit count. */
  806 +uint32_t HELPER(neon_cnt_u8)(uint32_t x)
  807 +{
  808 + x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
  809 + x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
  810 + x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
  811 + return x;
  812 +}
  813 +
  814 +#define NEON_QDMULH16(dest, src1, src2, round) do { \
  815 + uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
  816 + if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
  817 + SET_QC(); \
  818 + tmp = (tmp >> 31) ^ ~SIGNBIT; \
  819 + } \
  820 + tmp <<= 1; \
  821 + if (round) { \
  822 + int32_t old = tmp; \
  823 + tmp += 1 << 15; \
  824 + if ((int32_t)tmp < old) { \
  825 + SET_QC(); \
  826 + tmp = SIGNBIT - 1; \
  827 + } \
  828 + } \
  829 + dest = tmp >> 16; \
  830 + } while(0)
  831 +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
  832 +NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
  833 +#undef NEON_FN
  834 +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
  835 +NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
  836 +#undef NEON_FN
  837 +#undef NEON_QDMULH16
  838 +
/* Saturating (rounding) doubling multiply-high, 32-bit lanes.  Only
   -2^31 * -2^31 overflows the doubling; that case saturates to
   INT64_MAX and sets QC.  The optional rounding add of 2^31 can also
   overflow, which saturates likewise.  */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
  864 +
/* Narrow each 16-bit lane of a 64-bit vector to 8 bits (low halves).  */
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

/* Narrow each 32-bit lane of a 64-bit vector to 16 bits (low halves).  */
uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

/* Narrow by taking the HIGH half of each 16-bit lane (VSHRN-style).  */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* Narrow by taking the HIGH half of each 32-bit lane.  */
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* High-half narrowing with rounding: add half an LSB of the result to
   each 16-bit lane before taking its high byte.  The low bits are
   masked off first, so only the rounding carry can reach the high
   half.  NOTE(review): a carry out of the topmost lane wraps — assumed
   to match the instruction's modulo semantics; confirm vs the ARM ARM.  */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* Same rounding narrow for 32-bit lanes.  */
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
  901 +
/* Narrow each 16-bit lane to 8 bits with unsigned saturation; any lane
   that saturates sets QC.  */
uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Narrow each 16-bit lane to 8 bits with signed saturation.  */
uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        /* (s >> 15) ^ 0x7f: 0x7f for positive, 0x80 for negative. */ \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
  947 +
/* Narrow the two 32-bit lanes to 16 bits with unsigned saturation.  */
uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Narrow the two 32-bit lanes to 16 bits with signed saturation.  */
uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        /* (low >> 31) ^ 0x7fff: 0x7fff or 0x8000 by sign. */
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

/* Narrow a 64-bit value to 32 bits with unsigned saturation.  */
uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Narrow a 64-bit value to 32 bits with signed saturation.  */
uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return (x >> 63) ^ 0x7fffffff;
    }
    return x;
}
  999 +
/* Widen four 8-bit lanes to 16 bits each (zero extension).  */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen four 8-bit lanes to 16 bits each (sign extension; the
   (uint16_t) cast confines the extended sign to its own lane).  */
uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen two 16-bit lanes to 32 bits each (zero extension).  */
uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

/* Widen two 16-bit lanes to 32 bits each (sign extension).  */
uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}
  1039 +
/* Lane-wise add on a 64-bit vector of 16-bit lanes: clear the lane top
   bits so carries cannot cross lanes, add, then restore the top bits
   with XOR.  */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

/* Same carry-blocking add for two 32-bit lanes.  */
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}
  1057 +
/* Pairwise add of adjacent 16-bit lanes, taking pairs from 'a' for the
   low half of the result and from 'b' for the high half; each pair sum
   is truncated back to 16 bits.  tmp/tmp2 accumulate the pair sums in
   32-bit slots before the result lanes are reassembled.  */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return ( tmp & 0xffff)
         | ((tmp >> 16) & 0xffff0000ull)
         | ((tmp2 << 16) & 0xffff00000000ull)
         | ( tmp2 & 0xffff000000000000ull);
}

/* Pairwise add of the two 32-bit lanes of each input (truncated):
   low result lane from 'a', high result lane from 'b'.  */
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}
  1079 +
/* Lane-wise subtract on 16-bit lanes: force the minuend's lane top
   bits to 1 so borrows cannot cross lanes, clear them in the
   subtrahend, then patch the true top bits back in via the XOR mask.  */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

/* Same borrow-blocking subtract for two 32-bit lanes.  */
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}
  1097 +
  1098 +uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b)
  1099 +{
  1100 + uint32_t x, y;
  1101 + uint32_t low, high;
  1102 +
  1103 + x = a;
  1104 + y = b;
  1105 + low = x + y;
  1106 + if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
  1107 + SET_QC();
  1108 + low = ((int32_t)x >> 31) ^ ~SIGNBIT;
  1109 + }
  1110 + x = a >> 32;
  1111 + y = b >> 32;
  1112 + high = x + y;
  1113 + if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
  1114 + SET_QC();
  1115 + high = ((int32_t)x >> 31) ^ ~SIGNBIT;
  1116 + }
  1117 + return low | ((uint64_t)high << 32);
  1118 +}
  1119 +
  1120 +uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b)
  1121 +{
  1122 + uint64_t result;
  1123 +
  1124 + result = a + b;
  1125 + if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
  1126 + SET_QC();
  1127 + result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
  1128 + }
  1129 + return result;
  1130 +}
  1131 +
/* Absolute difference of two lane values, interpreted as 'type' and
   widened into dest.  */
#define DO_ABD(dest, x, y, type) do { \
    type tmp_x = x; \
    type tmp_y = y; \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* VABDL.U8: widen four 8-bit absolute differences into 16-bit lanes.  */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t);
    result |= tmp << 48;
    return result;
}

/* VABDL.S8.  */
uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t);
    result |= tmp << 48;
    return result;
}

/* VABDL.U16: two 16-bit differences widened into 32-bit lanes.  */
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t);
    return result | (tmp << 32);
}

/* VABDL.S16.  */
uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t);
    return result | (tmp << 32);
}

/* VABDL.U32: one 32-bit difference widened to 64 bits.  */
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t);
    return result;
}

/* VABDL.S32.  */
uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t);
    return result;
}
#undef DO_ABD
  1198 +
/* Widening multiply.  Named type is the source type; each lane pair is
   multiplied in the wider type2 and packed into the 64-bit result.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

/* VMULL.U8: four 8x8 -> 16-bit products.  */
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* VMULL.S8: products computed signed, stored as 16-bit lanes.  */
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* VMULL.U16: two 16x16 -> 32-bit products.  */
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

/* VMULL.S16.  */
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}
  1255 +
  1256 +uint64_t HELPER(neon_negl_u16)(uint64_t x)
  1257 +{
  1258 + uint16_t tmp;
  1259 + uint64_t result;
  1260 + result = (uint16_t)-x;
  1261 + tmp = -(x >> 16);
  1262 + result |= (uint64_t)tmp << 16;
  1263 + tmp = -(x >> 32);
  1264 + result |= (uint64_t)tmp << 32;
  1265 + tmp = -(x >> 48);
  1266 + result |= (uint64_t)tmp << 48;
  1267 + return result;
  1268 +}
  1269 +
  1270 +#include <stdio.h>
  1271 +uint64_t HELPER(neon_negl_u32)(uint64_t x)
  1272 +{
  1273 + uint32_t low = -x;
  1274 + uint32_t high = -(x >> 32);
  1275 + return low | ((uint64_t)high << 32);
  1276 +}
  1277 +
/* FIXME: There should be a native op for this. */
/* Two's-complement negation of the whole 64-bit value.  */
uint64_t HELPER(neon_negl_u64)(uint64_t x)
{
    return -x;
}
  1283 +
/* Saturating sign manipulation.  */
/* ??? Make these use NEON_VOP1 */
/* Saturating absolute value of one 8-bit lane: abs(-128) is not
   representable, so it saturates to 127 and sets QC.  */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
/* VQABS.S8 across the four lanes of a 32-bit word.  */
uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

/* Saturating negation of one 8-bit lane: -(-128) saturates to 127.  */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
/* VQNEG.S8.  */
uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

/* 16-bit lane version: abs(-32768) saturates to 32767.  */
#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
/* VQABS.S16.  */
uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

/* 16-bit saturating negation.  */
#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
/* VQNEG.S16.  */
uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16
  1361 +
  1362 +uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x)
  1363 +{
  1364 + if (x == SIGNBIT) {
  1365 + SET_QC();
  1366 + x = ~SIGNBIT;
  1367 + } else if ((int32_t)x < 0) {
  1368 + x = -x;
  1369 + }
  1370 + return x;
  1371 +}
  1372 +
  1373 +uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x)
  1374 +{
  1375 + if (x == SIGNBIT) {
  1376 + SET_QC();
  1377 + x = ~SIGNBIT;
  1378 + } else {
  1379 + x = -x;
  1380 + }
  1381 + return x;
  1382 +}
  1383 +
/* NEON Float helpers. */

/* Minimum of two single-precision values, passed and returned as raw
   bit patterns.  NOTE(review): when the compare is unordered (NaN),
   the second operand is returned — confirm against the ARM ARM.  */
uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
{
    float32 f0 = vfp_itos(a);
    float32 f1 = vfp_itos(b);
    return (float32_compare_quiet(f0, f1, NFS) == -1) ? a : b;
}

/* Maximum of two single-precision values (raw bit patterns); the
   second operand is returned on an unordered compare.  */
uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
{
    float32 f0 = vfp_itos(a);
    float32 f1 = vfp_itos(b);
    return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b;
}

/* Absolute difference: subtract the smaller from the larger so the
   result is non-negative.  */
uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
{
    float32 f0 = vfp_itos(a);
    float32 f1 = vfp_itos(b);
    return vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
                    ? float32_sub(f0, f1, NFS)
                    : float32_sub(f1, f0, NFS));
}
  1407 +
/* Single-precision add on raw bit patterns, using the NEON float
   status (NFS) rounding/flags context.  */
uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
{
    return vfp_stoi(float32_add(vfp_itos(a), vfp_itos(b), NFS));
}

/* Single-precision subtract.  */
uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
{
    return vfp_stoi(float32_sub(vfp_itos(a), vfp_itos(b), NFS));
}

/* Single-precision multiply.  */
uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
{
    return vfp_stoi(float32_mul(vfp_itos(a), vfp_itos(b), NFS));
}
  1422 +
/* Floating point comparisons produce an integer result: all-ones when
   the relation holds, zero otherwise (including unordered compares,
   which return a value that fails every cmp against 0).  */
#define NEON_VOP_FCMP(name, cmp) \
uint32_t HELPER(neon_##name)(uint32_t a, uint32_t b) \
{ \
    if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) cmp 0) \
        return ~0; \
    else \
        return 0; \
}

NEON_VOP_FCMP(ceq_f32, ==)
NEON_VOP_FCMP(cge_f32, >=)
NEON_VOP_FCMP(cgt_f32, >)
  1436 +
/* VACGE: compare absolute values, all-ones if |a| >= |b|.  */
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
{
    float32 f0 = float32_abs(vfp_itos(a));
    float32 f1 = float32_abs(vfp_itos(b));
    return (float32_compare_quiet(f0, f1, NFS) >= 0) ? ~0 : 0;
}

/* VACGT: all-ones if |a| > |b|.  */
uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
{
    float32 f0 = float32_abs(vfp_itos(a));
    float32 f1 = float32_abs(vfp_itos(b));
    return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0;
}
target-arm/op.c
@@ -32,7 +32,5 @@ @@ -32,7 +32,5 @@
32 #include "op_mem.h" 32 #include "op_mem.h"
33 #endif 33 #endif
34 34
35 -#include "op_neon.h"  
36 -  
37 /* iwMMXt support */ 35 /* iwMMXt support */
38 #include "op_iwmmxt.c" 36 #include "op_iwmmxt.c"
target-arm/op_helper.c
@@ -20,6 +20,9 @@ @@ -20,6 +20,9 @@
20 #include "exec.h" 20 #include "exec.h"
21 #include "helpers.h" 21 #include "helpers.h"
22 22
  23 +#define SIGNBIT (uint32_t)0x80000000
  24 +#define SIGNBIT64 ((uint64_t)1 << 63)
  25 +
23 void raise_exception(int tt) 26 void raise_exception(int tt)
24 { 27 {
25 env->exception_index = tt; 28 env->exception_index = tt;
@@ -116,7 +119,8 @@ void tlb_fill (target_ulong addr, int is_write, int mmu_idx, void *retaddr) @@ -116,7 +119,8 @@ void tlb_fill (target_ulong addr, int is_write, int mmu_idx, void *retaddr)
116 } 119 }
117 #endif 120 #endif
118 121
119 -#define SIGNBIT (uint32_t)0x80000000 122 +/* FIXME: Pass an axplicit pointer to QF to CPUState, and move saturating
  123 + instructions into helper.c */
120 uint32_t HELPER(add_setq)(uint32_t a, uint32_t b) 124 uint32_t HELPER(add_setq)(uint32_t a, uint32_t b)
121 { 125 {
122 uint32_t res = a + b; 126 uint32_t res = a + b;
@@ -451,3 +455,114 @@ uint32_t HELPER(ror_cc)(uint32_t x, uint32_t i) @@ -451,3 +455,114 @@ uint32_t HELPER(ror_cc)(uint32_t x, uint32_t i)
451 } 455 }
452 } 456 }
453 457
  458 +uint64_t HELPER(neon_add_saturate_s64)(uint64_t src1, uint64_t src2)
  459 +{
  460 + uint64_t res;
  461 +
  462 + res = src1 + src2;
  463 + if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
  464 + env->QF = 1;
  465 + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
  466 + }
  467 + return res;
  468 +}
  469 +
  470 +uint64_t HELPER(neon_add_saturate_u64)(uint64_t src1, uint64_t src2)
  471 +{
  472 + uint64_t res;
  473 +
  474 + res = src1 + src2;
  475 + if (res < src1) {
  476 + env->QF = 1;
  477 + res = ~(uint64_t)0;
  478 + }
  479 + return res;
  480 +}
  481 +
  482 +uint64_t HELPER(neon_sub_saturate_s64)(uint64_t src1, uint64_t src2)
  483 +{
  484 + uint64_t res;
  485 +
  486 + res = src1 - src2;
  487 + if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
  488 + env->QF = 1;
  489 + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
  490 + }
  491 + return res;
  492 +}
  493 +
  494 +uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2)
  495 +{
  496 + uint64_t res;
  497 +
  498 + if (src1 < src2) {
  499 + env->QF = 1;
  500 + res = 0;
  501 + } else {
  502 + res = src1 - src2;
  503 + }
  504 + return res;
  505 +}
  506 +
  507 +/* These need to return a pair of value, so still use T0/T1. */
  508 +/* Transpose. Argument order is rather strange to avoid special casing
  509 + the translation code.
  510 + On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */
  511 +void HELPER(neon_trn_u8)(void)
  512 +{
  513 + uint32_t rd;
  514 + uint32_t rm;
  515 + rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
  516 + rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
  517 + T0 = rd;
  518 + T1 = rm;
  519 + FORCE_RET();
  520 +}
  521 +
  522 +void HELPER(neon_trn_u16)(void)
  523 +{
  524 + uint32_t rd;
  525 + uint32_t rm;
  526 + rd = (T0 << 16) | (T1 & 0xffff);
  527 + rm = (T1 >> 16) | (T0 & 0xffff0000);
  528 + T0 = rd;
  529 + T1 = rm;
  530 + FORCE_RET();
  531 +}
  532 +
  533 +/* Worker routines for zip and unzip. */
  534 +void HELPER(neon_unzip_u8)(void)
  535 +{
  536 + uint32_t rd;
  537 + uint32_t rm;
  538 + rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
  539 + | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
  540 + rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
  541 + | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
  542 + T0 = rd;
  543 + T1 = rm;
  544 + FORCE_RET();
  545 +}
  546 +
  547 +void HELPER(neon_zip_u8)(void)
  548 +{
  549 + uint32_t rd;
  550 + uint32_t rm;
  551 + rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
  552 + | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
  553 + rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
  554 + | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
  555 + T0 = rd;
  556 + T1 = rm;
  557 + FORCE_RET();
  558 +}
  559 +
  560 +void HELPER(neon_zip_u16)(void)
  561 +{
  562 + uint32_t tmp;
  563 +
  564 + tmp = (T0 & 0xffff) | (T1 << 16);
  565 + T1 = (T1 & 0xffff0000) | (T0 >> 16);
  566 + T0 = tmp;
  567 + FORCE_RET();
  568 +}
target-arm/op_neon.h deleted 100644 โ†’ 0
1 -/*  
2 - * ARM NEON vector operations.  
3 - *  
4 - * Copyright (c) 2007 CodeSourcery.  
5 - * Written by Paul Brook  
6 - *  
7 - * This code is licenced under the GPL.  
8 - */  
9 -/* Note that for NEON an "l" prefix means it is a wide operation, unlike  
10 - scalar arm ops where it means a word size operation. */  
11 -  
12 -#define SIGNBIT (uint32_t)0x80000000  
13 -/* ??? NEON ops should probably have their own float status. */  
14 -#define NFS &env->vfp.fp_status  
15 -#define NEON_OP(name) void OPPROTO op_neon_##name (void)  
16 -  
17 -/* Helper routines to perform bitwise copies between float and int. */  
18 -static inline float32 vfp_itos(uint32_t i)  
19 -{  
20 - union {  
21 - uint32_t i;  
22 - float32 s;  
23 - } v;  
24 -  
25 - v.i = i;  
26 - return v.s;  
27 -}  
28 -  
29 -static inline uint32_t vfp_stoi(float32 s)  
30 -{  
31 - union {  
32 - uint32_t i;  
33 - float32 s;  
34 - } v;  
35 -  
36 - v.s = s;  
37 - return v.i;  
38 -}  
39 -  
40 -NEON_OP(getreg_T0)  
41 -{  
42 - T0 = *(uint32_t *)((char *) env + PARAM1);  
43 -}  
44 -  
45 -NEON_OP(getreg_T1)  
46 -{  
47 - T1 = *(uint32_t *)((char *) env + PARAM1);  
48 -}  
49 -  
50 -NEON_OP(setreg_T0)  
51 -{  
52 - *(uint32_t *)((char *) env + PARAM1) = T0;  
53 -}  
54 -  
55 -NEON_OP(setreg_T1)  
56 -{  
57 - *(uint32_t *)((char *) env + PARAM1) = T1;  
58 -}  
59 -  
60 -#define NEON_TYPE1(name, type) \  
61 -typedef struct \  
62 -{ \  
63 - type v1; \  
64 -} neon_##name;  
65 -#ifdef WORDS_BIGENDIAN  
66 -#define NEON_TYPE2(name, type) \  
67 -typedef struct \  
68 -{ \  
69 - type v2; \  
70 - type v1; \  
71 -} neon_##name;  
72 -#define NEON_TYPE4(name, type) \  
73 -typedef struct \  
74 -{ \  
75 - type v4; \  
76 - type v3; \  
77 - type v2; \  
78 - type v1; \  
79 -} neon_##name;  
80 -#else  
81 -#define NEON_TYPE2(name, type) \  
82 -typedef struct \  
83 -{ \  
84 - type v1; \  
85 - type v2; \  
86 -} neon_##name;  
87 -#define NEON_TYPE4(name, type) \  
88 -typedef struct \  
89 -{ \  
90 - type v1; \  
91 - type v2; \  
92 - type v3; \  
93 - type v4; \  
94 -} neon_##name;  
95 -#endif  
96 -  
97 -NEON_TYPE4(s8, int8_t)  
98 -NEON_TYPE4(u8, uint8_t)  
99 -NEON_TYPE2(s16, int16_t)  
100 -NEON_TYPE2(u16, uint16_t)  
101 -NEON_TYPE1(s32, int32_t)  
102 -NEON_TYPE1(u32, uint32_t)  
103 -#undef NEON_TYPE4  
104 -#undef NEON_TYPE2  
105 -#undef NEON_TYPE1  
106 -  
107 -/* Copy from a uint32_t to a vector structure type. */  
108 -#define NEON_UNPACK(vtype, dest, val) do { \  
109 - union { \  
110 - vtype v; \  
111 - uint32_t i; \  
112 - } conv_u; \  
113 - conv_u.i = (val); \  
114 - dest = conv_u.v; \  
115 - } while(0)  
116 -  
117 -/* Copy from a vector structure type to a uint32_t. */  
118 -#define NEON_PACK(vtype, dest, val) do { \  
119 - union { \  
120 - vtype v; \  
121 - uint32_t i; \  
122 - } conv_u; \  
123 - conv_u.v = (val); \  
124 - dest = conv_u.i; \  
125 - } while(0)  
126 -  
127 -#define NEON_DO1 \  
128 - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);  
129 -#define NEON_DO2 \  
130 - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \  
131 - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);  
132 -#define NEON_DO4 \  
133 - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \  
134 - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \  
135 - NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \  
136 - NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);  
137 -  
138 -#define NEON_VOP(name, vtype, n) \  
139 -NEON_OP(name) \  
140 -{ \  
141 - vtype vsrc1; \  
142 - vtype vsrc2; \  
143 - vtype vdest; \  
144 - NEON_UNPACK(vtype, vsrc1, T0); \  
145 - NEON_UNPACK(vtype, vsrc2, T1); \  
146 - NEON_DO##n; \  
147 - NEON_PACK(vtype, T0, vdest); \  
148 - FORCE_RET(); \  
149 -}  
150 -  
151 -#define NEON_VOP1(name, vtype, n) \  
152 -NEON_OP(name) \  
153 -{ \  
154 - vtype vsrc1; \  
155 - vtype vdest; \  
156 - NEON_UNPACK(vtype, vsrc1, T0); \  
157 - NEON_DO##n; \  
158 - NEON_PACK(vtype, T0, vdest); \  
159 - FORCE_RET(); \  
160 -}  
161 -  
162 -/* Pairwise operations. */  
163 -/* For 32-bit elements each segment only contains a single element, so  
164 - the elementwise and pairwise operations are the same. */  
165 -#define NEON_PDO2 \  
166 - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \  
167 - NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);  
168 -#define NEON_PDO4 \  
169 - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \  
170 - NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \  
171 - NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \  
172 - NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \  
173 -  
174 -#define NEON_POP(name, vtype, n) \  
175 -NEON_OP(name) \  
176 -{ \  
177 - vtype vsrc1; \  
178 - vtype vsrc2; \  
179 - vtype vdest; \  
180 - NEON_UNPACK(vtype, vsrc1, T0); \  
181 - NEON_UNPACK(vtype, vsrc2, T1); \  
182 - NEON_PDO##n; \  
183 - NEON_PACK(vtype, T0, vdest); \  
184 - FORCE_RET(); \  
185 -}  
186 -  
187 -#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1  
188 -NEON_VOP(hadd_s8, neon_s8, 4)  
189 -NEON_VOP(hadd_u8, neon_u8, 4)  
190 -NEON_VOP(hadd_s16, neon_s16, 2)  
191 -NEON_VOP(hadd_u16, neon_u16, 2)  
192 -#undef NEON_FN  
193 -  
194 -NEON_OP(hadd_s32)  
195 -{  
196 - int32_t src1 = T0;  
197 - int32_t src2 = T1;  
198 - int32_t dest;  
199 -  
200 - dest = (src1 >> 1) + (src2 >> 1);  
201 - if (src1 & src2 & 1)  
202 - dest++;  
203 - T0 = dest;  
204 - FORCE_RET();  
205 -}  
206 -  
207 -NEON_OP(hadd_u32)  
208 -{  
209 - uint32_t src1 = T0;  
210 - uint32_t src2 = T1;  
211 - uint32_t dest;  
212 -  
213 - dest = (src1 >> 1) + (src2 >> 1);  
214 - if (src1 & src2 & 1)  
215 - dest++;  
216 - T0 = dest;  
217 - FORCE_RET();  
218 -}  
219 -  
220 -#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1  
221 -NEON_VOP(rhadd_s8, neon_s8, 4)  
222 -NEON_VOP(rhadd_u8, neon_u8, 4)  
223 -NEON_VOP(rhadd_s16, neon_s16, 2)  
224 -NEON_VOP(rhadd_u16, neon_u16, 2)  
225 -#undef NEON_FN  
226 -  
227 -NEON_OP(rhadd_s32)  
228 -{  
229 - int32_t src1 = T0;  
230 - int32_t src2 = T1;  
231 - int32_t dest;  
232 -  
233 - dest = (src1 >> 1) + (src2 >> 1);  
234 - if ((src1 | src2) & 1)  
235 - dest++;  
236 - T0 = dest;  
237 - FORCE_RET();  
238 -}  
239 -  
240 -NEON_OP(rhadd_u32)  
241 -{  
242 - uint32_t src1 = T0;  
243 - uint32_t src2 = T1;  
244 - uint32_t dest;  
245 -  
246 - dest = (src1 >> 1) + (src2 >> 1);  
247 - if ((src1 | src2) & 1)  
248 - dest++;  
249 - T0 = dest;  
250 - FORCE_RET();  
251 -}  
252 -  
253 -#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1  
254 -NEON_VOP(hsub_s8, neon_s8, 4)  
255 -NEON_VOP(hsub_u8, neon_u8, 4)  
256 -NEON_VOP(hsub_s16, neon_s16, 2)  
257 -NEON_VOP(hsub_u16, neon_u16, 2)  
258 -#undef NEON_FN  
259 -  
260 -NEON_OP(hsub_s32)  
261 -{  
262 - int32_t src1 = T0;  
263 - int32_t src2 = T1;  
264 - int32_t dest;  
265 -  
266 - dest = (src1 >> 1) - (src2 >> 1);  
267 - if ((~src1) & src2 & 1)  
268 - dest--;  
269 - T0 = dest;  
270 - FORCE_RET();  
271 -}  
272 -  
273 -NEON_OP(hsub_u32)  
274 -{  
275 - uint32_t src1 = T0;  
276 - uint32_t src2 = T1;  
277 - uint32_t dest;  
278 -  
279 - dest = (src1 >> 1) - (src2 >> 1);  
280 - if ((~src1) & src2 & 1)  
281 - dest--;  
282 - T0 = dest;  
283 - FORCE_RET();  
284 -}  
285 -  
286 -#define NEON_USAT(dest, src1, src2, type) do { \  
287 - uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \  
288 - if (tmp != (type)tmp) { \  
289 - env->QF = 1; \  
290 - dest = ~0; \  
291 - } else { \  
292 - dest = tmp; \  
293 - }} while(0)  
294 -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)  
295 -NEON_VOP(qadd_u8, neon_u8, 4)  
296 -#undef NEON_FN  
297 -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)  
298 -NEON_VOP(qadd_u16, neon_u16, 2)  
299 -#undef NEON_FN  
300 -#undef NEON_USAT  
301 -  
302 -#define NEON_SSAT(dest, src1, src2, type) do { \  
303 - int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \  
304 - if (tmp != (type)tmp) { \  
305 - env->QF = 1; \  
306 - if (src2 > 0) { \  
307 - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \  
308 - } else { \  
309 - tmp = 1 << (sizeof(type) * 8 - 1); \  
310 - } \  
311 - } \  
312 - dest = tmp; \  
313 - } while(0)  
314 -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)  
315 -NEON_VOP(qadd_s8, neon_s8, 4)  
316 -#undef NEON_FN  
317 -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)  
318 -NEON_VOP(qadd_s16, neon_s16, 2)  
319 -#undef NEON_FN  
320 -#undef NEON_SSAT  
321 -  
322 -#define NEON_USAT(dest, src1, src2, type) do { \  
323 - uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \  
324 - if (tmp != (type)tmp) { \  
325 - env->QF = 1; \  
326 - dest = 0; \  
327 - } else { \  
328 - dest = tmp; \  
329 - }} while(0)  
330 -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)  
331 -NEON_VOP(qsub_u8, neon_u8, 4)  
332 -#undef NEON_FN  
333 -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)  
334 -NEON_VOP(qsub_u16, neon_u16, 2)  
335 -#undef NEON_FN  
336 -#undef NEON_USAT  
337 -  
338 -#define NEON_SSAT(dest, src1, src2, type) do { \  
339 - int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \  
340 - if (tmp != (type)tmp) { \  
341 - env->QF = 1; \  
342 - if (src2 < 0) { \  
343 - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \  
344 - } else { \  
345 - tmp = 1 << (sizeof(type) * 8 - 1); \  
346 - } \  
347 - } \  
348 - dest = tmp; \  
349 - } while(0)  
350 -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)  
351 -NEON_VOP(qsub_s8, neon_s8, 4)  
352 -#undef NEON_FN  
353 -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)  
354 -NEON_VOP(qsub_s16, neon_s16, 2)  
355 -#undef NEON_FN  
356 -#undef NEON_SSAT  
357 -  
358 -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0  
359 -NEON_VOP(cgt_s8, neon_s8, 4)  
360 -NEON_VOP(cgt_u8, neon_u8, 4)  
361 -NEON_VOP(cgt_s16, neon_s16, 2)  
362 -NEON_VOP(cgt_u16, neon_u16, 2)  
363 -NEON_VOP(cgt_s32, neon_s32, 1)  
364 -NEON_VOP(cgt_u32, neon_u32, 1)  
365 -#undef NEON_FN  
366 -  
367 -#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0  
368 -NEON_VOP(cge_s8, neon_s8, 4)  
369 -NEON_VOP(cge_u8, neon_u8, 4)  
370 -NEON_VOP(cge_s16, neon_s16, 2)  
371 -NEON_VOP(cge_u16, neon_u16, 2)  
372 -NEON_VOP(cge_s32, neon_s32, 1)  
373 -NEON_VOP(cge_u32, neon_u32, 1)  
374 -#undef NEON_FN  
375 -  
376 -#define NEON_FN(dest, src1, src2) do { \  
377 - int8_t tmp; \  
378 - tmp = (int8_t)src2; \  
379 - if (tmp < 0) { \  
380 - dest = src1 >> -tmp; \  
381 - } else { \  
382 - dest = src1 << tmp; \  
383 - }} while (0)  
384 -NEON_VOP(shl_s8, neon_s8, 4)  
385 -NEON_VOP(shl_u8, neon_u8, 4)  
386 -NEON_VOP(shl_s16, neon_s16, 2)  
387 -NEON_VOP(shl_u16, neon_u16, 2)  
388 -NEON_VOP(shl_s32, neon_s32, 1)  
389 -NEON_VOP(shl_u32, neon_u32, 1)  
390 -#undef NEON_FN  
391 -  
392 -NEON_OP(shl_u64)  
393 -{  
394 - int8_t shift = env->vfp.scratch[0];  
395 - uint64_t val = T0 | ((uint64_t)T1 << 32);  
396 - if (shift < 0) {  
397 - val >>= -shift;  
398 - } else {  
399 - val <<= shift;  
400 - }  
401 - T0 = val;  
402 - T1 = val >> 32;  
403 - FORCE_RET();  
404 -}  
405 -  
406 -NEON_OP(shl_s64)  
407 -{  
408 - int8_t shift = env->vfp.scratch[0];  
409 - int64_t val = T0 | ((uint64_t)T1 << 32);  
410 - if (shift < 0) {  
411 - val >>= -shift;  
412 - } else {  
413 - val <<= shift;  
414 - }  
415 - T0 = val;  
416 - T1 = val >> 32;  
417 - FORCE_RET();  
418 -}  
419 -  
420 -#define NEON_FN(dest, src1, src2) do { \  
421 - int8_t tmp; \  
422 - tmp = (int8_t)src1; \  
423 - if (tmp < 0) { \  
424 - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \  
425 - } else { \  
426 - dest = src2 << tmp; \  
427 - }} while (0)  
428 -  
429 -NEON_VOP(rshl_s8, neon_s8, 4)  
430 -NEON_VOP(rshl_u8, neon_u8, 4)  
431 -NEON_VOP(rshl_s16, neon_s16, 2)  
432 -NEON_VOP(rshl_u16, neon_u16, 2)  
433 -NEON_VOP(rshl_s32, neon_s32, 1)  
434 -NEON_VOP(rshl_u32, neon_u32, 1)  
435 -#undef NEON_FN  
436 -  
437 -NEON_OP(rshl_u64)  
438 -{  
439 - int8_t shift = env->vfp.scratch[0];  
440 - uint64_t val = T0 | ((uint64_t)T1 << 32);  
441 - if (shift < 0) {  
442 - val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;  
443 - val >>= -shift;  
444 - } else {  
445 - val <<= shift;  
446 - }  
447 - T0 = val;  
448 - T1 = val >> 32;  
449 - FORCE_RET();  
450 -}  
451 -  
452 -NEON_OP(rshl_s64)  
453 -{  
454 - int8_t shift = env->vfp.scratch[0];  
455 - int64_t val = T0 | ((uint64_t)T1 << 32);  
456 - if (shift < 0) {  
457 - val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;  
458 - } else {  
459 - val <<= shift;  
460 - }  
461 - T0 = val;  
462 - T1 = val >> 32;  
463 - FORCE_RET();  
464 -}  
465 -  
466 -#define NEON_FN(dest, src1, src2) do { \  
467 - int8_t tmp; \  
468 - tmp = (int8_t)src1; \  
469 - if (tmp < 0) { \  
470 - dest = src2 >> -tmp; \  
471 - } else { \  
472 - dest = src2 << tmp; \  
473 - if ((dest >> tmp) != src2) { \  
474 - env->QF = 1; \  
475 - dest = ~0; \  
476 - } \  
477 - }} while (0)  
478 -NEON_VOP(qshl_s8, neon_s8, 4)  
479 -NEON_VOP(qshl_s16, neon_s16, 2)  
480 -NEON_VOP(qshl_s32, neon_s32, 1)  
481 -#undef NEON_FN  
482 -  
483 -NEON_OP(qshl_s64)  
484 -{  
485 - int8_t shift = env->vfp.scratch[0];  
486 - int64_t val = T0 | ((uint64_t)T1 << 32);  
487 - if (shift < 0) {  
488 - val >>= -shift;  
489 - } else {  
490 - int64_t tmp = val;  
491 - val <<= shift;  
492 - if ((val >> shift) != tmp) {  
493 - env->QF = 1;  
494 - val = (tmp >> 63) ^ 0x7fffffffffffffffULL;  
495 - }  
496 - }  
497 - T0 = val;  
498 - T1 = val >> 32;  
499 - FORCE_RET();  
500 -}  
501 -  
502 -#define NEON_FN(dest, src1, src2) do { \  
503 - int8_t tmp; \  
504 - tmp = (int8_t)src1; \  
505 - if (tmp < 0) { \  
506 - dest = src2 >> -tmp; \  
507 - } else { \  
508 - dest = src2 << tmp; \  
509 - if ((dest >> tmp) != src2) { \  
510 - env->QF = 1; \  
511 - dest = src2 >> 31; \  
512 - } \  
513 - }} while (0)  
514 -NEON_VOP(qshl_u8, neon_u8, 4)  
515 -NEON_VOP(qshl_u16, neon_u16, 2)  
516 -NEON_VOP(qshl_u32, neon_u32, 1)  
517 -#undef NEON_FN  
518 -  
519 -NEON_OP(qshl_u64)  
520 -{  
521 - int8_t shift = env->vfp.scratch[0];  
522 - uint64_t val = T0 | ((uint64_t)T1 << 32);  
523 - if (shift < 0) {  
524 - val >>= -shift;  
525 - } else {  
526 - uint64_t tmp = val;  
527 - val <<= shift;  
528 - if ((val >> shift) != tmp) {  
529 - env->QF = 1;  
530 - val = ~(uint64_t)0;  
531 - }  
532 - }  
533 - T0 = val;  
534 - T1 = val >> 32;  
535 - FORCE_RET();  
536 -}  
537 -  
538 -#define NEON_FN(dest, src1, src2) do { \  
539 - int8_t tmp; \  
540 - tmp = (int8_t)src1; \  
541 - if (tmp < 0) { \  
542 - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \  
543 - } else { \  
544 - dest = src2 << tmp; \  
545 - if ((dest >> tmp) != src2) { \  
546 - dest = ~0; \  
547 - } \  
548 - }} while (0)  
549 -NEON_VOP(qrshl_s8, neon_s8, 4)  
550 -NEON_VOP(qrshl_s16, neon_s16, 2)  
551 -NEON_VOP(qrshl_s32, neon_s32, 1)  
552 -#undef NEON_FN  
553 -  
554 -#define NEON_FN(dest, src1, src2) do { \  
555 - int8_t tmp; \  
556 - tmp = (int8_t)src1; \  
557 - if (tmp < 0) { \  
558 - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \  
559 - } else { \  
560 - dest = src2 << tmp; \  
561 - if ((dest >> tmp) != src2) { \  
562 - env->QF = 1; \  
563 - dest = src2 >> 31; \  
564 - } \  
565 - }} while (0)  
566 -NEON_VOP(qrshl_u8, neon_u8, 4)  
567 -NEON_VOP(qrshl_u16, neon_u16, 2)  
568 -NEON_VOP(qrshl_u32, neon_u32, 1)  
569 -#undef NEON_FN  
570 -  
571 -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2  
572 -NEON_VOP(max_s8, neon_s8, 4)  
573 -NEON_VOP(max_u8, neon_u8, 4)  
574 -NEON_VOP(max_s16, neon_s16, 2)  
575 -NEON_VOP(max_u16, neon_u16, 2)  
576 -NEON_VOP(max_s32, neon_s32, 1)  
577 -NEON_VOP(max_u32, neon_u32, 1)  
578 -NEON_POP(pmax_s8, neon_s8, 4)  
579 -NEON_POP(pmax_u8, neon_u8, 4)  
580 -NEON_POP(pmax_s16, neon_s16, 2)  
581 -NEON_POP(pmax_u16, neon_u16, 2)  
582 -#undef NEON_FN  
583 -  
584 -NEON_OP(max_f32)  
585 -{  
586 - float32 f0 = vfp_itos(T0);  
587 - float32 f1 = vfp_itos(T1);  
588 - T0 = (float32_compare_quiet(f0, f1, NFS) == 1) ? T0 : T1;  
589 - FORCE_RET();  
590 -}  
591 -  
592 -#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2  
593 -NEON_VOP(min_s8, neon_s8, 4)  
594 -NEON_VOP(min_u8, neon_u8, 4)  
595 -NEON_VOP(min_s16, neon_s16, 2)  
596 -NEON_VOP(min_u16, neon_u16, 2)  
597 -NEON_VOP(min_s32, neon_s32, 1)  
598 -NEON_VOP(min_u32, neon_u32, 1)  
599 -NEON_POP(pmin_s8, neon_s8, 4)  
600 -NEON_POP(pmin_u8, neon_u8, 4)  
601 -NEON_POP(pmin_s16, neon_s16, 2)  
602 -NEON_POP(pmin_u16, neon_u16, 2)  
603 -#undef NEON_FN  
604 -  
605 -NEON_OP(min_f32)  
606 -{  
607 - float32 f0 = vfp_itos(T0);  
608 - float32 f1 = vfp_itos(T1);  
609 - T0 = (float32_compare_quiet(f0, f1, NFS) == -1) ? T0 : T1;  
610 - FORCE_RET();  
611 -}  
612 -  
613 -#define NEON_FN(dest, src1, src2) \  
614 - dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)  
615 -NEON_VOP(abd_s8, neon_s8, 4)  
616 -NEON_VOP(abd_u8, neon_u8, 4)  
617 -NEON_VOP(abd_s16, neon_s16, 2)  
618 -NEON_VOP(abd_u16, neon_u16, 2)  
619 -NEON_VOP(abd_s32, neon_s32, 1)  
620 -NEON_VOP(abd_u32, neon_u32, 1)  
621 -#undef NEON_FN  
622 -  
623 -NEON_OP(abd_f32)  
624 -{  
625 - float32 f0 = vfp_itos(T0);  
626 - float32 f1 = vfp_itos(T1);  
627 - T0 = vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)  
628 - ? float32_sub(f0, f1, NFS)  
629 - : float32_sub(f1, f0, NFS));  
630 - FORCE_RET();  
631 -}  
632 -  
633 -#define NEON_FN(dest, src1, src2) dest = src1 + src2  
634 -NEON_VOP(add_u8, neon_u8, 4)  
635 -NEON_VOP(add_u16, neon_u16, 2)  
636 -NEON_POP(padd_u8, neon_u8, 4)  
637 -NEON_POP(padd_u16, neon_u16, 2)  
638 -#undef NEON_FN  
639 -  
640 -NEON_OP(add_f32)  
641 -{  
642 - T0 = vfp_stoi(float32_add(vfp_itos(T0), vfp_itos(T1), NFS));  
643 - FORCE_RET();  
644 -}  
645 -  
646 -#define NEON_FN(dest, src1, src2) dest = src1 - src2  
647 -NEON_VOP(sub_u8, neon_u8, 4)  
648 -NEON_VOP(sub_u16, neon_u16, 2)  
649 -#undef NEON_FN  
650 -  
651 -NEON_OP(sub_f32)  
652 -{  
653 - T0 = vfp_stoi(float32_sub(vfp_itos(T0), vfp_itos(T1), NFS));  
654 - FORCE_RET();  
655 -}  
656 -  
657 -#define NEON_FN(dest, src1, src2) dest = src2 - src1  
658 -NEON_VOP(rsb_u8, neon_u8, 4)  
659 -NEON_VOP(rsb_u16, neon_u16, 2)  
660 -#undef NEON_FN  
661 -  
662 -NEON_OP(rsb_f32)  
663 -{  
664 - T0 = vfp_stoi(float32_sub(vfp_itos(T1), vfp_itos(T0), NFS));  
665 - FORCE_RET();  
666 -}  
667 -  
668 -#define NEON_FN(dest, src1, src2) dest = src1 * src2  
669 -NEON_VOP(mul_u8, neon_u8, 4)  
670 -NEON_VOP(mul_u16, neon_u16, 2)  
671 -#undef NEON_FN  
672 -  
673 -NEON_OP(mul_f32)  
674 -{  
675 - T0 = vfp_stoi(float32_mul(vfp_itos(T0), vfp_itos(T1), NFS));  
676 - FORCE_RET();  
677 -}  
678 -  
679 -NEON_OP(mul_p8)  
680 -{  
681 - T0 = helper_neon_mul_p8(T0, T1);  
682 -}  
683 -  
684 -#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0  
685 -NEON_VOP(tst_u8, neon_u8, 4)  
686 -NEON_VOP(tst_u16, neon_u16, 2)  
687 -NEON_VOP(tst_u32, neon_u32, 1)  
688 -#undef NEON_FN  
689 -  
690 -#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0  
691 -NEON_VOP(ceq_u8, neon_u8, 4)  
692 -NEON_VOP(ceq_u16, neon_u16, 2)  
693 -NEON_VOP(ceq_u32, neon_u32, 1)  
694 -#undef NEON_FN  
695 -  
696 -#define NEON_QDMULH16(dest, src1, src2, round) do { \  
697 - uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \  
698 - if ((tmp ^ (tmp << 1)) & SIGNBIT) { \  
699 - env->QF = 1; \  
700 - tmp = (tmp >> 31) ^ ~SIGNBIT; \  
701 - } \  
702 - tmp <<= 1; \  
703 - if (round) { \  
704 - int32_t old = tmp; \  
705 - tmp += 1 << 15; \  
706 - if ((int32_t)tmp < old) { \  
707 - env->QF = 1; \  
708 - tmp = SIGNBIT - 1; \  
709 - } \  
710 - } \  
711 - dest = tmp >> 16; \  
712 - } while(0)  
713 -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)  
714 -NEON_VOP(qdmulh_s16, neon_s16, 2)  
715 -#undef NEON_FN  
716 -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)  
717 -NEON_VOP(qrdmulh_s16, neon_s16, 2)  
718 -#undef NEON_FN  
719 -#undef NEON_QDMULH16  
720 -  
721 -#define SIGNBIT64 ((uint64_t)1 << 63)  
722 -#define NEON_QDMULH32(dest, src1, src2, round) do { \  
723 - uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \  
724 - if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \  
725 - env->QF = 1; \  
726 - tmp = (tmp >> 63) ^ ~SIGNBIT64; \  
727 - } else { \  
728 - tmp <<= 1; \  
729 - } \  
730 - if (round) { \  
731 - int64_t old = tmp; \  
732 - tmp += (int64_t)1 << 31; \  
733 - if ((int64_t)tmp < old) { \  
734 - env->QF = 1; \  
735 - tmp = SIGNBIT64 - 1; \  
736 - } \  
737 - } \  
738 - dest = tmp >> 32; \  
739 - } while(0)  
740 -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)  
741 -NEON_VOP(qdmulh_s32, neon_s32, 1)  
742 -#undef NEON_FN  
743 -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)  
744 -NEON_VOP(qrdmulh_s32, neon_s32, 1)  
745 -#undef NEON_FN  
746 -#undef NEON_QDMULH32  
747 -  
748 -/* Floating point comparisons produce an integer result. */  
749 -#define NEON_VOP_FCMP(name, cmp) \  
750 -NEON_OP(name) \  
751 -{ \  
752 - if (float32_compare_quiet(vfp_itos(T0), vfp_itos(T1), NFS) cmp 0) \  
753 - T0 = -1; \  
754 - else \  
755 - T0 = 0; \  
756 - FORCE_RET(); \  
757 -}  
758 -  
759 -NEON_VOP_FCMP(ceq_f32, ==)  
760 -NEON_VOP_FCMP(cge_f32, >=)  
761 -NEON_VOP_FCMP(cgt_f32, >)  
762 -  
763 -NEON_OP(acge_f32)  
764 -{  
765 - float32 f0 = float32_abs(vfp_itos(T0));  
766 - float32 f1 = float32_abs(vfp_itos(T1));  
767 - T0 = (float32_compare_quiet(f0, f1,NFS) >= 0) ? -1 : 0;  
768 - FORCE_RET();  
769 -}  
770 -  
771 -NEON_OP(acgt_f32)  
772 -{  
773 - float32 f0 = float32_abs(vfp_itos(T0));  
774 - float32 f1 = float32_abs(vfp_itos(T1));  
775 - T0 = (float32_compare_quiet(f0, f1, NFS) > 0) ? -1 : 0;  
776 - FORCE_RET();  
777 -}  
778 -  
779 -/* Narrowing instructions. The named type is the destination type. */  
780 -NEON_OP(narrow_u8)  
781 -{  
782 - T0 = (T0 & 0xff) | ((T0 >> 8) & 0xff00)  
783 - | ((T1 << 16) & 0xff0000) | (T1 << 24);  
784 - FORCE_RET();  
785 -}  
786 -  
787 -NEON_OP(narrow_sat_u8)  
788 -{  
789 - neon_u16 src;  
790 - neon_u8 dest;  
791 -#define SAT8(d, s) \  
792 - if (s > 0xff) { \  
793 - d = 0xff; \  
794 - env->QF = 1; \  
795 - } else { \  
796 - d = s; \  
797 - }  
798 -  
799 - NEON_UNPACK(neon_u16, src, T0);  
800 - SAT8(dest.v1, src.v1);  
801 - SAT8(dest.v2, src.v2);  
802 - NEON_UNPACK(neon_u16, src, T1);  
803 - SAT8(dest.v3, src.v1);  
804 - SAT8(dest.v4, src.v2);  
805 - NEON_PACK(neon_u8, T0, dest);  
806 - FORCE_RET();  
807 -#undef SAT8  
808 -}  
809 -  
810 -NEON_OP(narrow_sat_s8)  
811 -{  
812 - neon_s16 src;  
813 - neon_s8 dest;  
814 -#define SAT8(d, s) \  
815 - if (s != (uint8_t)s) { \  
816 - d = (s >> 15) ^ 0x7f; \  
817 - env->QF = 1; \  
818 - } else { \  
819 - d = s; \  
820 - }  
821 -  
822 - NEON_UNPACK(neon_s16, src, T0);  
823 - SAT8(dest.v1, src.v1);  
824 - SAT8(dest.v2, src.v2);  
825 - NEON_UNPACK(neon_s16, src, T1);  
826 - SAT8(dest.v3, src.v1);  
827 - SAT8(dest.v4, src.v2);  
828 - NEON_PACK(neon_s8, T0, dest);  
829 - FORCE_RET();  
830 -#undef SAT8  
831 -}  
832 -  
833 -NEON_OP(narrow_u16)  
834 -{  
835 - T0 = (T0 & 0xffff) | (T1 << 16);  
836 -}  
837 -  
838 -NEON_OP(narrow_sat_u16)  
839 -{  
840 - if (T0 > 0xffff) {  
841 - T0 = 0xffff;  
842 - env->QF = 1;  
843 - }  
844 - if (T1 > 0xffff) {  
845 - T1 = 0xffff;  
846 - env->QF = 1;  
847 - }  
848 - T0 |= T1 << 16;  
849 - FORCE_RET();  
850 -}  
851 -  
852 -NEON_OP(narrow_sat_s16)  
853 -{  
854 - if ((int32_t)T0 != (int16_t)T0) {  
855 - T0 = ((int32_t)T0 >> 31) ^ 0x7fff;  
856 - env->QF = 1;  
857 - }  
858 - if ((int32_t)T1 != (int16_t) T1) {  
859 - T1 = ((int32_t)T1 >> 31) ^ 0x7fff;  
860 - env->QF = 1;  
861 - }  
862 - T0 = (uint16_t)T0 | (T1 << 16);  
863 - FORCE_RET();  
864 -}  
865 -  
866 -NEON_OP(narrow_sat_u32)  
867 -{  
868 - if (T1) {  
869 - T0 = 0xffffffffu;  
870 - env->QF = 1;  
871 - }  
872 - FORCE_RET();  
873 -}  
874 -  
875 -NEON_OP(narrow_sat_s32)  
876 -{  
877 - int32_t sign = (int32_t)T1 >> 31;  
878 -  
879 - if ((int32_t)T1 != sign) {  
880 - T0 = sign ^ 0x7fffffff;  
881 - env->QF = 1;  
882 - }  
883 - FORCE_RET();  
884 -}  
885 -  
886 -/* Narrowing instructions. Named type is the narrow type. */  
887 -NEON_OP(narrow_high_u8)  
888 -{  
889 - T0 = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)  
890 - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);  
891 - FORCE_RET();  
892 -}  
893 -  
894 -NEON_OP(narrow_high_u16)  
895 -{  
896 - T0 = (T0 >> 16) | (T1 & 0xffff0000);  
897 - FORCE_RET();  
898 -}  
899 -  
900 -NEON_OP(narrow_high_round_u8)  
901 -{  
902 - T0 = (((T0 + 0x80) >> 8) & 0xff) | (((T0 + 0x800000) >> 16) & 0xff00)  
903 - | (((T1 + 0x80) << 8) & 0xff0000) | ((T1 + 0x800000) & 0xff000000);  
904 - FORCE_RET();  
905 -}  
906 -  
907 -NEON_OP(narrow_high_round_u16)  
908 -{  
909 - T0 = ((T0 + 0x8000) >> 16) | ((T1 + 0x8000) & 0xffff0000);  
910 - FORCE_RET();  
911 -}  
912 -  
913 -NEON_OP(narrow_high_round_u32)  
914 -{  
915 - if (T0 >= 0x80000000u)  
916 - T0 = T1 + 1;  
917 - else  
918 - T0 = T1;  
919 - FORCE_RET();  
920 -}  
921 -  
922 -/* Widening instructions. Named type is source type. */  
923 -NEON_OP(widen_s8)  
924 -{  
925 - uint32_t src;  
926 -  
927 - src = T0;  
928 - T0 = (uint16_t)(int8_t)src | ((int8_t)(src >> 8) << 16);  
929 - T1 = (uint16_t)(int8_t)(src >> 16) | ((int8_t)(src >> 24) << 16);  
930 -}  
931 -  
932 -NEON_OP(widen_u8)  
933 -{  
934 - T1 = ((T0 >> 8) & 0xff0000) | ((T0 >> 16) & 0xff);  
935 - T0 = ((T0 << 8) & 0xff0000) | (T0 & 0xff);  
936 -}  
937 -  
938 -NEON_OP(widen_s16)  
939 -{  
940 - int32_t src;  
941 -  
942 - src = T0;  
943 - T0 = (int16_t)src;  
944 - T1 = src >> 16;  
945 -}  
946 -  
947 -NEON_OP(widen_u16)  
948 -{  
949 - T1 = T0 >> 16;  
950 - T0 &= 0xffff;  
951 -}  
952 -  
953 -NEON_OP(widen_s32)  
954 -{  
955 - T1 = (int32_t)T0 >> 31;  
956 - FORCE_RET();  
957 -}  
958 -  
959 -NEON_OP(widen_high_u8)  
960 -{  
961 - T1 = (T0 & 0xff000000) | ((T0 >> 8) & 0xff00);  
962 - T0 = ((T0 << 16) & 0xff000000) | ((T0 << 8) & 0xff00);  
963 -}  
964 -  
965 -NEON_OP(widen_high_u16)  
966 -{  
967 - T1 = T0 & 0xffff0000;  
968 - T0 <<= 16;  
969 -}  
970 -  
971 -/* Long operations. The type is the wide type. */  
972 -NEON_OP(shll_u16)  
973 -{  
974 - int shift = PARAM1;  
975 - uint32_t mask;  
976 -  
977 - mask = 0xffff >> (16 - shift);  
978 - mask |= mask << 16;  
979 - mask = ~mask;  
980 -  
981 - T0 = (T0 << shift) & mask;  
982 - T1 = (T1 << shift) & mask;  
983 - FORCE_RET();  
984 -}  
985 -  
986 -NEON_OP(shll_u64)  
987 -{  
988 - int shift = PARAM1;  
989 -  
990 - T1 <<= shift;  
991 - T1 |= T0 >> (32 - shift);  
992 - T0 <<= shift;  
993 - FORCE_RET();  
994 -}  
995 -  
996 -NEON_OP(addl_u16)  
997 -{  
998 - uint32_t tmp;  
999 - uint32_t high;  
1000 -  
1001 - tmp = env->vfp.scratch[0];  
1002 - high = (T0 >> 16) + (tmp >> 16);  
1003 - T0 = (uint16_t)(T0 + tmp);  
1004 - T0 |= (high << 16);  
1005 - tmp = env->vfp.scratch[1];  
1006 - high = (T1 >> 16) + (tmp >> 16);  
1007 - T1 = (uint16_t)(T1 + tmp);  
1008 - T1 |= (high << 16);  
1009 - FORCE_RET();  
1010 -}  
1011 -  
1012 -NEON_OP(addl_u32)  
1013 -{  
1014 - T0 += env->vfp.scratch[0];  
1015 - T1 += env->vfp.scratch[1];  
1016 - FORCE_RET();  
1017 -}  
1018 -  
1019 -NEON_OP(addl_u64)  
1020 -{  
1021 - uint64_t tmp;  
1022 - tmp = T0 | ((uint64_t)T1 << 32);  
1023 - tmp += env->vfp.scratch[0];  
1024 - tmp += (uint64_t)env->vfp.scratch[1] << 32;  
1025 - T0 = tmp;  
1026 - T1 = tmp >> 32;  
1027 - FORCE_RET();  
1028 -}  
1029 -  
1030 -NEON_OP(subl_u16)  
1031 -{  
1032 - uint32_t tmp;  
1033 - uint32_t high;  
1034 -  
1035 - tmp = env->vfp.scratch[0];  
1036 - high = (T0 >> 16) - (tmp >> 16);  
1037 - T0 = (uint16_t)(T0 - tmp);  
1038 - T0 |= (high << 16);  
1039 - tmp = env->vfp.scratch[1];  
1040 - high = (T1 >> 16) - (tmp >> 16);  
1041 - T1 = (uint16_t)(T1 - tmp);  
1042 - T1 |= (high << 16);  
1043 - FORCE_RET();  
1044 -}  
1045 -  
1046 -NEON_OP(subl_u32)  
1047 -{  
1048 - T0 -= env->vfp.scratch[0];  
1049 - T1 -= env->vfp.scratch[1];  
1050 - FORCE_RET();  
1051 -}  
1052 -  
1053 -NEON_OP(subl_u64)  
1054 -{  
1055 - uint64_t tmp;  
1056 - tmp = T0 | ((uint64_t)T1 << 32);  
1057 - tmp -= env->vfp.scratch[0];  
1058 - tmp -= (uint64_t)env->vfp.scratch[1] << 32;  
1059 - T0 = tmp;  
1060 - T1 = tmp >> 32;  
1061 - FORCE_RET();  
1062 -}  
1063 -  
1064 -#define DO_ABD(dest, x, y, type) do { \  
1065 - type tmp_x = x; \  
1066 - type tmp_y = y; \  
1067 - dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \  
1068 - } while(0)  
1069 -  
1070 -NEON_OP(abdl_u16)  
1071 -{  
1072 - uint32_t tmp;  
1073 - uint32_t low;  
1074 - uint32_t high;  
1075 -  
1076 - DO_ABD(low, T0, T1, uint8_t);  
1077 - DO_ABD(tmp, T0 >> 8, T1 >> 8, uint8_t);  
1078 - low |= tmp << 16;  
1079 - DO_ABD(high, T0 >> 16, T1 >> 16, uint8_t);  
1080 - DO_ABD(tmp, T0 >> 24, T1 >> 24, uint8_t);  
1081 - high |= tmp << 16;  
1082 - T0 = low;  
1083 - T1 = high;  
1084 - FORCE_RET();  
1085 -}  
1086 -  
1087 -NEON_OP(abdl_s16)  
1088 -{  
1089 - uint32_t tmp;  
1090 - uint32_t low;  
1091 - uint32_t high;  
1092 -  
1093 - DO_ABD(low, T0, T1, int8_t);  
1094 - DO_ABD(tmp, T0 >> 8, T1 >> 8, int8_t);  
1095 - low |= tmp << 16;  
1096 - DO_ABD(high, T0 >> 16, T1 >> 16, int8_t);  
1097 - DO_ABD(tmp, T0 >> 24, T1 >> 24, int8_t);  
1098 - high |= tmp << 16;  
1099 - T0 = low;  
1100 - T1 = high;  
1101 - FORCE_RET();  
1102 -}  
1103 -  
1104 -NEON_OP(abdl_u32)  
1105 -{  
1106 - uint32_t low;  
1107 - uint32_t high;  
1108 -  
1109 - DO_ABD(low, T0, T1, uint16_t);  
1110 - DO_ABD(high, T0 >> 16, T1 >> 16, uint16_t);  
1111 - T0 = low;  
1112 - T1 = high;  
1113 - FORCE_RET();  
1114 -}  
1115 -  
1116 -NEON_OP(abdl_s32)  
1117 -{  
1118 - uint32_t low;  
1119 - uint32_t high;  
1120 -  
1121 - DO_ABD(low, T0, T1, int16_t);  
1122 - DO_ABD(high, T0 >> 16, T1 >> 16, int16_t);  
1123 - T0 = low;  
1124 - T1 = high;  
1125 - FORCE_RET();  
1126 -}  
1127 -  
1128 -NEON_OP(abdl_u64)  
1129 -{  
1130 - DO_ABD(T0, T0, T1, uint32_t);  
1131 - T1 = 0;  
1132 -}  
1133 -  
1134 -NEON_OP(abdl_s64)  
1135 -{  
1136 - DO_ABD(T0, T0, T1, int32_t);  
1137 - T1 = 0;  
1138 -}  
1139 -#undef DO_ABD  
1140 -  
1141 -/* Widening multiple. Named type is the source type. */  
1142 -#define DO_MULL(dest, x, y, type1, type2) do { \  
1143 - type1 tmp_x = x; \  
1144 - type1 tmp_y = y; \  
1145 - dest = (type2)((type2)tmp_x * (type2)tmp_y); \  
1146 - } while(0)  
1147 -  
1148 -NEON_OP(mull_u8)  
1149 -{  
1150 - uint32_t tmp;  
1151 - uint32_t low;  
1152 - uint32_t high;  
1153 -  
1154 - DO_MULL(low, T0, T1, uint8_t, uint16_t);  
1155 - DO_MULL(tmp, T0 >> 8, T1 >> 8, uint8_t, uint16_t);  
1156 - low |= tmp << 16;  
1157 - DO_MULL(high, T0 >> 16, T1 >> 16, uint8_t, uint16_t);  
1158 - DO_MULL(tmp, T0 >> 24, T1 >> 24, uint8_t, uint16_t);  
1159 - high |= tmp << 16;  
1160 - T0 = low;  
1161 - T1 = high;  
1162 - FORCE_RET();  
1163 -}  
1164 -  
1165 -NEON_OP(mull_s8)  
1166 -{  
1167 - uint32_t tmp;  
1168 - uint32_t low;  
1169 - uint32_t high;  
1170 -  
1171 - DO_MULL(low, T0, T1, int8_t, uint16_t);  
1172 - DO_MULL(tmp, T0 >> 8, T1 >> 8, int8_t, uint16_t);  
1173 - low |= tmp << 16;  
1174 - DO_MULL(high, T0 >> 16, T1 >> 16, int8_t, uint16_t);  
1175 - DO_MULL(tmp, T0 >> 24, T1 >> 24, int8_t, uint16_t);  
1176 - high |= tmp << 16;  
1177 - T0 = low;  
1178 - T1 = high;  
1179 - FORCE_RET();  
1180 -}  
1181 -  
1182 -NEON_OP(mull_u16)  
1183 -{  
1184 - uint32_t low;  
1185 - uint32_t high;  
1186 -  
1187 - DO_MULL(low, T0, T1, uint16_t, uint32_t);  
1188 - DO_MULL(high, T0 >> 16, T1 >> 16, uint16_t, uint32_t);  
1189 - T0 = low;  
1190 - T1 = high;  
1191 - FORCE_RET();  
1192 -}  
1193 -  
1194 -NEON_OP(mull_s16)  
1195 -{  
1196 - uint32_t low;  
1197 - uint32_t high;  
1198 -  
1199 - DO_MULL(low, T0, T1, int16_t, uint32_t);  
1200 - DO_MULL(high, T0 >> 16, T1 >> 16, int16_t, uint32_t);  
1201 - T0 = low;  
1202 - T1 = high;  
1203 - FORCE_RET();  
1204 -}  
1205 -  
1206 -NEON_OP(addl_saturate_s32)  
1207 -{  
1208 - uint32_t tmp;  
1209 - uint32_t res;  
1210 -  
1211 - tmp = env->vfp.scratch[0];  
1212 - res = T0 + tmp;  
1213 - if (((res ^ T0) & SIGNBIT) && !((T0 ^ tmp) & SIGNBIT)) {  
1214 - env->QF = 1;  
1215 - T0 = (T0 >> 31) ^ 0x7fffffff;  
1216 - } else {  
1217 - T0 = res;  
1218 - }  
1219 - tmp = env->vfp.scratch[1];  
1220 - res = T1 + tmp;  
1221 - if (((res ^ T1) & SIGNBIT) && !((T1 ^ tmp) & SIGNBIT)) {  
1222 - env->QF = 1;  
1223 - T1 = (T1 >> 31) ^ 0x7fffffff;  
1224 - } else {  
1225 - T1 = res;  
1226 - }  
1227 - FORCE_RET();  
1228 -}  
1229 -  
1230 -NEON_OP(addl_saturate_s64)  
1231 -{  
1232 - uint64_t src1;  
1233 - uint64_t src2;  
1234 - uint64_t res;  
1235 -  
1236 - src1 = T0 + ((uint64_t)T1 << 32);  
1237 - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);  
1238 - res = src1 + src2;  
1239 - if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {  
1240 - env->QF = 1;  
1241 - T0 = ~(int64_t)src1 >> 63;  
1242 - T1 = T0 ^ 0x80000000;  
1243 - } else {  
1244 - T0 = res;  
1245 - T1 = res >> 32;  
1246 - }  
1247 - FORCE_RET();  
1248 -}  
1249 -  
1250 -NEON_OP(addl_saturate_u64)  
1251 -{  
1252 - uint64_t src1;  
1253 - uint64_t src2;  
1254 - uint64_t res;  
1255 -  
1256 - src1 = T0 + ((uint64_t)T1 << 32);  
1257 - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);  
1258 - res = src1 + src2;  
1259 - if (res < src1) {  
1260 - env->QF = 1;  
1261 - T0 = 0xffffffff;  
1262 - T1 = 0xffffffff;  
1263 - } else {  
1264 - T0 = res;  
1265 - T1 = res >> 32;  
1266 - }  
1267 - FORCE_RET();  
1268 -}  
1269 -  
1270 -NEON_OP(subl_saturate_s64)  
1271 -{  
1272 - uint64_t src1;  
1273 - uint64_t src2;  
1274 - uint64_t res;  
1275 -  
1276 - src1 = T0 + ((uint64_t)T1 << 32);  
1277 - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);  
1278 - res = src1 - src2;  
1279 - if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {  
1280 - env->QF = 1;  
1281 - T0 = ~(int64_t)src1 >> 63;  
1282 - T1 = T0 ^ 0x80000000;  
1283 - } else {  
1284 - T0 = res;  
1285 - T1 = res >> 32;  
1286 - }  
1287 - FORCE_RET();  
1288 -}  
1289 -  
1290 -NEON_OP(subl_saturate_u64)  
1291 -{  
1292 - uint64_t src1;  
1293 - uint64_t src2;  
1294 - uint64_t res;  
1295 -  
1296 - src1 = T0 + ((uint64_t)T1 << 32);  
1297 - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);  
1298 - if (src1 < src2) {  
1299 - env->QF = 1;  
1300 - T0 = 0;  
1301 - T1 = 0;  
1302 - } else {  
1303 - res = src1 - src2;  
1304 - T0 = res;  
1305 - T1 = res >> 32;  
1306 - }  
1307 - FORCE_RET();  
1308 -}  
1309 -  
1310 -NEON_OP(negl_u16)  
1311 -{  
1312 - uint32_t tmp;  
1313 - tmp = T0 >> 16;  
1314 - tmp = -tmp;  
1315 - T0 = (-T0 & 0xffff) | (tmp << 16);  
1316 - tmp = T1 >> 16;  
1317 - tmp = -tmp;  
1318 - T1 = (-T1 & 0xffff) | (tmp << 16);  
1319 - FORCE_RET();  
1320 -}  
1321 -  
1322 -NEON_OP(negl_u32)  
1323 -{  
1324 - T0 = -T0;  
1325 - T1 = -T1;  
1326 - FORCE_RET();  
1327 -}  
1328 -  
1329 -NEON_OP(negl_u64)  
1330 -{  
1331 - uint64_t val;  
1332 -  
1333 - val = T0 | ((uint64_t)T1 << 32);  
1334 - val = -val;  
1335 - T0 = val;  
1336 - T1 = val >> 32;  
1337 - FORCE_RET();  
1338 -}  
1339 -  
1340 -/* Scalar operations. */  
1341 -NEON_OP(dup_low16)  
1342 -{  
1343 - T0 = (T0 & 0xffff) | (T0 << 16);  
1344 - FORCE_RET();  
1345 -}  
1346 -  
1347 -NEON_OP(dup_high16)  
1348 -{  
1349 - T0 = (T0 >> 16) | (T0 & 0xffff0000);  
1350 - FORCE_RET();  
1351 -}  
1352 -  
1353 -/* Helper for VEXT */  
1354 -NEON_OP(extract)  
1355 -{  
1356 - int shift = PARAM1;  
1357 - T0 = (T0 >> shift) | (T1 << (32 - shift));  
1358 - FORCE_RET();  
1359 -}  
1360 -  
1361 -/* Pairwise add long. Named type is source type. */  
1362 -NEON_OP(paddl_s8)  
1363 -{  
1364 - int8_t src1;  
1365 - int8_t src2;  
1366 - uint16_t result;  
1367 - src1 = T0 >> 24;  
1368 - src2 = T0 >> 16;  
1369 - result = (uint16_t)src1 + src2;  
1370 - src1 = T0 >> 8;  
1371 - src2 = T0;  
1372 - T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);  
1373 - FORCE_RET();  
1374 -}  
1375 -  
1376 -NEON_OP(paddl_u8)  
1377 -{  
1378 - uint8_t src1;  
1379 - uint8_t src2;  
1380 - uint16_t result;  
1381 - src1 = T0 >> 24;  
1382 - src2 = T0 >> 16;  
1383 - result = (uint16_t)src1 + src2;  
1384 - src1 = T0 >> 8;  
1385 - src2 = T0;  
1386 - T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);  
1387 - FORCE_RET();  
1388 -}  
1389 -  
1390 -NEON_OP(paddl_s16)  
1391 -{  
1392 - T0 = (uint32_t)(int16_t)T0 + (uint32_t)(int16_t)(T0 >> 16);  
1393 - FORCE_RET();  
1394 -}  
1395 -  
1396 -NEON_OP(paddl_u16)  
1397 -{  
1398 - T0 = (uint32_t)(uint16_t)T0 + (uint32_t)(uint16_t)(T0 >> 16);  
1399 - FORCE_RET();  
1400 -}  
1401 -  
1402 -NEON_OP(paddl_s32)  
1403 -{  
1404 - int64_t tmp;  
1405 - tmp = (int64_t)(int32_t)T0 + (int64_t)(int32_t)T1;  
1406 - T0 = tmp;  
1407 - T1 = tmp >> 32;  
1408 - FORCE_RET();  
1409 -}  
1410 -  
1411 -NEON_OP(paddl_u32)  
1412 -{  
1413 - uint64_t tmp;  
1414 - tmp = (uint64_t)T0 + (uint64_t)T1;  
1415 - T0 = tmp;  
1416 - T1 = tmp >> 32;  
1417 - FORCE_RET();  
1418 -}  
1419 -  
1420 -/* Count Leading Sign/Zero Bits. */  
1421 -static inline int do_clz8(uint8_t x)  
1422 -{  
1423 - int n;  
1424 - for (n = 8; x; n--)  
1425 - x >>= 1;  
1426 - return n;  
1427 -}  
1428 -  
1429 -static inline int do_clz16(uint16_t x)  
1430 -{  
1431 - int n;  
1432 - for (n = 16; x; n--)  
1433 - x >>= 1;  
1434 - return n;  
1435 -}  
1436 -  
1437 -NEON_OP(clz_u8)  
1438 -{  
1439 - uint32_t result;  
1440 - uint32_t tmp;  
1441 -  
1442 - tmp = T0;  
1443 - result = do_clz8(tmp);  
1444 - result |= do_clz8(tmp >> 8) << 8;  
1445 - result |= do_clz8(tmp >> 16) << 16;  
1446 - result |= do_clz8(tmp >> 24) << 24;  
1447 - T0 = result;  
1448 - FORCE_RET();  
1449 -}  
1450 -  
1451 -NEON_OP(clz_u16)  
1452 -{  
1453 - uint32_t result;  
1454 - uint32_t tmp;  
1455 - tmp = T0;  
1456 - result = do_clz16(tmp);  
1457 - result |= do_clz16(tmp >> 16) << 16;  
1458 - T0 = result;  
1459 - FORCE_RET();  
1460 -}  
1461 -  
1462 -NEON_OP(cls_s8)  
1463 -{  
1464 - uint32_t result;  
1465 - int8_t tmp;  
1466 - tmp = T0;  
1467 - result = do_clz8((tmp < 0) ? ~tmp : tmp) - 1;  
1468 - tmp = T0 >> 8;  
1469 - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 8;  
1470 - tmp = T0 >> 16;  
1471 - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 16;  
1472 - tmp = T0 >> 24;  
1473 - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 24;  
1474 - T0 = result;  
1475 - FORCE_RET();  
1476 -}  
1477 -  
1478 -NEON_OP(cls_s16)  
1479 -{  
1480 - uint32_t result;  
1481 - int16_t tmp;  
1482 - tmp = T0;  
1483 - result = do_clz16((tmp < 0) ? ~tmp : tmp) - 1;  
1484 - tmp = T0 >> 16;  
1485 - result |= (do_clz16((tmp < 0) ? ~tmp : tmp) - 1) << 16;  
1486 - T0 = result;  
1487 - FORCE_RET();  
1488 -}  
1489 -  
1490 -NEON_OP(cls_s32)  
1491 -{  
1492 - int count;  
1493 - if ((int32_t)T0 < 0)  
1494 - T0 = ~T0;  
1495 - for (count = 32; T0 > 0; count--)  
1496 - T0 = T0 >> 1;  
1497 - T0 = count - 1;  
1498 - FORCE_RET();  
1499 -}  
1500 -  
1501 -/* Bit count. */  
1502 -NEON_OP(cnt_u8)  
1503 -{  
1504 - T0 = (T0 & 0x55555555) + ((T0 >> 1) & 0x55555555);  
1505 - T0 = (T0 & 0x33333333) + ((T0 >> 2) & 0x33333333);  
1506 - T0 = (T0 & 0x0f0f0f0f) + ((T0 >> 4) & 0x0f0f0f0f);  
1507 - FORCE_RET();  
1508 -}  
1509 -  
1510 -/* Saturnating negation. */  
1511 -/* ??? Make these use NEON_VOP1 */  
1512 -#define DO_QABS8(x) do { \  
1513 - if (x == (int8_t)0x80) { \  
1514 - x = 0x7f; \  
1515 - env->QF = 1; \  
1516 - } else if (x < 0) { \  
1517 - x = -x; \  
1518 - }} while (0)  
1519 -NEON_OP(qabs_s8)  
1520 -{  
1521 - neon_s8 vec;  
1522 - NEON_UNPACK(neon_s8, vec, T0);  
1523 - DO_QABS8(vec.v1);  
1524 - DO_QABS8(vec.v2);  
1525 - DO_QABS8(vec.v3);  
1526 - DO_QABS8(vec.v4);  
1527 - NEON_PACK(neon_s8, T0, vec);  
1528 - FORCE_RET();  
1529 -}  
1530 -#undef DO_QABS8  
1531 -  
1532 -#define DO_QNEG8(x) do { \  
1533 - if (x == (int8_t)0x80) { \  
1534 - x = 0x7f; \  
1535 - env->QF = 1; \  
1536 - } else { \  
1537 - x = -x; \  
1538 - }} while (0)  
1539 -NEON_OP(qneg_s8)  
1540 -{  
1541 - neon_s8 vec;  
1542 - NEON_UNPACK(neon_s8, vec, T0);  
1543 - DO_QNEG8(vec.v1);  
1544 - DO_QNEG8(vec.v2);  
1545 - DO_QNEG8(vec.v3);  
1546 - DO_QNEG8(vec.v4);  
1547 - NEON_PACK(neon_s8, T0, vec);  
1548 - FORCE_RET();  
1549 -}  
1550 -#undef DO_QNEG8  
1551 -  
1552 -#define DO_QABS16(x) do { \  
1553 - if (x == (int16_t)0x8000) { \  
1554 - x = 0x7fff; \  
1555 - env->QF = 1; \  
1556 - } else if (x < 0) { \  
1557 - x = -x; \  
1558 - }} while (0)  
1559 -NEON_OP(qabs_s16)  
1560 -{  
1561 - neon_s16 vec;  
1562 - NEON_UNPACK(neon_s16, vec, T0);  
1563 - DO_QABS16(vec.v1);  
1564 - DO_QABS16(vec.v2);  
1565 - NEON_PACK(neon_s16, T0, vec);  
1566 - FORCE_RET();  
1567 -}  
1568 -#undef DO_QABS16  
1569 -  
1570 -#define DO_QNEG16(x) do { \  
1571 - if (x == (int16_t)0x8000) { \  
1572 - x = 0x7fff; \  
1573 - env->QF = 1; \  
1574 - } else { \  
1575 - x = -x; \  
1576 - }} while (0)  
1577 -NEON_OP(qneg_s16)  
1578 -{  
1579 - neon_s16 vec;  
1580 - NEON_UNPACK(neon_s16, vec, T0);  
1581 - DO_QNEG16(vec.v1);  
1582 - DO_QNEG16(vec.v2);  
1583 - NEON_PACK(neon_s16, T0, vec);  
1584 - FORCE_RET();  
1585 -}  
1586 -#undef DO_QNEG16  
1587 -  
1588 -NEON_OP(qabs_s32)  
1589 -{  
1590 - if (T0 == 0x80000000) {  
1591 - T0 = 0x7fffffff;  
1592 - env->QF = 1;  
1593 - } else if ((int32_t)T0 < 0) {  
1594 - T0 = -T0;  
1595 - }  
1596 - FORCE_RET();  
1597 -}  
1598 -  
1599 -NEON_OP(qneg_s32)  
1600 -{  
1601 - if (T0 == 0x80000000) {  
1602 - T0 = 0x7fffffff;  
1603 - env->QF = 1;  
1604 - } else {  
1605 - T0 = -T0;  
1606 - }  
1607 - FORCE_RET();  
1608 -}  
1609 -  
1610 -/* Unary opperations */  
1611 -#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src  
1612 -NEON_VOP1(abs_s8, neon_s8, 4)  
1613 -NEON_VOP1(abs_s16, neon_s16, 2)  
1614 -NEON_OP(abs_s32)  
1615 -{  
1616 - if ((int32_t)T0 < 0)  
1617 - T0 = -T0;  
1618 - FORCE_RET();  
1619 -}  
1620 -#undef NEON_FN  
1621 -  
1622 -/* Transpose. Argument order is rather strange to avoid special casing  
1623 - the tranlation code.  
1624 - On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */  
1625 -NEON_OP(trn_u8)  
1626 -{  
1627 - uint32_t rd;  
1628 - uint32_t rm;  
1629 - rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);  
1630 - rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);  
1631 - T0 = rd;  
1632 - T1 = rm;  
1633 - FORCE_RET();  
1634 -}  
1635 -  
1636 -NEON_OP(trn_u16)  
1637 -{  
1638 - uint32_t rd;  
1639 - uint32_t rm;  
1640 - rd = (T0 << 16) | (T1 & 0xffff);  
1641 - rm = (T1 >> 16) | (T0 & 0xffff0000);  
1642 - T0 = rd;  
1643 - T1 = rm;  
1644 - FORCE_RET();  
1645 -}  
1646 -  
1647 -/* Worker routines for zip and unzip. */  
1648 -NEON_OP(unzip_u8)  
1649 -{  
1650 - uint32_t rd;  
1651 - uint32_t rm;  
1652 - rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)  
1653 - | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);  
1654 - rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)  
1655 - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);  
1656 - T0 = rd;  
1657 - T1 = rm;  
1658 - FORCE_RET();  
1659 -}  
1660 -  
1661 -NEON_OP(zip_u8)  
1662 -{  
1663 - uint32_t rd;  
1664 - uint32_t rm;  
1665 - rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)  
1666 - | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);  
1667 - rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)  
1668 - | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);  
1669 - T0 = rd;  
1670 - T1 = rm;  
1671 - FORCE_RET();  
1672 -}  
1673 -  
1674 -NEON_OP(zip_u16)  
1675 -{  
1676 - uint32_t tmp;  
1677 -  
1678 - tmp = (T0 & 0xffff) | (T1 << 16);  
1679 - T1 = (T1 & 0xffff0000) | (T0 >> 16);  
1680 - T0 = tmp;  
1681 - FORCE_RET();  
1682 -}  
1683 -  
1684 -NEON_OP(dup_u8)  
1685 -{  
1686 - T0 = (T0 >> PARAM1) & 0xff;  
1687 - T0 |= T0 << 8;  
1688 - T0 |= T0 << 16;  
1689 - FORCE_RET();  
1690 -}