Commit 2eb323c7b6da4c1e56ab2443b15ae9cb00460b06

Authored by Filip Navara
1 parent ecbfbbf4

Convert NEON VZIP/VUZP/VTRN helper functions to pure TCG

The neon_trn_u8, neon_trn_u16, neon_unzip_u8, neon_zip_u8 and neon_zip_u16
helpers used fixed registers to return values. This patch replaces that with
TCG code, so T0/T1 are no longer directly used by the helper functions.

Bugs in the gen_neon_unzip register load code were also fixed.

Signed-off-by: Filip Navara <filip.navara@gmail.com>
target-arm/helpers.h
... ... @@ -338,12 +338,6 @@ DEF_HELPER_2(neon_qneg_s8, i32, env, i32)
338 338 DEF_HELPER_2(neon_qneg_s16, i32, env, i32)
339 339 DEF_HELPER_2(neon_qneg_s32, i32, env, i32)
340 340  
341   -DEF_HELPER_0(neon_trn_u8, void)
342   -DEF_HELPER_0(neon_trn_u16, void)
343   -DEF_HELPER_0(neon_unzip_u8, void)
344   -DEF_HELPER_0(neon_zip_u8, void)
345   -DEF_HELPER_0(neon_zip_u16, void)
346   -
347 341 DEF_HELPER_2(neon_min_f32, i32, i32, i32)
348 342 DEF_HELPER_2(neon_max_f32, i32, i32, i32)
349 343 DEF_HELPER_2(neon_abd_f32, i32, i32, i32)
... ...
target-arm/op_helper.c
... ... @@ -495,61 +495,3 @@ uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2)
495 495 }
496 496 return res;
497 497 }
498   -
499   -/* These need to return a pair of value, so still use T0/T1. */
500   -/* Transpose. Argument order is rather strange to avoid special casing
501   - the tranlation code.
502   - On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */
503   -void HELPER(neon_trn_u8)(void)
504   -{
505   - uint32_t rd;
506   - uint32_t rm;
507   - rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
508   - rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
509   - T0 = rd;
510   - T1 = rm;
511   -}
512   -
513   -void HELPER(neon_trn_u16)(void)
514   -{
515   - uint32_t rd;
516   - uint32_t rm;
517   - rd = (T0 << 16) | (T1 & 0xffff);
518   - rm = (T1 >> 16) | (T0 & 0xffff0000);
519   - T0 = rd;
520   - T1 = rm;
521   -}
522   -
523   -/* Worker routines for zip and unzip. */
524   -void HELPER(neon_unzip_u8)(void)
525   -{
526   - uint32_t rd;
527   - uint32_t rm;
528   - rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
529   - | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
530   - rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
531   - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
532   - T0 = rd;
533   - T1 = rm;
534   -}
535   -
536   -void HELPER(neon_zip_u8)(void)
537   -{
538   - uint32_t rd;
539   - uint32_t rm;
540   - rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
541   - | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
542   - rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
543   - | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
544   - T0 = rd;
545   - T1 = rm;
546   -}
547   -
548   -void HELPER(neon_zip_u16)(void)
549   -{
550   - uint32_t tmp;
551   -
552   - tmp = (T0 & 0xffff) | (T1 << 16);
553   - T1 = (T1 & 0xffff0000) | (T0 >> 16);
554   - T0 = tmp;
555   -}
... ...
target-arm/translate.c
... ... @@ -3630,24 +3630,157 @@ static inline void gen_neon_get_scalar(int size, int reg)
3630 3630 }
3631 3631 }
3632 3632  
  3633 +static void gen_neon_unzip_u8(TCGv t0, TCGv t1)
  3634 +{
  3635 + TCGv rd, rm, tmp;
  3636 +
  3637 + rd = new_tmp();
  3638 + rm = new_tmp();
  3639 + tmp = new_tmp();
  3640 +
  3641 + tcg_gen_andi_i32(rd, t0, 0xff);
  3642 + tcg_gen_shri_i32(tmp, t0, 8);
  3643 + tcg_gen_andi_i32(tmp, tmp, 0xff00);
  3644 + tcg_gen_or_i32(rd, rd, tmp);
  3645 + tcg_gen_shli_i32(tmp, t1, 16);
  3646 + tcg_gen_andi_i32(tmp, tmp, 0xff0000);
  3647 + tcg_gen_or_i32(rd, rd, tmp);
  3648 + tcg_gen_shli_i32(tmp, t1, 8);
  3649 + tcg_gen_andi_i32(tmp, tmp, 0xff000000);
  3650 + tcg_gen_or_i32(rd, rd, tmp);
  3651 +
  3652 + tcg_gen_shri_i32(rm, t0, 8);
  3653 + tcg_gen_andi_i32(rm, rm, 0xff);
  3654 + tcg_gen_shri_i32(tmp, t0, 16);
  3655 + tcg_gen_andi_i32(tmp, tmp, 0xff00);
  3656 + tcg_gen_or_i32(rm, rm, tmp);
  3657 + tcg_gen_shli_i32(tmp, t1, 8);
  3658 + tcg_gen_andi_i32(tmp, tmp, 0xff0000);
  3659 + tcg_gen_or_i32(rm, rm, tmp);
  3660 + tcg_gen_andi_i32(tmp, t1, 0xff000000);
  3661 + tcg_gen_or_i32(t1, rm, tmp);
  3662 + tcg_gen_mov_i32(t0, rd);
  3663 +
  3664 + dead_tmp(tmp);
  3665 + dead_tmp(rm);
  3666 + dead_tmp(rd);
  3667 +}
  3668 +
  3669 +static void gen_neon_zip_u8(TCGv t0, TCGv t1)
  3670 +{
  3671 + TCGv rd, rm, tmp;
  3672 +
  3673 + rd = new_tmp();
  3674 + rm = new_tmp();
  3675 + tmp = new_tmp();
  3676 +
  3677 + tcg_gen_andi_i32(rd, t0, 0xff);
  3678 + tcg_gen_shli_i32(tmp, t1, 8);
  3679 + tcg_gen_andi_i32(tmp, tmp, 0xff00);
  3680 + tcg_gen_or_i32(rd, rd, tmp);
  3681 + tcg_gen_shli_i32(tmp, t0, 16);
  3682 + tcg_gen_andi_i32(tmp, tmp, 0xff0000);
  3683 + tcg_gen_or_i32(rd, rd, tmp);
  3684 + tcg_gen_shli_i32(tmp, t1, 24);
  3685 + tcg_gen_andi_i32(tmp, tmp, 0xff000000);
  3686 + tcg_gen_or_i32(rd, rd, tmp);
  3687 +
  3688 + tcg_gen_andi_i32(rm, t1, 0xff000000);
  3689 + tcg_gen_shri_i32(tmp, t0, 8);
  3690 + tcg_gen_andi_i32(tmp, tmp, 0xff0000);
  3691 + tcg_gen_or_i32(rm, rm, tmp);
  3692 + tcg_gen_shri_i32(tmp, t1, 8);
  3693 + tcg_gen_andi_i32(tmp, tmp, 0xff00);
  3694 + tcg_gen_or_i32(rm, rm, tmp);
  3695 + tcg_gen_shri_i32(tmp, t0, 16);
  3696 + tcg_gen_andi_i32(tmp, tmp, 0xff);
  3697 + tcg_gen_or_i32(t1, rm, tmp);
  3698 + tcg_gen_mov_i32(t0, rd);
  3699 +
  3700 + dead_tmp(tmp);
  3701 + dead_tmp(rm);
  3702 + dead_tmp(rd);
  3703 +}
  3704 +
  3705 +static void gen_neon_zip_u16(TCGv t0, TCGv t1)
  3706 +{
  3707 + TCGv tmp, tmp2;
  3708 +
  3709 + tmp = new_tmp();
  3710 + tmp2 = new_tmp();
  3711 +
  3712 + tcg_gen_andi_i32(tmp, t0, 0xffff);
  3713 + tcg_gen_shli_i32(tmp2, t1, 16);
  3714 + tcg_gen_or_i32(tmp, tmp, tmp2);
  3715 + tcg_gen_andi_i32(t1, t1, 0xffff0000);
  3716 + tcg_gen_shri_i32(tmp2, t0, 16);
  3717 + tcg_gen_or_i32(t1, t1, tmp2);
  3718 + tcg_gen_mov_i32(t0, tmp);
  3719 +
  3720 + dead_tmp(tmp2);
  3721 + dead_tmp(tmp);
  3722 +}
  3723 +
3633 3724 static void gen_neon_unzip(int reg, int q, int tmp, int size)
3634 3725 {
3635 3726 int n;
3636   -
  3727 +
3637 3728 for (n = 0; n < q + 1; n += 2) {
3638 3729 NEON_GET_REG(T0, reg, n);
3639   - NEON_GET_REG(T0, reg, n + n);
  3730 + NEON_GET_REG(T1, reg, n + 1);
3640 3731 switch (size) {
3641   - case 0: gen_helper_neon_unzip_u8(); break;
3642   - case 1: gen_helper_neon_zip_u16(); break; /* zip and unzip are the same. */
  3732 + case 0: gen_neon_unzip_u8(cpu_T[0], cpu_T[1]); break;
  3733 + case 1: gen_neon_zip_u16(cpu_T[0], cpu_T[1]); break; /* zip and unzip are the same. */
3643 3734 case 2: /* no-op */; break;
3644 3735 default: abort();
3645 3736 }
3646   - gen_neon_movl_scratch_T0(tmp + n);
3647   - gen_neon_movl_scratch_T1(tmp + n + 1);
  3737 + gen_neon_movl_T0_scratch(tmp + n);
  3738 + gen_neon_movl_T1_scratch(tmp + n + 1);
3648 3739 }
3649 3740 }
3650 3741  
  3742 +static void gen_neon_trn_u8(TCGv t0, TCGv t1)
  3743 +{
  3744 + TCGv rd, tmp;
  3745 +
  3746 + rd = new_tmp();
  3747 + tmp = new_tmp();
  3748 +
  3749 + tcg_gen_shli_i32(rd, t0, 8);
  3750 + tcg_gen_andi_i32(rd, rd, 0xff00ff00);
  3751 + tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
  3752 + tcg_gen_or_i32(rd, rd, tmp);
  3753 +
  3754 + tcg_gen_shri_i32(t1, t1, 8);
  3755 + tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
  3756 + tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
  3757 + tcg_gen_or_i32(t1, t1, tmp);
  3758 + tcg_gen_mov_i32(t0, rd);
  3759 +
  3760 + dead_tmp(tmp);
  3761 + dead_tmp(rd);
  3762 +}
  3763 +
  3764 +static void gen_neon_trn_u16(TCGv t0, TCGv t1)
  3765 +{
  3766 + TCGv rd, tmp;
  3767 +
  3768 + rd = new_tmp();
  3769 + tmp = new_tmp();
  3770 +
  3771 + tcg_gen_shli_i32(rd, t0, 16);
  3772 + tcg_gen_andi_i32(tmp, t1, 0xffff);
  3773 + tcg_gen_or_i32(rd, rd, tmp);
  3774 + tcg_gen_shri_i32(t1, t1, 16);
  3775 + tcg_gen_andi_i32(tmp, t0, 0xffff0000);
  3776 + tcg_gen_or_i32(t1, t1, tmp);
  3777 + tcg_gen_mov_i32(t0, rd);
  3778 +
  3779 + dead_tmp(tmp);
  3780 + dead_tmp(rd);
  3781 +}
  3782 +
  3783 +
3651 3784 static struct {
3652 3785 int nregs;
3653 3786 int interleave;
... ... @@ -5259,8 +5392,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5259 5392 NEON_GET_REG(T0, rd, n);
5260 5393 NEON_GET_REG(T1, rd, n);
5261 5394 switch (size) {
5262   - case 0: gen_helper_neon_zip_u8(); break;
5263   - case 1: gen_helper_neon_zip_u16(); break;
  5395 + case 0: gen_neon_zip_u8(cpu_T[0], cpu_T[1]); break;
  5396 + case 1: gen_neon_zip_u16(cpu_T[0], cpu_T[1]); break;
5264 5397 case 2: /* no-op */; break;
5265 5398 default: abort();
5266 5399 }
... ... @@ -5445,8 +5578,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5445 5578 case 33: /* VTRN */
5446 5579 NEON_GET_REG(T1, rd, pass);
5447 5580 switch (size) {
5448   - case 0: gen_helper_neon_trn_u8(); break;
5449   - case 1: gen_helper_neon_trn_u16(); break;
  5581 + case 0: gen_neon_trn_u8(cpu_T[0], cpu_T[1]); break;
  5582 + case 1: gen_neon_trn_u16(cpu_T[0], cpu_T[1]); break;
5450 5583 case 2: abort();
5451 5584 default: return 1;
5452 5585 }
... ...