Commit 2eb323c7b6da4c1e56ab2443b15ae9cb00460b06

Authored by Filip Navara
1 parent ecbfbbf4

Convert NEON VZIP/VUZP/VTRN helper functions to pure TCG

The neon_trn_u8, neon_trn_u16, neon_unzip_u8, neon_zip_u8 and neon_zip_u16
helpers used fixed registers to return values. This patch replaces them with
TCG code, so T0/T1 are no longer used directly by the helper functions.

Bugs in the gen_neon_unzip register load code were also fixed: the second
load now reads element n + 1 into T1 instead of loading element n + n into T0.

Signed-off-by: Filip Navara <filip.navara@gmail.com>
target-arm/helpers.h
@@ -338,12 +338,6 @@ DEF_HELPER_2(neon_qneg_s8, i32, env, i32) @@ -338,12 +338,6 @@ DEF_HELPER_2(neon_qneg_s8, i32, env, i32)
338 DEF_HELPER_2(neon_qneg_s16, i32, env, i32) 338 DEF_HELPER_2(neon_qneg_s16, i32, env, i32)
339 DEF_HELPER_2(neon_qneg_s32, i32, env, i32) 339 DEF_HELPER_2(neon_qneg_s32, i32, env, i32)
340 340
341 -DEF_HELPER_0(neon_trn_u8, void)  
342 -DEF_HELPER_0(neon_trn_u16, void)  
343 -DEF_HELPER_0(neon_unzip_u8, void)  
344 -DEF_HELPER_0(neon_zip_u8, void)  
345 -DEF_HELPER_0(neon_zip_u16, void)  
346 -  
347 DEF_HELPER_2(neon_min_f32, i32, i32, i32) 341 DEF_HELPER_2(neon_min_f32, i32, i32, i32)
348 DEF_HELPER_2(neon_max_f32, i32, i32, i32) 342 DEF_HELPER_2(neon_max_f32, i32, i32, i32)
349 DEF_HELPER_2(neon_abd_f32, i32, i32, i32) 343 DEF_HELPER_2(neon_abd_f32, i32, i32, i32)
target-arm/op_helper.c
@@ -495,61 +495,3 @@ uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2) @@ -495,61 +495,3 @@ uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2)
495 } 495 }
496 return res; 496 return res;
497 } 497 }
498 -  
499 -/* These need to return a pair of value, so still use T0/T1. */  
500 -/* Transpose. Argument order is rather strange to avoid special casing  
501 - the tranlation code.  
502 - On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */  
503 -void HELPER(neon_trn_u8)(void)  
504 -{  
505 - uint32_t rd;  
506 - uint32_t rm;  
507 - rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);  
508 - rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);  
509 - T0 = rd;  
510 - T1 = rm;  
511 -}  
512 -  
513 -void HELPER(neon_trn_u16)(void)  
514 -{  
515 - uint32_t rd;  
516 - uint32_t rm;  
517 - rd = (T0 << 16) | (T1 & 0xffff);  
518 - rm = (T1 >> 16) | (T0 & 0xffff0000);  
519 - T0 = rd;  
520 - T1 = rm;  
521 -}  
522 -  
523 -/* Worker routines for zip and unzip. */  
524 -void HELPER(neon_unzip_u8)(void)  
525 -{  
526 - uint32_t rd;  
527 - uint32_t rm;  
528 - rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)  
529 - | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);  
530 - rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)  
531 - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);  
532 - T0 = rd;  
533 - T1 = rm;  
534 -}  
535 -  
536 -void HELPER(neon_zip_u8)(void)  
537 -{  
538 - uint32_t rd;  
539 - uint32_t rm;  
540 - rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)  
541 - | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);  
542 - rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)  
543 - | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);  
544 - T0 = rd;  
545 - T1 = rm;  
546 -}  
547 -  
548 -void HELPER(neon_zip_u16)(void)  
549 -{  
550 - uint32_t tmp;  
551 -  
552 - tmp = (T0 & 0xffff) | (T1 << 16);  
553 - T1 = (T1 & 0xffff0000) | (T0 >> 16);  
554 - T0 = tmp;  
555 -}  
target-arm/translate.c
@@ -3630,24 +3630,157 @@ static inline void gen_neon_get_scalar(int size, int reg) @@ -3630,24 +3630,157 @@ static inline void gen_neon_get_scalar(int size, int reg)
3630 } 3630 }
3631 } 3631 }
3632 3632
  3633 +static void gen_neon_unzip_u8(TCGv t0, TCGv t1)
  3634 +{
  3635 + TCGv rd, rm, tmp;
  3636 +
  3637 + rd = new_tmp();
  3638 + rm = new_tmp();
  3639 + tmp = new_tmp();
  3640 +
  3641 + tcg_gen_andi_i32(rd, t0, 0xff);
  3642 + tcg_gen_shri_i32(tmp, t0, 8);
  3643 + tcg_gen_andi_i32(tmp, tmp, 0xff00);
  3644 + tcg_gen_or_i32(rd, rd, tmp);
  3645 + tcg_gen_shli_i32(tmp, t1, 16);
  3646 + tcg_gen_andi_i32(tmp, tmp, 0xff0000);
  3647 + tcg_gen_or_i32(rd, rd, tmp);
  3648 + tcg_gen_shli_i32(tmp, t1, 8);
  3649 + tcg_gen_andi_i32(tmp, tmp, 0xff000000);
  3650 + tcg_gen_or_i32(rd, rd, tmp);
  3651 +
  3652 + tcg_gen_shri_i32(rm, t0, 8);
  3653 + tcg_gen_andi_i32(rm, rm, 0xff);
  3654 + tcg_gen_shri_i32(tmp, t0, 16);
  3655 + tcg_gen_andi_i32(tmp, tmp, 0xff00);
  3656 + tcg_gen_or_i32(rm, rm, tmp);
  3657 + tcg_gen_shli_i32(tmp, t1, 8);
  3658 + tcg_gen_andi_i32(tmp, tmp, 0xff0000);
  3659 + tcg_gen_or_i32(rm, rm, tmp);
  3660 + tcg_gen_andi_i32(tmp, t1, 0xff000000);
  3661 + tcg_gen_or_i32(t1, rm, tmp);
  3662 + tcg_gen_mov_i32(t0, rd);
  3663 +
  3664 + dead_tmp(tmp);
  3665 + dead_tmp(rm);
  3666 + dead_tmp(rd);
  3667 +}
  3668 +
  3669 +static void gen_neon_zip_u8(TCGv t0, TCGv t1)
  3670 +{
  3671 + TCGv rd, rm, tmp;
  3672 +
  3673 + rd = new_tmp();
  3674 + rm = new_tmp();
  3675 + tmp = new_tmp();
  3676 +
  3677 + tcg_gen_andi_i32(rd, t0, 0xff);
  3678 + tcg_gen_shli_i32(tmp, t1, 8);
  3679 + tcg_gen_andi_i32(tmp, tmp, 0xff00);
  3680 + tcg_gen_or_i32(rd, rd, tmp);
  3681 + tcg_gen_shli_i32(tmp, t0, 16);
  3682 + tcg_gen_andi_i32(tmp, tmp, 0xff0000);
  3683 + tcg_gen_or_i32(rd, rd, tmp);
  3684 + tcg_gen_shli_i32(tmp, t1, 24);
  3685 + tcg_gen_andi_i32(tmp, tmp, 0xff000000);
  3686 + tcg_gen_or_i32(rd, rd, tmp);
  3687 +
  3688 + tcg_gen_andi_i32(rm, t1, 0xff000000);
  3689 + tcg_gen_shri_i32(tmp, t0, 8);
  3690 + tcg_gen_andi_i32(tmp, tmp, 0xff0000);
  3691 + tcg_gen_or_i32(rm, rm, tmp);
  3692 + tcg_gen_shri_i32(tmp, t1, 8);
  3693 + tcg_gen_andi_i32(tmp, tmp, 0xff00);
  3694 + tcg_gen_or_i32(rm, rm, tmp);
  3695 + tcg_gen_shri_i32(tmp, t0, 16);
  3696 + tcg_gen_andi_i32(tmp, tmp, 0xff);
  3697 + tcg_gen_or_i32(t1, rm, tmp);
  3698 + tcg_gen_mov_i32(t0, rd);
  3699 +
  3700 + dead_tmp(tmp);
  3701 + dead_tmp(rm);
  3702 + dead_tmp(rd);
  3703 +}
  3704 +
  3705 +static void gen_neon_zip_u16(TCGv t0, TCGv t1)
  3706 +{
  3707 + TCGv tmp, tmp2;
  3708 +
  3709 + tmp = new_tmp();
  3710 + tmp2 = new_tmp();
  3711 +
  3712 + tcg_gen_andi_i32(tmp, t0, 0xffff);
  3713 + tcg_gen_shli_i32(tmp2, t1, 16);
  3714 + tcg_gen_or_i32(tmp, tmp, tmp2);
  3715 + tcg_gen_andi_i32(t1, t1, 0xffff0000);
  3716 + tcg_gen_shri_i32(tmp2, t0, 16);
  3717 + tcg_gen_or_i32(t1, t1, tmp2);
  3718 + tcg_gen_mov_i32(t0, tmp);
  3719 +
  3720 + dead_tmp(tmp2);
  3721 + dead_tmp(tmp);
  3722 +}
  3723 +
3633 static void gen_neon_unzip(int reg, int q, int tmp, int size) 3724 static void gen_neon_unzip(int reg, int q, int tmp, int size)
3634 { 3725 {
3635 int n; 3726 int n;
3636 - 3727 +
3637 for (n = 0; n < q + 1; n += 2) { 3728 for (n = 0; n < q + 1; n += 2) {
3638 NEON_GET_REG(T0, reg, n); 3729 NEON_GET_REG(T0, reg, n);
3639 - NEON_GET_REG(T0, reg, n + n); 3730 + NEON_GET_REG(T1, reg, n + 1);
3640 switch (size) { 3731 switch (size) {
3641 - case 0: gen_helper_neon_unzip_u8(); break;  
3642 - case 1: gen_helper_neon_zip_u16(); break; /* zip and unzip are the same. */ 3732 + case 0: gen_neon_unzip_u8(cpu_T[0], cpu_T[1]); break;
  3733 + case 1: gen_neon_zip_u16(cpu_T[0], cpu_T[1]); break; /* zip and unzip are the same. */
3643 case 2: /* no-op */; break; 3734 case 2: /* no-op */; break;
3644 default: abort(); 3735 default: abort();
3645 } 3736 }
3646 - gen_neon_movl_scratch_T0(tmp + n);  
3647 - gen_neon_movl_scratch_T1(tmp + n + 1); 3737 + gen_neon_movl_T0_scratch(tmp + n);
  3738 + gen_neon_movl_T1_scratch(tmp + n + 1);
3648 } 3739 }
3649 } 3740 }
3650 3741
  3742 +static void gen_neon_trn_u8(TCGv t0, TCGv t1)
  3743 +{
  3744 + TCGv rd, tmp;
  3745 +
  3746 + rd = new_tmp();
  3747 + tmp = new_tmp();
  3748 +
  3749 + tcg_gen_shli_i32(rd, t0, 8);
  3750 + tcg_gen_andi_i32(rd, rd, 0xff00ff00);
  3751 + tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
  3752 + tcg_gen_or_i32(rd, rd, tmp);
  3753 +
  3754 + tcg_gen_shri_i32(t1, t1, 8);
  3755 + tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
  3756 + tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
  3757 + tcg_gen_or_i32(t1, t1, tmp);
  3758 + tcg_gen_mov_i32(t0, rd);
  3759 +
  3760 + dead_tmp(tmp);
  3761 + dead_tmp(rd);
  3762 +}
  3763 +
  3764 +static void gen_neon_trn_u16(TCGv t0, TCGv t1)
  3765 +{
  3766 + TCGv rd, tmp;
  3767 +
  3768 + rd = new_tmp();
  3769 + tmp = new_tmp();
  3770 +
  3771 + tcg_gen_shli_i32(rd, t0, 16);
  3772 + tcg_gen_andi_i32(tmp, t1, 0xffff);
  3773 + tcg_gen_or_i32(rd, rd, tmp);
  3774 + tcg_gen_shri_i32(t1, t1, 16);
  3775 + tcg_gen_andi_i32(tmp, t0, 0xffff0000);
  3776 + tcg_gen_or_i32(t1, t1, tmp);
  3777 + tcg_gen_mov_i32(t0, rd);
  3778 +
  3779 + dead_tmp(tmp);
  3780 + dead_tmp(rd);
  3781 +}
  3782 +
  3783 +
3651 static struct { 3784 static struct {
3652 int nregs; 3785 int nregs;
3653 int interleave; 3786 int interleave;
@@ -5259,8 +5392,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -5259,8 +5392,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5259 NEON_GET_REG(T0, rd, n); 5392 NEON_GET_REG(T0, rd, n);
5260 NEON_GET_REG(T1, rd, n); 5393 NEON_GET_REG(T1, rd, n);
5261 switch (size) { 5394 switch (size) {
5262 - case 0: gen_helper_neon_zip_u8(); break;  
5263 - case 1: gen_helper_neon_zip_u16(); break; 5395 + case 0: gen_neon_zip_u8(cpu_T[0], cpu_T[1]); break;
  5396 + case 1: gen_neon_zip_u16(cpu_T[0], cpu_T[1]); break;
5264 case 2: /* no-op */; break; 5397 case 2: /* no-op */; break;
5265 default: abort(); 5398 default: abort();
5266 } 5399 }
@@ -5445,8 +5578,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -5445,8 +5578,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
5445 case 33: /* VTRN */ 5578 case 33: /* VTRN */
5446 NEON_GET_REG(T1, rd, pass); 5579 NEON_GET_REG(T1, rd, pass);
5447 switch (size) { 5580 switch (size) {
5448 - case 0: gen_helper_neon_trn_u8(); break;  
5449 - case 1: gen_helper_neon_trn_u16(); break; 5581 + case 0: gen_neon_trn_u8(cpu_T[0], cpu_T[1]); break;
  5582 + case 1: gen_neon_trn_u16(cpu_T[0], cpu_T[1]); break;
5450 case 2: abort(); 5583 case 2: abort();
5451 default: return 1; 5584 default: return 1;
5452 } 5585 }