Commit 2eb323c7b6da4c1e56ab2443b15ae9cb00460b06
1 parent
ecbfbbf4
Convert NEON VZIP/VUZP/VTRN helper functions to pure TCG
The neon_trn_u8, neon_trn_u16, neon_unzip_u8, neon_zip_u8 and neon_zip_u16 helpers used fixed registers to return values. This patch replaces them with TCG code, so T0/T1 are no longer used directly by the helper functions. Bugs in the gen_neon_unzip register load code were also fixed. Signed-off-by: Filip Navara <filip.navara@gmail.com>
Showing
3 changed files
with
143 additions
and
74 deletions
target-arm/helpers.h
@@ -338,12 +338,6 @@ DEF_HELPER_2(neon_qneg_s8, i32, env, i32) | @@ -338,12 +338,6 @@ DEF_HELPER_2(neon_qneg_s8, i32, env, i32) | ||
338 | DEF_HELPER_2(neon_qneg_s16, i32, env, i32) | 338 | DEF_HELPER_2(neon_qneg_s16, i32, env, i32) |
339 | DEF_HELPER_2(neon_qneg_s32, i32, env, i32) | 339 | DEF_HELPER_2(neon_qneg_s32, i32, env, i32) |
340 | 340 | ||
341 | -DEF_HELPER_0(neon_trn_u8, void) | ||
342 | -DEF_HELPER_0(neon_trn_u16, void) | ||
343 | -DEF_HELPER_0(neon_unzip_u8, void) | ||
344 | -DEF_HELPER_0(neon_zip_u8, void) | ||
345 | -DEF_HELPER_0(neon_zip_u16, void) | ||
346 | - | ||
347 | DEF_HELPER_2(neon_min_f32, i32, i32, i32) | 341 | DEF_HELPER_2(neon_min_f32, i32, i32, i32) |
348 | DEF_HELPER_2(neon_max_f32, i32, i32, i32) | 342 | DEF_HELPER_2(neon_max_f32, i32, i32, i32) |
349 | DEF_HELPER_2(neon_abd_f32, i32, i32, i32) | 343 | DEF_HELPER_2(neon_abd_f32, i32, i32, i32) |
target-arm/op_helper.c
@@ -495,61 +495,3 @@ uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2) | @@ -495,61 +495,3 @@ uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2) | ||
495 | } | 495 | } |
496 | return res; | 496 | return res; |
497 | } | 497 | } |
498 | - | ||
499 | -/* These need to return a pair of value, so still use T0/T1. */ | ||
500 | -/* Transpose. Argument order is rather strange to avoid special casing | ||
501 | - the tranlation code. | ||
502 | - On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */ | ||
503 | -void HELPER(neon_trn_u8)(void) | ||
504 | -{ | ||
505 | - uint32_t rd; | ||
506 | - uint32_t rm; | ||
507 | - rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff); | ||
508 | - rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00); | ||
509 | - T0 = rd; | ||
510 | - T1 = rm; | ||
511 | -} | ||
512 | - | ||
513 | -void HELPER(neon_trn_u16)(void) | ||
514 | -{ | ||
515 | - uint32_t rd; | ||
516 | - uint32_t rm; | ||
517 | - rd = (T0 << 16) | (T1 & 0xffff); | ||
518 | - rm = (T1 >> 16) | (T0 & 0xffff0000); | ||
519 | - T0 = rd; | ||
520 | - T1 = rm; | ||
521 | -} | ||
522 | - | ||
523 | -/* Worker routines for zip and unzip. */ | ||
524 | -void HELPER(neon_unzip_u8)(void) | ||
525 | -{ | ||
526 | - uint32_t rd; | ||
527 | - uint32_t rm; | ||
528 | - rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00) | ||
529 | - | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000); | ||
530 | - rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00) | ||
531 | - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000); | ||
532 | - T0 = rd; | ||
533 | - T1 = rm; | ||
534 | -} | ||
535 | - | ||
536 | -void HELPER(neon_zip_u8)(void) | ||
537 | -{ | ||
538 | - uint32_t rd; | ||
539 | - uint32_t rm; | ||
540 | - rd = (T0 & 0xff) | ((T1 << 8) & 0xff00) | ||
541 | - | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000); | ||
542 | - rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00) | ||
543 | - | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000); | ||
544 | - T0 = rd; | ||
545 | - T1 = rm; | ||
546 | -} | ||
547 | - | ||
548 | -void HELPER(neon_zip_u16)(void) | ||
549 | -{ | ||
550 | - uint32_t tmp; | ||
551 | - | ||
552 | - tmp = (T0 & 0xffff) | (T1 << 16); | ||
553 | - T1 = (T1 & 0xffff0000) | (T0 >> 16); | ||
554 | - T0 = tmp; | ||
555 | -} |
target-arm/translate.c
@@ -3630,24 +3630,157 @@ static inline void gen_neon_get_scalar(int size, int reg) | @@ -3630,24 +3630,157 @@ static inline void gen_neon_get_scalar(int size, int reg) | ||
3630 | } | 3630 | } |
3631 | } | 3631 | } |
3632 | 3632 | ||
3633 | +static void gen_neon_unzip_u8(TCGv t0, TCGv t1) | ||
3634 | +{ | ||
3635 | + TCGv rd, rm, tmp; | ||
3636 | + | ||
3637 | + rd = new_tmp(); | ||
3638 | + rm = new_tmp(); | ||
3639 | + tmp = new_tmp(); | ||
3640 | + | ||
3641 | + tcg_gen_andi_i32(rd, t0, 0xff); | ||
3642 | + tcg_gen_shri_i32(tmp, t0, 8); | ||
3643 | + tcg_gen_andi_i32(tmp, tmp, 0xff00); | ||
3644 | + tcg_gen_or_i32(rd, rd, tmp); | ||
3645 | + tcg_gen_shli_i32(tmp, t1, 16); | ||
3646 | + tcg_gen_andi_i32(tmp, tmp, 0xff0000); | ||
3647 | + tcg_gen_or_i32(rd, rd, tmp); | ||
3648 | + tcg_gen_shli_i32(tmp, t1, 8); | ||
3649 | + tcg_gen_andi_i32(tmp, tmp, 0xff000000); | ||
3650 | + tcg_gen_or_i32(rd, rd, tmp); | ||
3651 | + | ||
3652 | + tcg_gen_shri_i32(rm, t0, 8); | ||
3653 | + tcg_gen_andi_i32(rm, rm, 0xff); | ||
3654 | + tcg_gen_shri_i32(tmp, t0, 16); | ||
3655 | + tcg_gen_andi_i32(tmp, tmp, 0xff00); | ||
3656 | + tcg_gen_or_i32(rm, rm, tmp); | ||
3657 | + tcg_gen_shli_i32(tmp, t1, 8); | ||
3658 | + tcg_gen_andi_i32(tmp, tmp, 0xff0000); | ||
3659 | + tcg_gen_or_i32(rm, rm, tmp); | ||
3660 | + tcg_gen_andi_i32(tmp, t1, 0xff000000); | ||
3661 | + tcg_gen_or_i32(t1, rm, tmp); | ||
3662 | + tcg_gen_mov_i32(t0, rd); | ||
3663 | + | ||
3664 | + dead_tmp(tmp); | ||
3665 | + dead_tmp(rm); | ||
3666 | + dead_tmp(rd); | ||
3667 | +} | ||
3668 | + | ||
3669 | +static void gen_neon_zip_u8(TCGv t0, TCGv t1) | ||
3670 | +{ | ||
3671 | + TCGv rd, rm, tmp; | ||
3672 | + | ||
3673 | + rd = new_tmp(); | ||
3674 | + rm = new_tmp(); | ||
3675 | + tmp = new_tmp(); | ||
3676 | + | ||
3677 | + tcg_gen_andi_i32(rd, t0, 0xff); | ||
3678 | + tcg_gen_shli_i32(tmp, t1, 8); | ||
3679 | + tcg_gen_andi_i32(tmp, tmp, 0xff00); | ||
3680 | + tcg_gen_or_i32(rd, rd, tmp); | ||
3681 | + tcg_gen_shli_i32(tmp, t0, 16); | ||
3682 | + tcg_gen_andi_i32(tmp, tmp, 0xff0000); | ||
3683 | + tcg_gen_or_i32(rd, rd, tmp); | ||
3684 | + tcg_gen_shli_i32(tmp, t1, 24); | ||
3685 | + tcg_gen_andi_i32(tmp, tmp, 0xff000000); | ||
3686 | + tcg_gen_or_i32(rd, rd, tmp); | ||
3687 | + | ||
3688 | + tcg_gen_andi_i32(rm, t1, 0xff000000); | ||
3689 | + tcg_gen_shri_i32(tmp, t0, 8); | ||
3690 | + tcg_gen_andi_i32(tmp, tmp, 0xff0000); | ||
3691 | + tcg_gen_or_i32(rm, rm, tmp); | ||
3692 | + tcg_gen_shri_i32(tmp, t1, 8); | ||
3693 | + tcg_gen_andi_i32(tmp, tmp, 0xff00); | ||
3694 | + tcg_gen_or_i32(rm, rm, tmp); | ||
3695 | + tcg_gen_shri_i32(tmp, t0, 16); | ||
3696 | + tcg_gen_andi_i32(tmp, tmp, 0xff); | ||
3697 | + tcg_gen_or_i32(t1, rm, tmp); | ||
3698 | + tcg_gen_mov_i32(t0, rd); | ||
3699 | + | ||
3700 | + dead_tmp(tmp); | ||
3701 | + dead_tmp(rm); | ||
3702 | + dead_tmp(rd); | ||
3703 | +} | ||
3704 | + | ||
3705 | +static void gen_neon_zip_u16(TCGv t0, TCGv t1) | ||
3706 | +{ | ||
3707 | + TCGv tmp, tmp2; | ||
3708 | + | ||
3709 | + tmp = new_tmp(); | ||
3710 | + tmp2 = new_tmp(); | ||
3711 | + | ||
3712 | + tcg_gen_andi_i32(tmp, t0, 0xffff); | ||
3713 | + tcg_gen_shli_i32(tmp2, t1, 16); | ||
3714 | + tcg_gen_or_i32(tmp, tmp, tmp2); | ||
3715 | + tcg_gen_andi_i32(t1, t1, 0xffff0000); | ||
3716 | + tcg_gen_shri_i32(tmp2, t0, 16); | ||
3717 | + tcg_gen_or_i32(t1, t1, tmp2); | ||
3718 | + tcg_gen_mov_i32(t0, tmp); | ||
3719 | + | ||
3720 | + dead_tmp(tmp2); | ||
3721 | + dead_tmp(tmp); | ||
3722 | +} | ||
3723 | + | ||
3633 | static void gen_neon_unzip(int reg, int q, int tmp, int size) | 3724 | static void gen_neon_unzip(int reg, int q, int tmp, int size) |
3634 | { | 3725 | { |
3635 | int n; | 3726 | int n; |
3636 | - | 3727 | + |
3637 | for (n = 0; n < q + 1; n += 2) { | 3728 | for (n = 0; n < q + 1; n += 2) { |
3638 | NEON_GET_REG(T0, reg, n); | 3729 | NEON_GET_REG(T0, reg, n); |
3639 | - NEON_GET_REG(T0, reg, n + n); | 3730 | + NEON_GET_REG(T1, reg, n + 1); |
3640 | switch (size) { | 3731 | switch (size) { |
3641 | - case 0: gen_helper_neon_unzip_u8(); break; | ||
3642 | - case 1: gen_helper_neon_zip_u16(); break; /* zip and unzip are the same. */ | 3732 | + case 0: gen_neon_unzip_u8(cpu_T[0], cpu_T[1]); break; |
3733 | + case 1: gen_neon_zip_u16(cpu_T[0], cpu_T[1]); break; /* zip and unzip are the same. */ | ||
3643 | case 2: /* no-op */; break; | 3734 | case 2: /* no-op */; break; |
3644 | default: abort(); | 3735 | default: abort(); |
3645 | } | 3736 | } |
3646 | - gen_neon_movl_scratch_T0(tmp + n); | ||
3647 | - gen_neon_movl_scratch_T1(tmp + n + 1); | 3737 | + gen_neon_movl_T0_scratch(tmp + n); |
3738 | + gen_neon_movl_T1_scratch(tmp + n + 1); | ||
3648 | } | 3739 | } |
3649 | } | 3740 | } |
3650 | 3741 | ||
3742 | +static void gen_neon_trn_u8(TCGv t0, TCGv t1) | ||
3743 | +{ | ||
3744 | + TCGv rd, tmp; | ||
3745 | + | ||
3746 | + rd = new_tmp(); | ||
3747 | + tmp = new_tmp(); | ||
3748 | + | ||
3749 | + tcg_gen_shli_i32(rd, t0, 8); | ||
3750 | + tcg_gen_andi_i32(rd, rd, 0xff00ff00); | ||
3751 | + tcg_gen_andi_i32(tmp, t1, 0x00ff00ff); | ||
3752 | + tcg_gen_or_i32(rd, rd, tmp); | ||
3753 | + | ||
3754 | + tcg_gen_shri_i32(t1, t1, 8); | ||
3755 | + tcg_gen_andi_i32(t1, t1, 0x00ff00ff); | ||
3756 | + tcg_gen_andi_i32(tmp, t0, 0xff00ff00); | ||
3757 | + tcg_gen_or_i32(t1, t1, tmp); | ||
3758 | + tcg_gen_mov_i32(t0, rd); | ||
3759 | + | ||
3760 | + dead_tmp(tmp); | ||
3761 | + dead_tmp(rd); | ||
3762 | +} | ||
3763 | + | ||
3764 | +static void gen_neon_trn_u16(TCGv t0, TCGv t1) | ||
3765 | +{ | ||
3766 | + TCGv rd, tmp; | ||
3767 | + | ||
3768 | + rd = new_tmp(); | ||
3769 | + tmp = new_tmp(); | ||
3770 | + | ||
3771 | + tcg_gen_shli_i32(rd, t0, 16); | ||
3772 | + tcg_gen_andi_i32(tmp, t1, 0xffff); | ||
3773 | + tcg_gen_or_i32(rd, rd, tmp); | ||
3774 | + tcg_gen_shri_i32(t1, t1, 16); | ||
3775 | + tcg_gen_andi_i32(tmp, t0, 0xffff0000); | ||
3776 | + tcg_gen_or_i32(t1, t1, tmp); | ||
3777 | + tcg_gen_mov_i32(t0, rd); | ||
3778 | + | ||
3779 | + dead_tmp(tmp); | ||
3780 | + dead_tmp(rd); | ||
3781 | +} | ||
3782 | + | ||
3783 | + | ||
3651 | static struct { | 3784 | static struct { |
3652 | int nregs; | 3785 | int nregs; |
3653 | int interleave; | 3786 | int interleave; |
@@ -5259,8 +5392,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) | @@ -5259,8 +5392,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) | ||
5259 | NEON_GET_REG(T0, rd, n); | 5392 | NEON_GET_REG(T0, rd, n); |
5260 | NEON_GET_REG(T1, rd, n); | 5393 | NEON_GET_REG(T1, rd, n); |
5261 | switch (size) { | 5394 | switch (size) { |
5262 | - case 0: gen_helper_neon_zip_u8(); break; | ||
5263 | - case 1: gen_helper_neon_zip_u16(); break; | 5395 | + case 0: gen_neon_zip_u8(cpu_T[0], cpu_T[1]); break; |
5396 | + case 1: gen_neon_zip_u16(cpu_T[0], cpu_T[1]); break; | ||
5264 | case 2: /* no-op */; break; | 5397 | case 2: /* no-op */; break; |
5265 | default: abort(); | 5398 | default: abort(); |
5266 | } | 5399 | } |
@@ -5445,8 +5578,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) | @@ -5445,8 +5578,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) | ||
5445 | case 33: /* VTRN */ | 5578 | case 33: /* VTRN */ |
5446 | NEON_GET_REG(T1, rd, pass); | 5579 | NEON_GET_REG(T1, rd, pass); |
5447 | switch (size) { | 5580 | switch (size) { |
5448 | - case 0: gen_helper_neon_trn_u8(); break; | ||
5449 | - case 1: gen_helper_neon_trn_u16(); break; | 5581 | + case 0: gen_neon_trn_u8(cpu_T[0], cpu_T[1]); break; |
5582 | + case 1: gen_neon_trn_u16(cpu_T[0], cpu_T[1]); break; | ||
5450 | case 2: abort(); | 5583 | case 2: abort(); |
5451 | default: return 1; | 5584 | default: return 1; |
5452 | } | 5585 | } |