Commit 2eb323c7b6da4c1e56ab2443b15ae9cb00460b06
1 parent
ecbfbbf4
Convert NEON VZIP/VUZP/VTRN helper functions to pure TCG
The neon_trn_u8, neon_trn_u16, neon_unzip_u8, neon_zip_u8 and neon_zip_u16 helpers used the fixed registers T0/T1 to return their values. This patch replaces those helpers with inline TCG code, so T0/T1 are no longer used directly by helper functions. Bugs in the gen_neon_unzip register load code were also fixed (the second load targeted T0 at index n + n instead of T1 at index n + 1). Signed-off-by: Filip Navara <filip.navara@gmail.com>
Showing
3 changed files
with
143 additions
and
74 deletions
target-arm/helpers.h
| ... | ... | @@ -338,12 +338,6 @@ DEF_HELPER_2(neon_qneg_s8, i32, env, i32) |
| 338 | 338 | DEF_HELPER_2(neon_qneg_s16, i32, env, i32) |
| 339 | 339 | DEF_HELPER_2(neon_qneg_s32, i32, env, i32) |
| 340 | 340 | |
| 341 | -DEF_HELPER_0(neon_trn_u8, void) | |
| 342 | -DEF_HELPER_0(neon_trn_u16, void) | |
| 343 | -DEF_HELPER_0(neon_unzip_u8, void) | |
| 344 | -DEF_HELPER_0(neon_zip_u8, void) | |
| 345 | -DEF_HELPER_0(neon_zip_u16, void) | |
| 346 | - | |
| 347 | 341 | DEF_HELPER_2(neon_min_f32, i32, i32, i32) |
| 348 | 342 | DEF_HELPER_2(neon_max_f32, i32, i32, i32) |
| 349 | 343 | DEF_HELPER_2(neon_abd_f32, i32, i32, i32) | ... | ... |
target-arm/op_helper.c
| ... | ... | @@ -495,61 +495,3 @@ uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2) |
| 495 | 495 | } |
| 496 | 496 | return res; |
| 497 | 497 | } |
| 498 | - | |
| 499 | -/* These need to return a pair of value, so still use T0/T1. */ | |
| 500 | -/* Transpose. Argument order is rather strange to avoid special casing | |
| 501 | - the tranlation code. | |
| 502 | - On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */ | |
| 503 | -void HELPER(neon_trn_u8)(void) | |
| 504 | -{ | |
| 505 | - uint32_t rd; | |
| 506 | - uint32_t rm; | |
| 507 | - rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff); | |
| 508 | - rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00); | |
| 509 | - T0 = rd; | |
| 510 | - T1 = rm; | |
| 511 | -} | |
| 512 | - | |
| 513 | -void HELPER(neon_trn_u16)(void) | |
| 514 | -{ | |
| 515 | - uint32_t rd; | |
| 516 | - uint32_t rm; | |
| 517 | - rd = (T0 << 16) | (T1 & 0xffff); | |
| 518 | - rm = (T1 >> 16) | (T0 & 0xffff0000); | |
| 519 | - T0 = rd; | |
| 520 | - T1 = rm; | |
| 521 | -} | |
| 522 | - | |
| 523 | -/* Worker routines for zip and unzip. */ | |
| 524 | -void HELPER(neon_unzip_u8)(void) | |
| 525 | -{ | |
| 526 | - uint32_t rd; | |
| 527 | - uint32_t rm; | |
| 528 | - rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00) | |
| 529 | - | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000); | |
| 530 | - rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00) | |
| 531 | - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000); | |
| 532 | - T0 = rd; | |
| 533 | - T1 = rm; | |
| 534 | -} | |
| 535 | - | |
| 536 | -void HELPER(neon_zip_u8)(void) | |
| 537 | -{ | |
| 538 | - uint32_t rd; | |
| 539 | - uint32_t rm; | |
| 540 | - rd = (T0 & 0xff) | ((T1 << 8) & 0xff00) | |
| 541 | - | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000); | |
| 542 | - rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00) | |
| 543 | - | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000); | |
| 544 | - T0 = rd; | |
| 545 | - T1 = rm; | |
| 546 | -} | |
| 547 | - | |
| 548 | -void HELPER(neon_zip_u16)(void) | |
| 549 | -{ | |
| 550 | - uint32_t tmp; | |
| 551 | - | |
| 552 | - tmp = (T0 & 0xffff) | (T1 << 16); | |
| 553 | - T1 = (T1 & 0xffff0000) | (T0 >> 16); | |
| 554 | - T0 = tmp; | |
| 555 | -} | ... | ... |
target-arm/translate.c
| ... | ... | @@ -3630,24 +3630,157 @@ static inline void gen_neon_get_scalar(int size, int reg) |
| 3630 | 3630 | } |
| 3631 | 3631 | } |
| 3632 | 3632 | |
| 3633 | +static void gen_neon_unzip_u8(TCGv t0, TCGv t1) | |
| 3634 | +{ | |
| 3635 | + TCGv rd, rm, tmp; | |
| 3636 | + | |
| 3637 | + rd = new_tmp(); | |
| 3638 | + rm = new_tmp(); | |
| 3639 | + tmp = new_tmp(); | |
| 3640 | + | |
| 3641 | + tcg_gen_andi_i32(rd, t0, 0xff); | |
| 3642 | + tcg_gen_shri_i32(tmp, t0, 8); | |
| 3643 | + tcg_gen_andi_i32(tmp, tmp, 0xff00); | |
| 3644 | + tcg_gen_or_i32(rd, rd, tmp); | |
| 3645 | + tcg_gen_shli_i32(tmp, t1, 16); | |
| 3646 | + tcg_gen_andi_i32(tmp, tmp, 0xff0000); | |
| 3647 | + tcg_gen_or_i32(rd, rd, tmp); | |
| 3648 | + tcg_gen_shli_i32(tmp, t1, 8); | |
| 3649 | + tcg_gen_andi_i32(tmp, tmp, 0xff000000); | |
| 3650 | + tcg_gen_or_i32(rd, rd, tmp); | |
| 3651 | + | |
| 3652 | + tcg_gen_shri_i32(rm, t0, 8); | |
| 3653 | + tcg_gen_andi_i32(rm, rm, 0xff); | |
| 3654 | + tcg_gen_shri_i32(tmp, t0, 16); | |
| 3655 | + tcg_gen_andi_i32(tmp, tmp, 0xff00); | |
| 3656 | + tcg_gen_or_i32(rm, rm, tmp); | |
| 3657 | + tcg_gen_shli_i32(tmp, t1, 8); | |
| 3658 | + tcg_gen_andi_i32(tmp, tmp, 0xff0000); | |
| 3659 | + tcg_gen_or_i32(rm, rm, tmp); | |
| 3660 | + tcg_gen_andi_i32(tmp, t1, 0xff000000); | |
| 3661 | + tcg_gen_or_i32(t1, rm, tmp); | |
| 3662 | + tcg_gen_mov_i32(t0, rd); | |
| 3663 | + | |
| 3664 | + dead_tmp(tmp); | |
| 3665 | + dead_tmp(rm); | |
| 3666 | + dead_tmp(rd); | |
| 3667 | +} | |
| 3668 | + | |
| 3669 | +static void gen_neon_zip_u8(TCGv t0, TCGv t1) | |
| 3670 | +{ | |
| 3671 | + TCGv rd, rm, tmp; | |
| 3672 | + | |
| 3673 | + rd = new_tmp(); | |
| 3674 | + rm = new_tmp(); | |
| 3675 | + tmp = new_tmp(); | |
| 3676 | + | |
| 3677 | + tcg_gen_andi_i32(rd, t0, 0xff); | |
| 3678 | + tcg_gen_shli_i32(tmp, t1, 8); | |
| 3679 | + tcg_gen_andi_i32(tmp, tmp, 0xff00); | |
| 3680 | + tcg_gen_or_i32(rd, rd, tmp); | |
| 3681 | + tcg_gen_shli_i32(tmp, t0, 16); | |
| 3682 | + tcg_gen_andi_i32(tmp, tmp, 0xff0000); | |
| 3683 | + tcg_gen_or_i32(rd, rd, tmp); | |
| 3684 | + tcg_gen_shli_i32(tmp, t1, 24); | |
| 3685 | + tcg_gen_andi_i32(tmp, tmp, 0xff000000); | |
| 3686 | + tcg_gen_or_i32(rd, rd, tmp); | |
| 3687 | + | |
| 3688 | + tcg_gen_andi_i32(rm, t1, 0xff000000); | |
| 3689 | + tcg_gen_shri_i32(tmp, t0, 8); | |
| 3690 | + tcg_gen_andi_i32(tmp, tmp, 0xff0000); | |
| 3691 | + tcg_gen_or_i32(rm, rm, tmp); | |
| 3692 | + tcg_gen_shri_i32(tmp, t1, 8); | |
| 3693 | + tcg_gen_andi_i32(tmp, tmp, 0xff00); | |
| 3694 | + tcg_gen_or_i32(rm, rm, tmp); | |
| 3695 | + tcg_gen_shri_i32(tmp, t0, 16); | |
| 3696 | + tcg_gen_andi_i32(tmp, tmp, 0xff); | |
| 3697 | + tcg_gen_or_i32(t1, rm, tmp); | |
| 3698 | + tcg_gen_mov_i32(t0, rd); | |
| 3699 | + | |
| 3700 | + dead_tmp(tmp); | |
| 3701 | + dead_tmp(rm); | |
| 3702 | + dead_tmp(rd); | |
| 3703 | +} | |
| 3704 | + | |
| 3705 | +static void gen_neon_zip_u16(TCGv t0, TCGv t1) | |
| 3706 | +{ | |
| 3707 | + TCGv tmp, tmp2; | |
| 3708 | + | |
| 3709 | + tmp = new_tmp(); | |
| 3710 | + tmp2 = new_tmp(); | |
| 3711 | + | |
| 3712 | + tcg_gen_andi_i32(tmp, t0, 0xffff); | |
| 3713 | + tcg_gen_shli_i32(tmp2, t1, 16); | |
| 3714 | + tcg_gen_or_i32(tmp, tmp, tmp2); | |
| 3715 | + tcg_gen_andi_i32(t1, t1, 0xffff0000); | |
| 3716 | + tcg_gen_shri_i32(tmp2, t0, 16); | |
| 3717 | + tcg_gen_or_i32(t1, t1, tmp2); | |
| 3718 | + tcg_gen_mov_i32(t0, tmp); | |
| 3719 | + | |
| 3720 | + dead_tmp(tmp2); | |
| 3721 | + dead_tmp(tmp); | |
| 3722 | +} | |
| 3723 | + | |
| 3633 | 3724 | static void gen_neon_unzip(int reg, int q, int tmp, int size) |
| 3634 | 3725 | { |
| 3635 | 3726 | int n; |
| 3636 | - | |
| 3727 | + | |
| 3637 | 3728 | for (n = 0; n < q + 1; n += 2) { |
| 3638 | 3729 | NEON_GET_REG(T0, reg, n); |
| 3639 | - NEON_GET_REG(T0, reg, n + n); | |
| 3730 | + NEON_GET_REG(T1, reg, n + 1); | |
| 3640 | 3731 | switch (size) { |
| 3641 | - case 0: gen_helper_neon_unzip_u8(); break; | |
| 3642 | - case 1: gen_helper_neon_zip_u16(); break; /* zip and unzip are the same. */ | |
| 3732 | + case 0: gen_neon_unzip_u8(cpu_T[0], cpu_T[1]); break; | |
| 3733 | + case 1: gen_neon_zip_u16(cpu_T[0], cpu_T[1]); break; /* zip and unzip are the same. */ | |
| 3643 | 3734 | case 2: /* no-op */; break; |
| 3644 | 3735 | default: abort(); |
| 3645 | 3736 | } |
| 3646 | - gen_neon_movl_scratch_T0(tmp + n); | |
| 3647 | - gen_neon_movl_scratch_T1(tmp + n + 1); | |
| 3737 | + gen_neon_movl_T0_scratch(tmp + n); | |
| 3738 | + gen_neon_movl_T1_scratch(tmp + n + 1); | |
| 3648 | 3739 | } |
| 3649 | 3740 | } |
| 3650 | 3741 | |
| 3742 | +static void gen_neon_trn_u8(TCGv t0, TCGv t1) | |
| 3743 | +{ | |
| 3744 | + TCGv rd, tmp; | |
| 3745 | + | |
| 3746 | + rd = new_tmp(); | |
| 3747 | + tmp = new_tmp(); | |
| 3748 | + | |
| 3749 | + tcg_gen_shli_i32(rd, t0, 8); | |
| 3750 | + tcg_gen_andi_i32(rd, rd, 0xff00ff00); | |
| 3751 | + tcg_gen_andi_i32(tmp, t1, 0x00ff00ff); | |
| 3752 | + tcg_gen_or_i32(rd, rd, tmp); | |
| 3753 | + | |
| 3754 | + tcg_gen_shri_i32(t1, t1, 8); | |
| 3755 | + tcg_gen_andi_i32(t1, t1, 0x00ff00ff); | |
| 3756 | + tcg_gen_andi_i32(tmp, t0, 0xff00ff00); | |
| 3757 | + tcg_gen_or_i32(t1, t1, tmp); | |
| 3758 | + tcg_gen_mov_i32(t0, rd); | |
| 3759 | + | |
| 3760 | + dead_tmp(tmp); | |
| 3761 | + dead_tmp(rd); | |
| 3762 | +} | |
| 3763 | + | |
| 3764 | +static void gen_neon_trn_u16(TCGv t0, TCGv t1) | |
| 3765 | +{ | |
| 3766 | + TCGv rd, tmp; | |
| 3767 | + | |
| 3768 | + rd = new_tmp(); | |
| 3769 | + tmp = new_tmp(); | |
| 3770 | + | |
| 3771 | + tcg_gen_shli_i32(rd, t0, 16); | |
| 3772 | + tcg_gen_andi_i32(tmp, t1, 0xffff); | |
| 3773 | + tcg_gen_or_i32(rd, rd, tmp); | |
| 3774 | + tcg_gen_shri_i32(t1, t1, 16); | |
| 3775 | + tcg_gen_andi_i32(tmp, t0, 0xffff0000); | |
| 3776 | + tcg_gen_or_i32(t1, t1, tmp); | |
| 3777 | + tcg_gen_mov_i32(t0, rd); | |
| 3778 | + | |
| 3779 | + dead_tmp(tmp); | |
| 3780 | + dead_tmp(rd); | |
| 3781 | +} | |
| 3782 | + | |
| 3783 | + | |
| 3651 | 3784 | static struct { |
| 3652 | 3785 | int nregs; |
| 3653 | 3786 | int interleave; |
| ... | ... | @@ -5259,8 +5392,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) |
| 5259 | 5392 | NEON_GET_REG(T0, rd, n); |
| 5260 | 5393 | NEON_GET_REG(T1, rd, n); |
| 5261 | 5394 | switch (size) { |
| 5262 | - case 0: gen_helper_neon_zip_u8(); break; | |
| 5263 | - case 1: gen_helper_neon_zip_u16(); break; | |
| 5395 | + case 0: gen_neon_zip_u8(cpu_T[0], cpu_T[1]); break; | |
| 5396 | + case 1: gen_neon_zip_u16(cpu_T[0], cpu_T[1]); break; | |
| 5264 | 5397 | case 2: /* no-op */; break; |
| 5265 | 5398 | default: abort(); |
| 5266 | 5399 | } |
| ... | ... | @@ -5445,8 +5578,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) |
| 5445 | 5578 | case 33: /* VTRN */ |
| 5446 | 5579 | NEON_GET_REG(T1, rd, pass); |
| 5447 | 5580 | switch (size) { |
| 5448 | - case 0: gen_helper_neon_trn_u8(); break; | |
| 5449 | - case 1: gen_helper_neon_trn_u16(); break; | |
| 5581 | + case 0: gen_neon_trn_u8(cpu_T[0], cpu_T[1]); break; | |
| 5582 | + case 1: gen_neon_trn_u16(cpu_T[0], cpu_T[1]); break; | |
| 5450 | 5583 | case 2: abort(); |
| 5451 | 5584 | default: return 1; |
| 5452 | 5585 | } | ... | ... |