Commit 2eb323c7b6da4c1e56ab2443b15ae9cb00460b06
1 parent ecbfbbf4
Convert NEON VZIP/VUZP/VTRN helper functions to pure TCG
The neon_trn_u8, neon_trn_u16, neon_unzip_u8, neon_zip_u8 and neon_zip_u16 helpers used the fixed registers T0/T1 to return their pair of result values. This patch replaces the helpers with TCG code generated directly in the translator, so T0/T1 are no longer touched by helper functions. Bugs in the gen_neon_unzip register load code were also fixed.

Signed-off-by: Filip Navara <filip.navara@gmail.com>
Showing 3 changed files with 143 additions and 74 deletions
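As a quick check on the bit manipulation being moved into the translator, the halfword transpose performed by the removed neon_trn_u16 helper (and reproduced by the new gen_neon_trn_u16 TCG sequence below) can be modelled in plain C. This is a standalone illustrative sketch, not part of the patch; the name neon_trn_u16_model and the test values are invented for this example.

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the 16-bit transpose step: on entry t0 holds rm and t1
   holds rd; on exit t0 holds the new rd and t1 the new rm, matching the
   argument-order comment on the removed helper in op_helper.c. */
static void neon_trn_u16_model(uint32_t *t0, uint32_t *t1)
{
    uint32_t rd = (*t0 << 16) | (*t1 & 0xffff);
    uint32_t rm = (*t1 >> 16) | (*t0 & 0xffff0000);
    *t0 = rd;
    *t1 = rm;
}

int main(void)
{
    uint32_t t0 = 0x11112222, t1 = 0x33334444;
    neon_trn_u16_model(&t0, &t1);
    /* Prints "22224444 11113333": the two low halfwords end up in t0 and
       the two high halfwords in t1. */
    printf("%08x %08x\n", t0, t1);
    return 0;
}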
target-arm/helpers.h
@@ -338,12 +338,6 @@ DEF_HELPER_2(neon_qneg_s8, i32, env, i32)
 DEF_HELPER_2(neon_qneg_s16, i32, env, i32)
 DEF_HELPER_2(neon_qneg_s32, i32, env, i32)
 
-DEF_HELPER_0(neon_trn_u8, void)
-DEF_HELPER_0(neon_trn_u16, void)
-DEF_HELPER_0(neon_unzip_u8, void)
-DEF_HELPER_0(neon_zip_u8, void)
-DEF_HELPER_0(neon_zip_u16, void)
-
 DEF_HELPER_2(neon_min_f32, i32, i32, i32)
 DEF_HELPER_2(neon_max_f32, i32, i32, i32)
 DEF_HELPER_2(neon_abd_f32, i32, i32, i32)
target-arm/op_helper.c
@@ -495,61 +495,3 @@ uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2)
     }
     return res;
 }
-
-/* These need to return a pair of value, so still use T0/T1. */
-/* Transpose. Argument order is rather strange to avoid special casing
-   the tranlation code.
-   On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */
-void HELPER(neon_trn_u8)(void)
-{
-    uint32_t rd;
-    uint32_t rm;
-    rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
-    rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
-    T0 = rd;
-    T1 = rm;
-}
-
-void HELPER(neon_trn_u16)(void)
-{
-    uint32_t rd;
-    uint32_t rm;
-    rd = (T0 << 16) | (T1 & 0xffff);
-    rm = (T1 >> 16) | (T0 & 0xffff0000);
-    T0 = rd;
-    T1 = rm;
-}
-
-/* Worker routines for zip and unzip. */
-void HELPER(neon_unzip_u8)(void)
-{
-    uint32_t rd;
-    uint32_t rm;
-    rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
-         | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
-    rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
-         | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
-    T0 = rd;
-    T1 = rm;
-}
-
-void HELPER(neon_zip_u8)(void)
-{
-    uint32_t rd;
-    uint32_t rm;
-    rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
-         | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
-    rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
-         | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
-    T0 = rd;
-    T1 = rm;
-}
-
-void HELPER(neon_zip_u16)(void)
-{
-    uint32_t tmp;
-
-    tmp = (T0 & 0xffff) | (T1 << 16);
-    T1 = (T1 & 0xffff0000) | (T0 >> 16);
-    T0 = tmp;
-}
target-arm/translate.c
@@ -3630,24 +3630,157 @@ static inline void gen_neon_get_scalar(int size, int reg)
     }
 }
 
+static void gen_neon_unzip_u8(TCGv t0, TCGv t1)
+{
+    TCGv rd, rm, tmp;
+
+    rd = new_tmp();
+    rm = new_tmp();
+    tmp = new_tmp();
+
+    tcg_gen_andi_i32(rd, t0, 0xff);
+    tcg_gen_shri_i32(tmp, t0, 8);
+    tcg_gen_andi_i32(tmp, tmp, 0xff00);
+    tcg_gen_or_i32(rd, rd, tmp);
+    tcg_gen_shli_i32(tmp, t1, 16);
+    tcg_gen_andi_i32(tmp, tmp, 0xff0000);
+    tcg_gen_or_i32(rd, rd, tmp);
+    tcg_gen_shli_i32(tmp, t1, 8);
+    tcg_gen_andi_i32(tmp, tmp, 0xff000000);
+    tcg_gen_or_i32(rd, rd, tmp);
+
+    tcg_gen_shri_i32(rm, t0, 8);
+    tcg_gen_andi_i32(rm, rm, 0xff);
+    tcg_gen_shri_i32(tmp, t0, 16);
+    tcg_gen_andi_i32(tmp, tmp, 0xff00);
+    tcg_gen_or_i32(rm, rm, tmp);
+    tcg_gen_shli_i32(tmp, t1, 8);
+    tcg_gen_andi_i32(tmp, tmp, 0xff0000);
+    tcg_gen_or_i32(rm, rm, tmp);
+    tcg_gen_andi_i32(tmp, t1, 0xff000000);
+    tcg_gen_or_i32(t1, rm, tmp);
+    tcg_gen_mov_i32(t0, rd);
+
+    dead_tmp(tmp);
+    dead_tmp(rm);
+    dead_tmp(rd);
+}
+
+static void gen_neon_zip_u8(TCGv t0, TCGv t1)
+{
+    TCGv rd, rm, tmp;
+
+    rd = new_tmp();
+    rm = new_tmp();
+    tmp = new_tmp();
+
+    tcg_gen_andi_i32(rd, t0, 0xff);
+    tcg_gen_shli_i32(tmp, t1, 8);
+    tcg_gen_andi_i32(tmp, tmp, 0xff00);
+    tcg_gen_or_i32(rd, rd, tmp);
+    tcg_gen_shli_i32(tmp, t0, 16);
+    tcg_gen_andi_i32(tmp, tmp, 0xff0000);
+    tcg_gen_or_i32(rd, rd, tmp);
+    tcg_gen_shli_i32(tmp, t1, 24);
+    tcg_gen_andi_i32(tmp, tmp, 0xff000000);
+    tcg_gen_or_i32(rd, rd, tmp);
+
+    tcg_gen_andi_i32(rm, t1, 0xff000000);
+    tcg_gen_shri_i32(tmp, t0, 8);
+    tcg_gen_andi_i32(tmp, tmp, 0xff0000);
+    tcg_gen_or_i32(rm, rm, tmp);
+    tcg_gen_shri_i32(tmp, t1, 8);
+    tcg_gen_andi_i32(tmp, tmp, 0xff00);
+    tcg_gen_or_i32(rm, rm, tmp);
+    tcg_gen_shri_i32(tmp, t0, 16);
+    tcg_gen_andi_i32(tmp, tmp, 0xff);
+    tcg_gen_or_i32(t1, rm, tmp);
+    tcg_gen_mov_i32(t0, rd);
+
+    dead_tmp(tmp);
+    dead_tmp(rm);
+    dead_tmp(rd);
+}
+
+static void gen_neon_zip_u16(TCGv t0, TCGv t1)
+{
+    TCGv tmp, tmp2;
+
+    tmp = new_tmp();
+    tmp2 = new_tmp();
+
+    tcg_gen_andi_i32(tmp, t0, 0xffff);
+    tcg_gen_shli_i32(tmp2, t1, 16);
+    tcg_gen_or_i32(tmp, tmp, tmp2);
+    tcg_gen_andi_i32(t1, t1, 0xffff0000);
+    tcg_gen_shri_i32(tmp2, t0, 16);
+    tcg_gen_or_i32(t1, t1, tmp2);
+    tcg_gen_mov_i32(t0, tmp);
+
+    dead_tmp(tmp2);
+    dead_tmp(tmp);
+}
+
 static void gen_neon_unzip(int reg, int q, int tmp, int size)
 {
     int n;
-    
+
     for (n = 0; n < q + 1; n += 2) {
         NEON_GET_REG(T0, reg, n);
-        NEON_GET_REG(T0, reg, n + n);
+        NEON_GET_REG(T1, reg, n + 1);
         switch (size) {
-        case 0: gen_helper_neon_unzip_u8(); break;
-        case 1: gen_helper_neon_zip_u16(); break; /* zip and unzip are the same. */
+        case 0: gen_neon_unzip_u8(cpu_T[0], cpu_T[1]); break;
+        case 1: gen_neon_zip_u16(cpu_T[0], cpu_T[1]); break; /* zip and unzip are the same. */
         case 2: /* no-op */; break;
         default: abort();
         }
-        gen_neon_movl_scratch_T0(tmp + n);
-        gen_neon_movl_scratch_T1(tmp + n + 1);
+        gen_neon_movl_T0_scratch(tmp + n);
+        gen_neon_movl_T1_scratch(tmp + n + 1);
     }
 }
 
+static void gen_neon_trn_u8(TCGv t0, TCGv t1)
+{
+    TCGv rd, tmp;
+
+    rd = new_tmp();
+    tmp = new_tmp();
+
+    tcg_gen_shli_i32(rd, t0, 8);
+    tcg_gen_andi_i32(rd, rd, 0xff00ff00);
+    tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
+    tcg_gen_or_i32(rd, rd, tmp);
+
+    tcg_gen_shri_i32(t1, t1, 8);
+    tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
+    tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
+    tcg_gen_or_i32(t1, t1, tmp);
+    tcg_gen_mov_i32(t0, rd);
+
+    dead_tmp(tmp);
+    dead_tmp(rd);
+}
+
+static void gen_neon_trn_u16(TCGv t0, TCGv t1)
+{
+    TCGv rd, tmp;
+
+    rd = new_tmp();
+    tmp = new_tmp();
+
+    tcg_gen_shli_i32(rd, t0, 16);
+    tcg_gen_andi_i32(tmp, t1, 0xffff);
+    tcg_gen_or_i32(rd, rd, tmp);
+    tcg_gen_shri_i32(t1, t1, 16);
+    tcg_gen_andi_i32(tmp, t0, 0xffff0000);
+    tcg_gen_or_i32(t1, t1, tmp);
+    tcg_gen_mov_i32(t0, rd);
+
+    dead_tmp(tmp);
+    dead_tmp(rd);
+}
+
+
 static struct {
     int nregs;
     int interleave;
@@ -5259,8 +5392,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                 NEON_GET_REG(T0, rd, n);
                 NEON_GET_REG(T1, rd, n);
                 switch (size) {
-                case 0: gen_helper_neon_zip_u8(); break;
-                case 1: gen_helper_neon_zip_u16(); break;
+                case 0: gen_neon_zip_u8(cpu_T[0], cpu_T[1]); break;
+                case 1: gen_neon_zip_u16(cpu_T[0], cpu_T[1]); break;
                 case 2: /* no-op */; break;
                 default: abort();
                 }
@@ -5445,8 +5578,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
             case 33: /* VTRN */
                 NEON_GET_REG(T1, rd, pass);
                 switch (size) {
-                case 0: gen_helper_neon_trn_u8(); break;
-                case 1: gen_helper_neon_trn_u16(); break;
+                case 0: gen_neon_trn_u8(cpu_T[0], cpu_T[1]); break;
+                case 1: gen_neon_trn_u16(cpu_T[0], cpu_T[1]); break;
                 case 2: abort();
                 default: return 1;
                 }
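For the byte-wide case, the interleave computed by the removed neon_zip_u8 helper, which gen_neon_zip_u8 above re-expresses as a TCG op sequence, can be modelled the same way. Again this is only an illustrative standalone sketch; the name neon_zip_u8_model and the test values are not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the byte zip step: the low four bytes of the (t0, t1)
   pair are interleaved into the first result and the high four bytes into
   the second, matching the expressions in the removed op_helper.c code. */
static void neon_zip_u8_model(uint32_t *t0, uint32_t *t1)
{
    uint32_t rd = (*t0 & 0xff) | ((*t1 << 8) & 0xff00)
                | ((*t0 << 16) & 0xff0000) | ((*t1 << 24) & 0xff000000);
    uint32_t rm = ((*t0 >> 16) & 0xff) | ((*t1 >> 8) & 0xff00)
                | ((*t0 >> 8) & 0xff0000) | (*t1 & 0xff000000);
    *t0 = rd;
    *t1 = rm;
}

int main(void)
{
    uint32_t t0 = 0x33221100, t1 = 0x77665544;
    neon_zip_u8_model(&t0, &t1);
    /* Prints "55114400 77336622": bytes 00,44,11,55 (low to high) in t0 and
       22,66,33,77 in t1, i.e. the two inputs interleaved bytewise. */
    printf("%08x %08x\n", t0, t1);
    return 0;
}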