Commit 4242b1bd8acc19aaaacffdaad4ac23213d72a72b

Authored by balrog
1 parent bb332cb2

Implement x86 SSSE3 instructions.


git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@5315 c046a42c-6fe2-441c-8c8c-71466251a162
target-i386/ops_sse.h
1 1 /*
2   - * MMX/3DNow!/SSE/SSE2/SSE3/PNI support
  2 + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support
3 3 *
4 4 * Copyright (c) 2005 Fabrice Bellard
5 5 *
... ... @@ -1275,6 +1275,151 @@ void helper_pswapd(MMXReg *d, MMXReg *s)
1275 1275 }
1276 1276 #endif
1277 1277  
  1278 +/* SSSE3 op helpers */
  1279 +void glue(helper_pshufb, SUFFIX) (Reg *d, Reg *s)
  1280 +{
  1281 + int i;
  1282 + Reg r;
  1283 +
  1284 + for (i = 0; i < (8 << SHIFT); i++)
  1285 + r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1)));
  1286 +
  1287 + *d = r;
  1288 +}
  1289 +
  1290 +void glue(helper_phaddw, SUFFIX) (Reg *d, Reg *s)
  1291 +{
  1292 + d->W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
  1293 + d->W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
  1294 + XMM_ONLY(d->W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
  1295 + XMM_ONLY(d->W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
  1296 + d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
  1297 + d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
  1298 + XMM_ONLY(d->W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
  1299 + XMM_ONLY(d->W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
  1300 +}
  1301 +
  1302 +void glue(helper_phaddd, SUFFIX) (Reg *d, Reg *s)
  1303 +{
  1304 + d->L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
  1305 + XMM_ONLY(d->L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
  1306 + d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
  1307 + XMM_ONLY(d->L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
  1308 +}
  1309 +
  1310 +void glue(helper_phaddsw, SUFFIX) (Reg *d, Reg *s)
  1311 +{
  1312 + d->W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
  1313 + d->W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
  1314 + XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
  1315 + XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
  1316 + d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
  1317 + d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
  1318 + XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
  1319 + XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
  1320 +}
  1321 +
  1322 +void glue(helper_pmaddubsw, SUFFIX) (Reg *d, Reg *s)
  1323 +{
  1324 + d->W(0) = satsw((int8_t)s->B( 0) * (uint8_t)d->B( 0) +
  1325 + (int8_t)s->B( 1) * (uint8_t)d->B( 1));
  1326 + d->W(1) = satsw((int8_t)s->B( 2) * (uint8_t)d->B( 2) +
  1327 + (int8_t)s->B( 3) * (uint8_t)d->B( 3));
  1328 + d->W(2) = satsw((int8_t)s->B( 4) * (uint8_t)d->B( 4) +
  1329 + (int8_t)s->B( 5) * (uint8_t)d->B( 5));
  1330 + d->W(3) = satsw((int8_t)s->B( 6) * (uint8_t)d->B( 6) +
  1331 + (int8_t)s->B( 7) * (uint8_t)d->B( 7));
  1332 +#if SHIFT == 1
  1333 + d->W(4) = satsw((int8_t)s->B( 8) * (uint8_t)d->B( 8) +
  1334 + (int8_t)s->B( 9) * (uint8_t)d->B( 9));
  1335 + d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) +
  1336 + (int8_t)s->B(11) * (uint8_t)d->B(11));
  1337 + d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) +
  1338 + (int8_t)s->B(13) * (uint8_t)d->B(13));
  1339 + d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) +
  1340 + (int8_t)s->B(15) * (uint8_t)d->B(15));
  1341 +#endif
  1342 +}
  1343 +
  1344 +void glue(helper_phsubw, SUFFIX) (Reg *d, Reg *s)
  1345 +{
  1346 + d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
  1347 + d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
  1348 + XMM_ONLY(d->W(2) = (int16_t)d->W(4) - (int16_t)d->W(5));
  1349 + XMM_ONLY(d->W(3) = (int16_t)d->W(6) - (int16_t)d->W(7));
  1350 + d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1);
  1351 + d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3);
  1352 + XMM_ONLY(d->W(6) = (int16_t)s->W(4) - (int16_t)s->W(5));
  1353 + XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));
  1354 +}
  1355 +
  1356 +void glue(helper_phsubd, SUFFIX) (Reg *d, Reg *s)
  1357 +{
  1358 + d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1);
  1359 + XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3));
  1360 + d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1);
  1361 + XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3));
  1362 +}
  1363 +
  1364 +void glue(helper_phsubsw, SUFFIX) (Reg *d, Reg *s)
  1365 +{
  1366 + d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
  1367 + d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
  1368 + XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
  1369 + XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7)));
  1370 + d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1));
  1371 + d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3));
  1372 + XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5)));
  1373 + XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7)));
  1374 +}
  1375 +
  1376 +#define FABSB(_, x) x > INT8_MAX ? -(int8_t ) x : x
  1377 +#define FABSW(_, x) x > INT16_MAX ? -(int16_t) x : x
  1378 +#define FABSL(_, x) x > INT32_MAX ? -(int32_t) x : x
  1379 +SSE_HELPER_B(helper_pabsb, FABSB)
  1380 +SSE_HELPER_W(helper_pabsw, FABSW)
  1381 +SSE_HELPER_L(helper_pabsd, FABSL)
  1382 +
  1383 +#define FMULHRSW(d, s) ((int16_t) d * (int16_t) s + 0x4000) >> 15
  1384 +SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
  1385 +
  1386 +#define FSIGNB(d, s) s <= INT8_MAX ? s ? d : 0 : -(int8_t ) d
  1387 +#define FSIGNW(d, s) s <= INT16_MAX ? s ? d : 0 : -(int16_t) d
  1388 +#define FSIGNL(d, s) s <= INT32_MAX ? s ? d : 0 : -(int32_t) d
  1389 +SSE_HELPER_B(helper_psignb, FSIGNB)
  1390 +SSE_HELPER_W(helper_psignw, FSIGNW)
  1391 +SSE_HELPER_L(helper_psignd, FSIGNL)
  1392 +
  1393 +void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift)
  1394 +{
  1395 + Reg r;
  1396 +
  1397 + /* XXX could be checked during translation */
  1398 + if (shift >= (16 << SHIFT)) {
  1399 + r.Q(0) = 0;
  1400 + XMM_ONLY(r.Q(1) = 0);
  1401 + } else {
  1402 + shift <<= 3;
  1403 +#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
  1404 +#if SHIFT == 0
  1405 + r.Q(0) = SHR(s->Q(0), shift - 0) |
  1406 + SHR(d->Q(0), shift - 64);
  1407 +#else
  1408 + r.Q(0) = SHR(s->Q(0), shift - 0) |
  1409 + SHR(s->Q(1), shift - 64) |
  1410 + SHR(d->Q(0), shift - 128) |
  1411 + SHR(d->Q(1), shift - 192);
  1412 + r.Q(1) = SHR(s->Q(0), shift + 64) |
  1413 + SHR(s->Q(1), shift - 0) |
  1414 + SHR(d->Q(0), shift - 64) |
  1415 + SHR(d->Q(1), shift - 128);
  1416 +#endif
  1417 +#undef SHR
  1418 + }
  1419 +
  1420 + *d = r;
  1421 +}
  1422 +
1278 1423 #undef SHIFT
1279 1424 #undef XMM_ONLY
1280 1425 #undef Reg
... ...
target-i386/ops_sse_header.h
1 1 /*
2   - * MMX/3DNow!/SSE/SSE2/SSE3/PNI support
  2 + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support
3 3 *
4 4 * Copyright (c) 2005 Fabrice Bellard
5 5 *
... ... @@ -251,6 +251,24 @@ DEF_HELPER(void, helper_pfsubr, (MMXReg *d, MMXReg *s))
251 251 DEF_HELPER(void, helper_pswapd, (MMXReg *d, MMXReg *s))
252 252 #endif
253 253  
  254 +/* SSSE3 op helpers */
  255 +DEF_HELPER(void, glue(helper_phaddw, SUFFIX), (Reg *d, Reg *s))
  256 +DEF_HELPER(void, glue(helper_phaddd, SUFFIX), (Reg *d, Reg *s))
  257 +DEF_HELPER(void, glue(helper_phaddsw, SUFFIX), (Reg *d, Reg *s))
  258 +DEF_HELPER(void, glue(helper_phsubw, SUFFIX), (Reg *d, Reg *s))
  259 +DEF_HELPER(void, glue(helper_phsubd, SUFFIX), (Reg *d, Reg *s))
  260 +DEF_HELPER(void, glue(helper_phsubsw, SUFFIX), (Reg *d, Reg *s))
  261 +DEF_HELPER(void, glue(helper_pabsb, SUFFIX), (Reg *d, Reg *s))
  262 +DEF_HELPER(void, glue(helper_pabsw, SUFFIX), (Reg *d, Reg *s))
  263 +DEF_HELPER(void, glue(helper_pabsd, SUFFIX), (Reg *d, Reg *s))
  264 +DEF_HELPER(void, glue(helper_pmaddubsw, SUFFIX), (Reg *d, Reg *s))
  265 +DEF_HELPER(void, glue(helper_pmulhrsw, SUFFIX), (Reg *d, Reg *s))
  266 +DEF_HELPER(void, glue(helper_pshufb, SUFFIX), (Reg *d, Reg *s))
  267 +DEF_HELPER(void, glue(helper_psignb, SUFFIX), (Reg *d, Reg *s))
  268 +DEF_HELPER(void, glue(helper_psignw, SUFFIX), (Reg *d, Reg *s))
  269 +DEF_HELPER(void, glue(helper_psignd, SUFFIX), (Reg *d, Reg *s))
  270 +DEF_HELPER(void, glue(helper_palignr, SUFFIX), (Reg *d, Reg *s, int32_t shift))
  271 +
254 272 #undef SHIFT
255 273 #undef Reg
256 274 #undef SUFFIX
... ...
target-i386/translate.c
... ... @@ -2770,6 +2770,9 @@ static void *sse_op_table1[256][4] = {
2770 2770 [0xc2] = SSE_FOP(cmpeq),
2771 2771 [0xc6] = { helper_shufps, helper_shufpd },
2772 2772  
  2773 + [0x38] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3 */
  2774 + [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3 */
  2775 +
2773 2776 /* MMX ops and their SSE extensions */
2774 2777 [0x60] = MMX_OP2(punpcklbw),
2775 2778 [0x61] = MMX_OP2(punpcklwd),
... ... @@ -2921,6 +2924,28 @@ static void *sse_op_table5[256] = {
2921 2924 [0xbf] = helper_pavgb_mmx /* pavgusb */
2922 2925 };
2923 2926  
  2927 +static void *sse_op_table6[256][2] = {
  2928 + [0x00] = MMX_OP2(pshufb),
  2929 + [0x01] = MMX_OP2(phaddw),
  2930 + [0x02] = MMX_OP2(phaddd),
  2931 + [0x03] = MMX_OP2(phaddsw),
  2932 + [0x04] = MMX_OP2(pmaddubsw),
  2933 + [0x05] = MMX_OP2(phsubw),
  2934 + [0x06] = MMX_OP2(phsubd),
  2935 + [0x07] = MMX_OP2(phsubsw),
  2936 + [0x08] = MMX_OP2(psignb),
  2937 + [0x09] = MMX_OP2(psignw),
  2938 + [0x0a] = MMX_OP2(psignd),
  2939 + [0x0b] = MMX_OP2(pmulhrsw),
  2940 + [0x1c] = MMX_OP2(pabsb),
  2941 + [0x1d] = MMX_OP2(pabsw),
  2942 + [0x1e] = MMX_OP2(pabsd),
  2943 +};
  2944 +
  2945 +static void *sse_op_table7[256][2] = {
  2946 + [0x0f] = MMX_OP2(palignr),
  2947 +};
  2948 +
2924 2949 static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
2925 2950 {
2926 2951 int b1, op1_offset, op2_offset, is_xmm, val, ot;
... ... @@ -2960,7 +2985,8 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
2960 2985 return;
2961 2986 }
2962 2987 if (is_xmm && !(s->flags & HF_OSFXSR_MASK))
2963   - goto illegal_op;
  2988 + if ((b != 0x38 && b != 0x3a) || (s->prefix & PREFIX_DATA))
  2989 + goto illegal_op;
2964 2990 if (b == 0x0e) {
2965 2991 if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW))
2966 2992 goto illegal_op;
... ... @@ -3482,6 +3508,84 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
3482 3508 reg = ((modrm >> 3) & 7) | rex_r;
3483 3509 gen_op_mov_reg_T0(OT_LONG, reg);
3484 3510 break;
  3511 + case 0x038:
  3512 + case 0x138:
  3513 + if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3))
  3514 + goto illegal_op;
  3515 +
  3516 + b = modrm;
  3517 + modrm = ldub_code(s->pc++);
  3518 + rm = modrm & 7;
  3519 + reg = ((modrm >> 3) & 7) | rex_r;
  3520 + mod = (modrm >> 6) & 3;
  3521 +
  3522 + sse_op2 = sse_op_table6[b][b1];
  3523 + if (!sse_op2)
  3524 + goto illegal_op;
  3525 +
  3526 + if (b1) {
  3527 + op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
  3528 + if (mod == 3) {
  3529 + op2_offset = offsetof(CPUX86State,xmm_regs[rm | REX_B(s)]);
  3530 + } else {
  3531 + op2_offset = offsetof(CPUX86State,xmm_t0);
  3532 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  3533 + gen_ldo_env_A0(s->mem_index, op2_offset);
  3534 + }
  3535 + } else {
  3536 + op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
  3537 + if (mod == 3) {
  3538 + op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
  3539 + } else {
  3540 + op2_offset = offsetof(CPUX86State,mmx_t0);
  3541 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  3542 + gen_ldq_env_A0(s->mem_index, op2_offset);
  3543 + }
  3544 + }
  3545 + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
  3546 + tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
  3547 + tcg_gen_helper_0_2(sse_op2, cpu_ptr0, cpu_ptr1);
  3548 + break;
  3549 + case 0x03a:
  3550 + case 0x13a:
  3551 + if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3))
  3552 + goto illegal_op;
  3553 +
  3554 + b = modrm;
  3555 + modrm = ldub_code(s->pc++);
  3556 + rm = modrm & 7;
  3557 + reg = ((modrm >> 3) & 7) | rex_r;
  3558 + mod = (modrm >> 6) & 3;
  3559 +
  3560 + sse_op2 = sse_op_table7[b][b1];
  3561 + if (!sse_op2)
  3562 + goto illegal_op;
  3563 +
  3564 + if (b1) {
  3565 + op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
  3566 + if (mod == 3) {
  3567 + op2_offset = offsetof(CPUX86State,xmm_regs[rm | REX_B(s)]);
  3568 + } else {
  3569 + op2_offset = offsetof(CPUX86State,xmm_t0);
  3570 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  3571 + gen_ldo_env_A0(s->mem_index, op2_offset);
  3572 + }
  3573 + } else {
  3574 + op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
  3575 + if (mod == 3) {
  3576 + op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
  3577 + } else {
  3578 + op2_offset = offsetof(CPUX86State,mmx_t0);
  3579 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  3580 + gen_ldq_env_A0(s->mem_index, op2_offset);
  3581 + }
  3582 + }
  3583 + val = ldub_code(s->pc++);
  3584 +
  3585 + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
  3586 + tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
  3587 + tcg_gen_helper_0_3(sse_op2, cpu_ptr0, cpu_ptr1, tcg_const_i32(val));
  3588 + break;
3485 3589 default:
3486 3590 goto illegal_op;
3487 3591 }
... ... @@ -6987,7 +7091,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
6987 7091 gen_eob(s);
6988 7092 }
6989 7093 break;
6990   - /* MMX/3DNow!/SSE/SSE2/SSE3 support */
  7094 + /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3 support */
6991 7095 case 0x1c3: /* MOVNTI reg, mem */
6992 7096 if (!(s->cpuid_features & CPUID_SSE2))
6993 7097 goto illegal_op;
... ... @@ -7100,6 +7204,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
7100 7204 s->prefix &= ~(PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA);
7101 7205 case 0x110 ... 0x117:
7102 7206 case 0x128 ... 0x12f:
  7207 + case 0x138 ... 0x13a:
7103 7208 case 0x150 ... 0x177:
7104 7209 case 0x17c ... 0x17f:
7105 7210 case 0x1c2:
... ...