Commit 222a3336ecbe177da082f9ac20f9614d6d23c721
1 parent
06adb549
Implement SSE4.1, SSE4.2 (x86).
This adds support for CPUID_EXT_SSE41, CPUID_EXT_SSE42, CPUID_EXT_POPCNT extensions. Most instructions haven't been tested yet. git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@5411 c046a42c-6fe2-441c-8c8c-71466251a162
Showing
4 changed files
with
992 additions
and
37 deletions
target-i386/ops_sse.h
1 | 1 | /* |
2 | - * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support | |
2 | + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support | |
3 | 3 | * |
4 | 4 | * Copyright (c) 2005 Fabrice Bellard |
5 | + * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com> | |
5 | 6 | * |
6 | 7 | * This library is free software; you can redistribute it and/or |
7 | 8 | * modify it under the terms of the GNU Lesser General Public |
... | ... | @@ -1420,6 +1421,621 @@ void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift) |
1420 | 1421 | *d = r; |
1421 | 1422 | } |
1422 | 1423 | |
1424 | +#define XMM0 env->xmm_regs[0] | |
1425 | + | |
1426 | +#if SHIFT == 1 | |
/*
 * SSE_HELPER_V - define a two-operand helper with XMM0 as implicit mask.
 *
 * For every lane i in [0, num) the destination lane becomes
 * F(d lane, s lane, XMM0 lane), where lanes are accessed through `elem`.
 * Each lane is read and written independently of the others.  `num` is
 * expected to be one of the lane counts used in this file (2, 4 or 16).
 */
#define SSE_HELPER_V(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < (num); lane++) {\
        d->elem(lane) = F(d->elem(lane), s->elem(lane), XMM0.elem(lane));\
    }\
}
1453 | + | |
/*
 * SSE_HELPER_I - define a two-operand helper steered by an immediate.
 *
 * For every lane i in [0, num) the destination lane becomes
 * F(d lane, s lane, bit i of imm), where lanes are accessed through
 * `elem`.  `num` is expected to be one of the lane counts used in this
 * file (2, 4 or 8).
 */
#define SSE_HELPER_I(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s, uint32_t imm)\
{\
    int lane;\
\
    for (lane = 0; lane < (num); lane++) {\
        d->elem(lane) = F(d->elem(lane), s->elem(lane), ((imm >> lane) & 1));\
    }\
}
1480 | + | |
/* SSE4.1 op helpers */
/*
 * Variable-blend selectors: yield the source element `s` when the most
 * significant bit of the mask element `m` is set, otherwise the destination
 * element `d`.  Arguments and the whole expansion are parenthesized so the
 * macros are safe inside larger expressions, and the 64-bit sign-bit
 * constant carries an explicit ULL suffix.
 */
#define FBLENDVB(d, s, m) (((m) & 0x80) ? (s) : (d))
#define FBLENDVPS(d, s, m) (((m) & 0x80000000) ? (s) : (d))
#define FBLENDVPD(d, s, m) (((m) & 0x8000000000000000ULL) ? (s) : (d))
/*
 * Instantiate the XMM0-masked blend helpers: pblendvb over 16 B elements,
 * blendvps over 4 L elements and blendvpd over 2 Q elements.
 */
SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
1488 | + | |
1489 | +void glue(helper_ptest, SUFFIX) (Reg *d, Reg *s) | |
1490 | +{ | |
1491 | + uint64_t zf = (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1)); | |
1492 | + uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1)); | |
1493 | + | |
1494 | + CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C); | |
1495 | +} | |
1496 | + | |
1497 | +#define SSE_HELPER_F(name, elem, num, F)\ | |
1498 | +void glue(name, SUFFIX) (Reg *d, Reg *s)\ | |
1499 | +{\ | |
1500 | + d->elem(0) = F(0);\ | |
1501 | + d->elem(1) = F(1);\ | |
1502 | + d->elem(2) = F(2);\ | |
1503 | + d->elem(3) = F(3);\ | |
1504 | + if (num > 3) {\ | |
1505 | + d->elem(4) = F(4);\ | |
1506 | + d->elem(5) = F(5);\ | |
1507 | + if (num > 5) {\ | |
1508 | + d->elem(6) = F(6);\ | |
1509 | + d->elem(7) = F(7);\ | |
1510 | + }\ | |
1511 | + }\ | |
1512 | +} | |
1513 | + | |
1514 | +SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B) | |
1515 | +SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B) | |
1516 | +SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B) | |
1517 | +SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W) | |
1518 | +SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W) | |
1519 | +SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L) | |
1520 | +SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B) | |
1521 | +SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B) | |
1522 | +SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B) | |
1523 | +SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W) | |
1524 | +SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W) | |
1525 | +SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L) | |
1526 | + | |
1527 | +void glue(helper_pmuldq, SUFFIX) (Reg *d, Reg *s) | |
1528 | +{ | |
1529 | + d->Q(0) = (int64_t) (int32_t) d->L(0) * (int32_t) s->L(0); | |
1530 | + d->Q(1) = (int64_t) (int32_t) d->L(2) * (int32_t) s->L(2); | |
1531 | +} | |
1532 | + | |
1533 | +#define FCMPEQQ(d, s) d == s ? -1 : 0 | |
1534 | +SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ) | |
1535 | + | |
1536 | +void glue(helper_packusdw, SUFFIX) (Reg *d, Reg *s) | |
1537 | +{ | |
1538 | + d->W(0) = satuw((int32_t) d->L(0)); | |
1539 | + d->W(1) = satuw((int32_t) d->L(1)); | |
1540 | + d->W(2) = satuw((int32_t) d->L(2)); | |
1541 | + d->W(3) = satuw((int32_t) d->L(3)); | |
1542 | + d->W(4) = satuw((int32_t) s->L(0)); | |
1543 | + d->W(5) = satuw((int32_t) s->L(1)); | |
1544 | + d->W(6) = satuw((int32_t) s->L(2)); | |
1545 | + d->W(7) = satuw((int32_t) s->L(3)); | |
1546 | +} | |
1547 | + | |
1548 | +#define FMINSB(d, s) MIN((int8_t) d, (int8_t) s) | |
1549 | +#define FMINSD(d, s) MIN((int32_t) d, (int32_t) s) | |
1550 | +#define FMAXSB(d, s) MAX((int8_t) d, (int8_t) s) | |
1551 | +#define FMAXSD(d, s) MAX((int32_t) d, (int32_t) s) | |
1552 | +SSE_HELPER_B(helper_pminsb, FMINSB) | |
1553 | +SSE_HELPER_L(helper_pminsd, FMINSD) | |
1554 | +SSE_HELPER_W(helper_pminuw, MIN) | |
1555 | +SSE_HELPER_L(helper_pminud, MIN) | |
1556 | +SSE_HELPER_B(helper_pmaxsb, FMAXSB) | |
1557 | +SSE_HELPER_L(helper_pmaxsd, FMAXSD) | |
1558 | +SSE_HELPER_W(helper_pmaxuw, MAX) | |
1559 | +SSE_HELPER_L(helper_pmaxud, MAX) | |
1560 | + | |
1561 | +#define FMULLD(d, s) (int32_t) d * (int32_t) s | |
1562 | +SSE_HELPER_L(helper_pmulld, FMULLD) | |
1563 | + | |
1564 | +void glue(helper_phminposuw, SUFFIX) (Reg *d, Reg *s) | |
1565 | +{ | |
1566 | + int idx = 0; | |
1567 | + | |
1568 | + if (s->W(1) < s->W(idx)) | |
1569 | + idx = 1; | |
1570 | + if (s->W(2) < s->W(idx)) | |
1571 | + idx = 2; | |
1572 | + if (s->W(3) < s->W(idx)) | |
1573 | + idx = 3; | |
1574 | + if (s->W(4) < s->W(idx)) | |
1575 | + idx = 4; | |
1576 | + if (s->W(5) < s->W(idx)) | |
1577 | + idx = 5; | |
1578 | + if (s->W(6) < s->W(idx)) | |
1579 | + idx = 6; | |
1580 | + if (s->W(7) < s->W(idx)) | |
1581 | + idx = 7; | |
1582 | + | |
1583 | + d->Q(1) = 0; | |
1584 | + d->L(1) = 0; | |
1585 | + d->W(1) = idx; | |
1586 | + d->W(0) = s->W(idx); | |
1587 | +} | |
1588 | + | |
1589 | +void glue(helper_roundps, SUFFIX) (Reg *d, Reg *s, uint32_t mode) | |
1590 | +{ | |
1591 | + signed char prev_rounding_mode; | |
1592 | + | |
1593 | + prev_rounding_mode = env->sse_status.float_rounding_mode; | |
1594 | + if (!(mode & (1 << 2))) | |
1595 | + switch (mode & 3) { | |
1596 | + case 0: | |
1597 | + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); | |
1598 | + break; | |
1599 | + case 1: | |
1600 | + set_float_rounding_mode(float_round_down, &env->sse_status); | |
1601 | + break; | |
1602 | + case 2: | |
1603 | + set_float_rounding_mode(float_round_up, &env->sse_status); | |
1604 | + break; | |
1605 | + case 3: | |
1606 | + set_float_rounding_mode(float_round_to_zero, &env->sse_status); | |
1607 | + break; | |
1608 | + } | |
1609 | + | |
1610 | + d->L(0) = float64_round_to_int(s->L(0), &env->sse_status); | |
1611 | + d->L(1) = float64_round_to_int(s->L(1), &env->sse_status); | |
1612 | + d->L(2) = float64_round_to_int(s->L(2), &env->sse_status); | |
1613 | + d->L(3) = float64_round_to_int(s->L(3), &env->sse_status); | |
1614 | + | |
1615 | +#if 0 /* TODO */ | |
1616 | + if (mode & (1 << 3)) | |
1617 | + set_float_exception_flags( | |
1618 | + get_float_exception_flags(&env->sse_status) & | |
1619 | + ~float_flag_inexact, | |
1620 | + &env->sse_status); | |
1621 | +#endif | |
1622 | + env->sse_status.float_rounding_mode = prev_rounding_mode; | |
1623 | +} | |
1624 | + | |
1625 | +void glue(helper_roundpd, SUFFIX) (Reg *d, Reg *s, uint32_t mode) | |
1626 | +{ | |
1627 | + signed char prev_rounding_mode; | |
1628 | + | |
1629 | + prev_rounding_mode = env->sse_status.float_rounding_mode; | |
1630 | + if (!(mode & (1 << 2))) | |
1631 | + switch (mode & 3) { | |
1632 | + case 0: | |
1633 | + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); | |
1634 | + break; | |
1635 | + case 1: | |
1636 | + set_float_rounding_mode(float_round_down, &env->sse_status); | |
1637 | + break; | |
1638 | + case 2: | |
1639 | + set_float_rounding_mode(float_round_up, &env->sse_status); | |
1640 | + break; | |
1641 | + case 3: | |
1642 | + set_float_rounding_mode(float_round_to_zero, &env->sse_status); | |
1643 | + break; | |
1644 | + } | |
1645 | + | |
1646 | + d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status); | |
1647 | + d->Q(1) = float64_round_to_int(s->Q(1), &env->sse_status); | |
1648 | + | |
1649 | +#if 0 /* TODO */ | |
1650 | + if (mode & (1 << 3)) | |
1651 | + set_float_exception_flags( | |
1652 | + get_float_exception_flags(&env->sse_status) & | |
1653 | + ~float_flag_inexact, | |
1654 | + &env->sse_status); | |
1655 | +#endif | |
1656 | + env->sse_status.float_rounding_mode = prev_rounding_mode; | |
1657 | +} | |
1658 | + | |
1659 | +void glue(helper_roundss, SUFFIX) (Reg *d, Reg *s, uint32_t mode) | |
1660 | +{ | |
1661 | + signed char prev_rounding_mode; | |
1662 | + | |
1663 | + prev_rounding_mode = env->sse_status.float_rounding_mode; | |
1664 | + if (!(mode & (1 << 2))) | |
1665 | + switch (mode & 3) { | |
1666 | + case 0: | |
1667 | + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); | |
1668 | + break; | |
1669 | + case 1: | |
1670 | + set_float_rounding_mode(float_round_down, &env->sse_status); | |
1671 | + break; | |
1672 | + case 2: | |
1673 | + set_float_rounding_mode(float_round_up, &env->sse_status); | |
1674 | + break; | |
1675 | + case 3: | |
1676 | + set_float_rounding_mode(float_round_to_zero, &env->sse_status); | |
1677 | + break; | |
1678 | + } | |
1679 | + | |
1680 | + d->L(0) = float64_round_to_int(s->L(0), &env->sse_status); | |
1681 | + | |
1682 | +#if 0 /* TODO */ | |
1683 | + if (mode & (1 << 3)) | |
1684 | + set_float_exception_flags( | |
1685 | + get_float_exception_flags(&env->sse_status) & | |
1686 | + ~float_flag_inexact, | |
1687 | + &env->sse_status); | |
1688 | +#endif | |
1689 | + env->sse_status.float_rounding_mode = prev_rounding_mode; | |
1690 | +} | |
1691 | + | |
1692 | +void glue(helper_roundsd, SUFFIX) (Reg *d, Reg *s, uint32_t mode) | |
1693 | +{ | |
1694 | + signed char prev_rounding_mode; | |
1695 | + | |
1696 | + prev_rounding_mode = env->sse_status.float_rounding_mode; | |
1697 | + if (!(mode & (1 << 2))) | |
1698 | + switch (mode & 3) { | |
1699 | + case 0: | |
1700 | + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); | |
1701 | + break; | |
1702 | + case 1: | |
1703 | + set_float_rounding_mode(float_round_down, &env->sse_status); | |
1704 | + break; | |
1705 | + case 2: | |
1706 | + set_float_rounding_mode(float_round_up, &env->sse_status); | |
1707 | + break; | |
1708 | + case 3: | |
1709 | + set_float_rounding_mode(float_round_to_zero, &env->sse_status); | |
1710 | + break; | |
1711 | + } | |
1712 | + | |
1713 | + d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status); | |
1714 | + | |
1715 | +#if 0 /* TODO */ | |
1716 | + if (mode & (1 << 3)) | |
1717 | + set_float_exception_flags( | |
1718 | + get_float_exception_flags(&env->sse_status) & | |
1719 | + ~float_flag_inexact, | |
1720 | + &env->sse_status); | |
1721 | +#endif | |
1722 | + env->sse_status.float_rounding_mode = prev_rounding_mode; | |
1723 | +} | |
1724 | + | |
1725 | +#define FBLENDP(d, s, m) m ? s : d | |
1726 | +SSE_HELPER_I(helper_blendps, L, 4, FBLENDP) | |
1727 | +SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP) | |
1728 | +SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP) | |
1729 | + | |
1730 | +void glue(helper_dpps, SUFFIX) (Reg *d, Reg *s, uint32_t mask) | |
1731 | +{ | |
1732 | + float32 iresult = 0 /*float32_zero*/; | |
1733 | + | |
1734 | + if (mask & (1 << 4)) | |
1735 | + iresult = float32_add(iresult, | |
1736 | + float32_mul(d->L(0), s->L(0), &env->sse_status), | |
1737 | + &env->sse_status); | |
1738 | + if (mask & (1 << 5)) | |
1739 | + iresult = float32_add(iresult, | |
1740 | + float32_mul(d->L(1), s->L(1), &env->sse_status), | |
1741 | + &env->sse_status); | |
1742 | + if (mask & (1 << 6)) | |
1743 | + iresult = float32_add(iresult, | |
1744 | + float32_mul(d->L(2), s->L(2), &env->sse_status), | |
1745 | + &env->sse_status); | |
1746 | + if (mask & (1 << 7)) | |
1747 | + iresult = float32_add(iresult, | |
1748 | + float32_mul(d->L(3), s->L(3), &env->sse_status), | |
1749 | + &env->sse_status); | |
1750 | + d->L(0) = (mask & (1 << 0)) ? iresult : 0 /*float32_zero*/; | |
1751 | + d->L(1) = (mask & (1 << 1)) ? iresult : 0 /*float32_zero*/; | |
1752 | + d->L(2) = (mask & (1 << 2)) ? iresult : 0 /*float32_zero*/; | |
1753 | + d->L(3) = (mask & (1 << 3)) ? iresult : 0 /*float32_zero*/; | |
1754 | +} | |
1755 | + | |
1756 | +void glue(helper_dppd, SUFFIX) (Reg *d, Reg *s, uint32_t mask) | |
1757 | +{ | |
1758 | + float64 iresult = 0 /*float64_zero*/; | |
1759 | + | |
1760 | + if (mask & (1 << 4)) | |
1761 | + iresult = float64_add(iresult, | |
1762 | + float64_mul(d->Q(0), s->Q(0), &env->sse_status), | |
1763 | + &env->sse_status); | |
1764 | + if (mask & (1 << 5)) | |
1765 | + iresult = float64_add(iresult, | |
1766 | + float64_mul(d->Q(1), s->Q(1), &env->sse_status), | |
1767 | + &env->sse_status); | |
1768 | + d->Q(0) = (mask & (1 << 0)) ? iresult : 0 /*float64_zero*/; | |
1769 | + d->Q(1) = (mask & (1 << 1)) ? iresult : 0 /*float64_zero*/; | |
1770 | +} | |
1771 | + | |
1772 | +void glue(helper_mpsadbw, SUFFIX) (Reg *d, Reg *s, uint32_t offset) | |
1773 | +{ | |
1774 | + int s0 = (offset & 3) << 2; | |
1775 | + int d0 = (offset & 4) << 0; | |
1776 | + int i; | |
1777 | + Reg r; | |
1778 | + | |
1779 | + for (i = 0; i < 8; i++, d0++) { | |
1780 | + r.W(i) = 0; | |
1781 | + r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0)); | |
1782 | + r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1)); | |
1783 | + r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2)); | |
1784 | + r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3)); | |
1785 | + } | |
1786 | + | |
1787 | + *d = r; | |
1788 | +} | |
1789 | + | |
/* SSE4.2 op helpers */
/*
 * PCMPGTQ compares its operands as signed 64-bit integers, so both are cast
 * to int64_t before the comparison (the register elements are unsigned).
 * Yields all-ones when d > s, otherwise zero.
 */
#define FCMPGTQ(d, s) ((int64_t) (d) > (int64_t) (s) ? -1 : 0)
/* Instantiate pcmpgtq: applies FCMPGTQ through the SSE_HELPER_Q template. */
SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
1794 | + | |
1795 | +static inline int pcmp_elen(int reg, uint32_t ctrl) | |
1796 | +{ | |
1797 | + int val; | |
1798 | + | |
1799 | + /* Presence of REX.W is indicated by a bit higher than 7 set */ | |
1800 | + if (ctrl >> 8) | |
1801 | + val = abs1((int64_t) env->regs[reg]); | |
1802 | + else | |
1803 | + val = abs1((int32_t) env->regs[reg]); | |
1804 | + | |
1805 | + if (ctrl & 1) { | |
1806 | + if (val > 8) | |
1807 | + return 8; | |
1808 | + } else | |
1809 | + if (val > 16) | |
1810 | + return 16; | |
1811 | + | |
1812 | + return val; | |
1813 | +} | |
1814 | + | |
1815 | +static inline int pcmp_ilen(Reg *r, uint8_t ctrl) | |
1816 | +{ | |
1817 | + int val = 0; | |
1818 | + | |
1819 | + if (ctrl & 1) { | |
1820 | + while (val < 8 && r->W(val)) | |
1821 | + val++; | |
1822 | + } else | |
1823 | + while (val < 16 && r->B(val)) | |
1824 | + val++; | |
1825 | + | |
1826 | + return val; | |
1827 | +} | |
1828 | + | |
1829 | +static inline int pcmp_val(Reg *r, uint8_t ctrl, int i) | |
1830 | +{ | |
1831 | + switch ((ctrl >> 0) & 3) { | |
1832 | + case 0: | |
1833 | + return r->B(i); | |
1834 | + case 1: | |
1835 | + return r->W(i); | |
1836 | + case 2: | |
1837 | + return (int8_t) r->B(i); | |
1838 | + case 3: | |
1839 | + default: | |
1840 | + return (int16_t) r->W(i); | |
1841 | + } | |
1842 | +} | |
1843 | + | |
/*
 * Common comparison core of the PCMPxSTRx helpers.
 *
 * d, s:   the two string operands.
 * ctrl:   control byte; bit 0 and bits 0-1 select the element format (see
 *         pcmp_val), bits 2-3 the aggregation performed by the first switch,
 *         bits 4-5 the post-processing performed by the second switch.
 * valids: number of valid elements in s.
 * validd: number of valid elements in d.
 *
 * Returns the result bit mask and replaces CC_SRC with the flags computed
 * here (CC_Z, CC_S, CC_C, CC_O).
 *
 * NOTE(review): ctrl is declared int8_t here while the other pcmp_* helpers
 * take uint8_t; all uses mask the value, but confirm this is intentional.
 */
static inline unsigned pcmpxstrx(Reg *d, Reg *s,
        int8_t ctrl, int valids, int validd)
{
    unsigned int res = 0;
    int v;
    int j, i;
    /* Index of the last element in a full register: 7 words or 15 bytes. */
    int upper = (ctrl & 1) ? 7 : 15;

    /* Turn the element counts into "index of last valid element" (-1 if
       the string is empty). */
    valids--;
    validd--;

    /* CC_Z / CC_S report that s / d is shorter than a full register. */
    CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);

    switch ((ctrl >> 2) & 3) {
    case 0:
        /* Bit j is set when s[j] equals any valid element of d. */
        for (j = valids; j >= 0; j--) {
            res <<= 1;
            v = pcmp_val(s, ctrl, j);
            for (i = validd; i >= 0; i--)
                res |= (v == pcmp_val(d, ctrl, i));
        }
        break;
    case 1:
        /* d is consumed as (d[i - 1], d[i]) pairs; bit j is set when
           d[i] <= s[j] <= d[i - 1] for some pair.
           NOTE(review): if each pair is meant to be (lower, upper) bound,
           the two comparisons look inverted -- confirm against the
           instruction reference. */
        for (j = valids; j >= 0; j--) {
            res <<= 1;
            v = pcmp_val(s, ctrl, j);
            for (i = ((validd - 1) | 1); i >= 0; i -= 2)
                res |= (pcmp_val(d, ctrl, i - 0) <= v &&
                        pcmp_val(d, ctrl, i - 1) >= v);
        }
        break;
    case 2:
        /* Element-wise equality over the common valid prefix; the bits
           above it are pre-filled from the two lengths. */
        res = (2 << (upper - MAX(valids, validd))) - 1;
        res <<= MAX(valids, validd) - MIN(valids, validd);
        for (i = MIN(valids, validd); i >= 0; i--) {
            res <<= 1;
            v = pcmp_val(s, ctrl, i);
            res |= (v == pcmp_val(d, ctrl, i));
        }
        break;
    case 3:
        /* Bit j is meant to report whether d matches s starting at
           offset j.
           NOTE(review): "res &= (cond)" ANDs the whole accumulator with a
           0/1 value, which also clears every bit gathered for higher j --
           verify whether only the newest bit was intended. */
        for (j = valids - validd; j >= 0; j--) {
            res <<= 1;
            res |= 1;
            for (i = MIN(upper - j, validd); i >= 0; i--)
                res &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
        }
        break;
    }

    /* Optional inversion of the result: over the full register width
       (case 1) or only over the valid elements of s (case 3). */
    switch ((ctrl >> 4) & 3) {
    case 1:
        res ^= (2 << upper) - 1;
        break;
    case 3:
        res ^= (2 << valids) - 1;
        break;
    }

    /* CC_C: result is non-zero; CC_O: bit 0 of the result is set. */
    if (res)
        CC_SRC |= CC_C;
    if (res & 1)
        CC_SRC |= CC_O;

    return res;
}
1910 | + | |
/*
 * Return the 1-based position of the most significant set bit of val,
 * found by binary search over halving shift distances.  Returns 1 when
 * val is 0.
 */
static inline int rffs1(unsigned int val)
{
    int pos = 1;
    int step = sizeof(val) * 4;

    while (step) {
        if (val >> step) {
            val >>= step;
            pos += step;
        }
        step /= 2;
    }

    return pos;
}
1923 | + | |
/*
 * Return the 1-based position of the least significant set bit of val,
 * or 0 when val is 0 (same convention as POSIX ffs()).
 *
 * Binary search: whenever the low `hi` bits are all zero, the lowest set
 * bit lies above them, so they are shifted out and counted.  The previous
 * version shifted left instead and therefore returned 32 minus the number
 * of trailing zeros (e.g. 32 for val == 1).
 */
static inline int ffs1(unsigned int val)
{
    int ret = 1, hi;

    if (!val) {
        return 0;
    }

    for (hi = sizeof(val) * 4; hi; hi /= 2) {
        if (!(val & ((1u << hi) - 1))) {
            val >>= hi;
            ret += hi;
        }
    }

    return ret;
}
1936 | + | |
1937 | +void glue(helper_pcmpestri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) | |
1938 | +{ | |
1939 | + unsigned int res = pcmpxstrx(d, s, ctrl, | |
1940 | + pcmp_elen(R_EDX, ctrl), | |
1941 | + pcmp_elen(R_EAX, ctrl)); | |
1942 | + | |
1943 | + if (res) | |
1944 | + env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1; | |
1945 | + else | |
1946 | + env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); | |
1947 | +} | |
1948 | + | |
1949 | +void glue(helper_pcmpestrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) | |
1950 | +{ | |
1951 | + int i; | |
1952 | + unsigned int res = pcmpxstrx(d, s, ctrl, | |
1953 | + pcmp_elen(R_EDX, ctrl), | |
1954 | + pcmp_elen(R_EAX, ctrl)); | |
1955 | + | |
1956 | + if ((ctrl >> 6) & 1) { | |
1957 | + if (ctrl & 1) | |
1958 | + for (i = 0; i <= 8; i--, res >>= 1) | |
1959 | + d->W(i) = (res & 1) ? ~0 : 0; | |
1960 | + else | |
1961 | + for (i = 0; i <= 16; i--, res >>= 1) | |
1962 | + d->B(i) = (res & 1) ? ~0 : 0; | |
1963 | + } else { | |
1964 | + d->Q(1) = 0; | |
1965 | + d->Q(0) = res; | |
1966 | + } | |
1967 | +} | |
1968 | + | |
1969 | +void glue(helper_pcmpistri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) | |
1970 | +{ | |
1971 | + unsigned int res = pcmpxstrx(d, s, ctrl, | |
1972 | + pcmp_ilen(s, ctrl), | |
1973 | + pcmp_ilen(d, ctrl)); | |
1974 | + | |
1975 | + if (res) | |
1976 | + env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1; | |
1977 | + else | |
1978 | + env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); | |
1979 | +} | |
1980 | + | |
1981 | +void glue(helper_pcmpistrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) | |
1982 | +{ | |
1983 | + int i; | |
1984 | + unsigned int res = pcmpxstrx(d, s, ctrl, | |
1985 | + pcmp_ilen(s, ctrl), | |
1986 | + pcmp_ilen(d, ctrl)); | |
1987 | + | |
1988 | + if ((ctrl >> 6) & 1) { | |
1989 | + if (ctrl & 1) | |
1990 | + for (i = 0; i <= 8; i--, res >>= 1) | |
1991 | + d->W(i) = (res & 1) ? ~0 : 0; | |
1992 | + else | |
1993 | + for (i = 0; i <= 16; i--, res >>= 1) | |
1994 | + d->B(i) = (res & 1) ? ~0 : 0; | |
1995 | + } else { | |
1996 | + d->Q(1) = 0; | |
1997 | + d->Q(0) = res; | |
1998 | + } | |
1999 | +} | |
2000 | + | |
2001 | +#define CRCPOLY 0x1edc6f41 | |
2002 | +#define CRCPOLY_BITREV 0x82f63b78 | |
2003 | +target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) | |
2004 | +{ | |
2005 | + target_ulong crc = (msg & ((target_ulong) -1 >> | |
2006 | + (TARGET_LONG_BITS - len))) ^ crc1; | |
2007 | + | |
2008 | + while (len--) | |
2009 | + crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0); | |
2010 | + | |
2011 | + return crc; | |
2012 | +} | |
2013 | + | |
2014 | +#define POPMASK(i) ((target_ulong) -1 / ((1LL << (1 << i)) + 1)) | |
2015 | +#define POPCOUNT(n, i) (n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i)) | |
2016 | +target_ulong helper_popcnt(target_ulong n, uint32_t type) | |
2017 | +{ | |
2018 | + CC_SRC = n ? 0 : CC_Z; | |
2019 | + | |
2020 | + n = POPCOUNT(n, 0); | |
2021 | + n = POPCOUNT(n, 1); | |
2022 | + n = POPCOUNT(n, 2); | |
2023 | + n = POPCOUNT(n, 3); | |
2024 | + if (type == 1) | |
2025 | + return n & 0xff; | |
2026 | + | |
2027 | + n = POPCOUNT(n, 4); | |
2028 | +#ifndef TARGET_X86_64 | |
2029 | + return n; | |
2030 | +#else | |
2031 | + if (type == 2) | |
2032 | + return n & 0xff; | |
2033 | + | |
2034 | + return POPCOUNT(n, 5); | |
2035 | +#endif | |
2036 | +} | |
2037 | +#endif | |
2038 | + | |
1423 | 2039 | #undef SHIFT |
1424 | 2040 | #undef XMM_ONLY |
1425 | 2041 | #undef Reg | ... | ... |
target-i386/ops_sse_header.h
1 | 1 | /* |
2 | - * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support | |
2 | + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support | |
3 | 3 | * |
4 | 4 | * Copyright (c) 2005 Fabrice Bellard |
5 | 5 | * |
... | ... | @@ -269,6 +269,61 @@ DEF_HELPER(void, glue(helper_psignw, SUFFIX), (Reg *d, Reg *s)) |
269 | 269 | DEF_HELPER(void, glue(helper_psignd, SUFFIX), (Reg *d, Reg *s)) |
270 | 270 | DEF_HELPER(void, glue(helper_palignr, SUFFIX), (Reg *d, Reg *s, int32_t shift)) |
271 | 271 | |
/* SSE4.1 op helpers */
/* Declared only for SHIFT == 1; the definitions are guarded the same way. */
#if SHIFT == 1
/* Blends using XMM0 as implicit mask, and PTEST. */
DEF_HELPER(void, glue(helper_pblendvb, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_blendvps, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_blendvpd, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_ptest, SUFFIX), (Reg *d, Reg *s))
/* Sign- and zero-extending moves. */
DEF_HELPER(void, glue(helper_pmovsxbw, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmovsxbd, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmovsxbq, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmovsxwd, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmovsxwq, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmovsxdq, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmovzxbw, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmovzxbd, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmovzxbq, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmovzxwd, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmovzxwq, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmovzxdq, SUFFIX), (Reg *d, Reg *s))
/* Integer multiply, compare, pack and min/max operations. */
DEF_HELPER(void, glue(helper_pmuldq, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pcmpeqq, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_packusdw, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pminsb, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pminsd, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pminuw, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pminud, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmaxsb, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmaxsd, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmaxuw, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmaxud, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pmulld, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_phminposuw, SUFFIX), (Reg *d, Reg *s))
/* Helpers taking an additional 32-bit immediate operand. */
DEF_HELPER(void, glue(helper_roundps, SUFFIX), (Reg *d, Reg *s, uint32_t mode))
DEF_HELPER(void, glue(helper_roundpd, SUFFIX), (Reg *d, Reg *s, uint32_t mode))
DEF_HELPER(void, glue(helper_roundss, SUFFIX), (Reg *d, Reg *s, uint32_t mode))
DEF_HELPER(void, glue(helper_roundsd, SUFFIX), (Reg *d, Reg *s, uint32_t mode))
DEF_HELPER(void, glue(helper_blendps, SUFFIX), (Reg *d, Reg *s, uint32_t imm))
DEF_HELPER(void, glue(helper_blendpd, SUFFIX), (Reg *d, Reg *s, uint32_t imm))
DEF_HELPER(void, glue(helper_pblendw, SUFFIX), (Reg *d, Reg *s, uint32_t imm))
DEF_HELPER(void, glue(helper_dpps, SUFFIX), (Reg *d, Reg *s, uint32_t mask))
DEF_HELPER(void, glue(helper_dppd, SUFFIX), (Reg *d, Reg *s, uint32_t mask))
DEF_HELPER(void, glue(helper_mpsadbw, SUFFIX), (Reg *d, Reg *s, uint32_t off))
#endif

/* SSE4.2 op helpers */
#if SHIFT == 1
/* 64-bit compare and the string-compare helpers (ctl is the control byte). */
DEF_HELPER(void, glue(helper_pcmpgtq, SUFFIX), (Reg *d, Reg *s))
DEF_HELPER(void, glue(helper_pcmpestri, SUFFIX), (Reg *d, Reg *s, uint32_t ctl))
DEF_HELPER(void, glue(helper_pcmpestrm, SUFFIX), (Reg *d, Reg *s, uint32_t ctl))
DEF_HELPER(void, glue(helper_pcmpistri, SUFFIX), (Reg *d, Reg *s, uint32_t ctl))
DEF_HELPER(void, glue(helper_pcmpistrm, SUFFIX), (Reg *d, Reg *s, uint32_t ctl))
/* General-purpose-register helpers: no SUFFIX, they return target_ulong. */
DEF_HELPER(target_ulong, helper_crc32,
 (uint32_t crc1, target_ulong msg, uint32_t len))
DEF_HELPER(target_ulong, helper_popcnt, (target_ulong n, uint32_t type))
#endif
326 | + | |
272 | 327 | #undef SHIFT |
273 | 328 | #undef Reg |
274 | 329 | #undef SUFFIX | ... | ... |
target-i386/translate.c
... | ... | @@ -2140,7 +2140,7 @@ static void gen_add_A0_ds_seg(DisasContext *s) |
2140 | 2140 | } |
2141 | 2141 | } |
2142 | 2142 | |
2143 | -/* generate modrm memory load or store of 'reg'. TMP0 is used if reg != | |
2143 | +/* generate modrm memory load or store of 'reg'. TMP0 is used if reg == | |
2144 | 2144 | OR_TMP0 */ |
2145 | 2145 | static void gen_ldst_modrm(DisasContext *s, int modrm, int ot, int reg, int is_store) |
2146 | 2146 | { |
... | ... | @@ -2770,8 +2770,8 @@ static void *sse_op_table1[256][4] = { |
2770 | 2770 | [0xc2] = SSE_FOP(cmpeq), |
2771 | 2771 | [0xc6] = { helper_shufps, helper_shufpd }, |
2772 | 2772 | |
2773 | - [0x38] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3 */ | |
2774 | - [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3 */ | |
2773 | + [0x38] = { SSE_SPECIAL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* SSSE3/SSE4 */ | |
2774 | + [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3/SSE4 */ | |
2775 | 2775 | |
2776 | 2776 | /* MMX ops and their SSE extensions */ |
2777 | 2777 | [0x60] = MMX_OP2(punpcklbw), |
... | ... | @@ -2924,26 +2924,85 @@ static void *sse_op_table5[256] = { |
2924 | 2924 | [0xbf] = helper_pavgb_mmx /* pavgusb */ |
2925 | 2925 | }; |
2926 | 2926 | |
2927 | -static void *sse_op_table6[256][2] = { | |
2928 | - [0x00] = MMX_OP2(pshufb), | |
2929 | - [0x01] = MMX_OP2(phaddw), | |
2930 | - [0x02] = MMX_OP2(phaddd), | |
2931 | - [0x03] = MMX_OP2(phaddsw), | |
2932 | - [0x04] = MMX_OP2(pmaddubsw), | |
2933 | - [0x05] = MMX_OP2(phsubw), | |
2934 | - [0x06] = MMX_OP2(phsubd), | |
2935 | - [0x07] = MMX_OP2(phsubsw), | |
2936 | - [0x08] = MMX_OP2(psignb), | |
2937 | - [0x09] = MMX_OP2(psignw), | |
2938 | - [0x0a] = MMX_OP2(psignd), | |
2939 | - [0x0b] = MMX_OP2(pmulhrsw), | |
2940 | - [0x1c] = MMX_OP2(pabsb), | |
2941 | - [0x1d] = MMX_OP2(pabsw), | |
2942 | - [0x1e] = MMX_OP2(pabsd), | |
2927 | +struct sse_op_helper_s { | |
2928 | + void *op[2]; uint32_t ext_mask; | |
2929 | +}; | |
2930 | +#define SSSE3_OP(x) { MMX_OP2(x), CPUID_EXT_SSSE3 } | |
2931 | +#define SSE41_OP(x) { { NULL, helper_ ## x ## _xmm }, CPUID_EXT_SSE41 } | |
2932 | +#define SSE42_OP(x) { { NULL, helper_ ## x ## _xmm }, CPUID_EXT_SSE42 } | |
2933 | +#define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 } | |
2934 | +static struct sse_op_helper_s sse_op_table6[256] = { | |
2935 | + [0x00] = SSSE3_OP(pshufb), | |
2936 | + [0x01] = SSSE3_OP(phaddw), | |
2937 | + [0x02] = SSSE3_OP(phaddd), | |
2938 | + [0x03] = SSSE3_OP(phaddsw), | |
2939 | + [0x04] = SSSE3_OP(pmaddubsw), | |
2940 | + [0x05] = SSSE3_OP(phsubw), | |
2941 | + [0x06] = SSSE3_OP(phsubd), | |
2942 | + [0x07] = SSSE3_OP(phsubsw), | |
2943 | + [0x08] = SSSE3_OP(psignb), | |
2944 | + [0x09] = SSSE3_OP(psignw), | |
2945 | + [0x0a] = SSSE3_OP(psignd), | |
2946 | + [0x0b] = SSSE3_OP(pmulhrsw), | |
2947 | + [0x10] = SSE41_OP(pblendvb), | |
2948 | + [0x14] = SSE41_OP(blendvps), | |
2949 | + [0x15] = SSE41_OP(blendvpd), | |
2950 | + [0x17] = SSE41_OP(ptest), | |
2951 | + [0x1c] = SSSE3_OP(pabsb), | |
2952 | + [0x1d] = SSSE3_OP(pabsw), | |
2953 | + [0x1e] = SSSE3_OP(pabsd), | |
2954 | + [0x20] = SSE41_OP(pmovsxbw), | |
2955 | + [0x21] = SSE41_OP(pmovsxbd), | |
2956 | + [0x22] = SSE41_OP(pmovsxbq), | |
2957 | + [0x23] = SSE41_OP(pmovsxwd), | |
2958 | + [0x24] = SSE41_OP(pmovsxwq), | |
2959 | + [0x25] = SSE41_OP(pmovsxdq), | |
2960 | + [0x28] = SSE41_OP(pmuldq), | |
2961 | + [0x29] = SSE41_OP(pcmpeqq), | |
2962 | + [0x2a] = SSE41_SPECIAL, /* movntqda */ | |
2963 | + [0x2b] = SSE41_OP(packusdw), | |
2964 | + [0x30] = SSE41_OP(pmovzxbw), | |
2965 | + [0x31] = SSE41_OP(pmovzxbd), | |
2966 | + [0x32] = SSE41_OP(pmovzxbq), | |
2967 | + [0x33] = SSE41_OP(pmovzxwd), | |
2968 | + [0x34] = SSE41_OP(pmovzxwq), | |
2969 | + [0x35] = SSE41_OP(pmovzxdq), | |
2970 | + [0x37] = SSE42_OP(pcmpgtq), | |
2971 | + [0x38] = SSE41_OP(pminsb), | |
2972 | + [0x39] = SSE41_OP(pminsd), | |
2973 | + [0x3a] = SSE41_OP(pminuw), | |
2974 | + [0x3b] = SSE41_OP(pminud), | |
2975 | + [0x3c] = SSE41_OP(pmaxsb), | |
2976 | + [0x3d] = SSE41_OP(pmaxsd), | |
2977 | + [0x3e] = SSE41_OP(pmaxuw), | |
2978 | + [0x3f] = SSE41_OP(pmaxud), | |
2979 | + [0x40] = SSE41_OP(pmulld), | |
2980 | + [0x41] = SSE41_OP(phminposuw), | |
2943 | 2981 | }; |
2944 | 2982 | |
2945 | -static void *sse_op_table7[256][2] = { | |
2946 | - [0x0f] = MMX_OP2(palignr), | |
2983 | +static struct sse_op_helper_s sse_op_table7[256] = { | |
2984 | + [0x08] = SSE41_OP(roundps), | |
2985 | + [0x09] = SSE41_OP(roundpd), | |
2986 | + [0x0a] = SSE41_OP(roundss), | |
2987 | + [0x0b] = SSE41_OP(roundsd), | |
2988 | + [0x0c] = SSE41_OP(blendps), | |
2989 | + [0x0d] = SSE41_OP(blendpd), | |
2990 | + [0x0e] = SSE41_OP(pblendw), | |
2991 | + [0x0f] = SSSE3_OP(palignr), | |
2992 | + [0x14] = SSE41_SPECIAL, /* pextrb */ | |
2993 | + [0x15] = SSE41_SPECIAL, /* pextrw */ | |
2994 | + [0x16] = SSE41_SPECIAL, /* pextrd/pextrq */ | |
2995 | + [0x17] = SSE41_SPECIAL, /* extractps */ | |
2996 | + [0x20] = SSE41_SPECIAL, /* pinsrb */ | |
2997 | + [0x21] = SSE41_SPECIAL, /* insertps */ | |
2998 | + [0x22] = SSE41_SPECIAL, /* pinsrd/pinsrq */ | |
2999 | + [0x40] = SSE41_OP(dpps), | |
3000 | + [0x41] = SSE41_OP(dppd), | |
3001 | + [0x42] = SSE41_OP(mpsadbw), | |
3002 | + [0x60] = SSE42_OP(pcmpestrm), | |
3003 | + [0x61] = SSE42_OP(pcmpestri), | |
3004 | + [0x62] = SSE42_OP(pcmpistrm), | |
3005 | + [0x63] = SSE42_OP(pcmpistri), | |
2947 | 3006 | }; |
2948 | 3007 | |
2949 | 3008 | static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) |
... | ... | @@ -3511,18 +3570,20 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) |
3511 | 3570 | break; |
3512 | 3571 | case 0x038: |
3513 | 3572 | case 0x138: |
3514 | - if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3)) | |
3515 | - goto illegal_op; | |
3516 | - | |
3517 | 3573 | b = modrm; |
3518 | 3574 | modrm = ldub_code(s->pc++); |
3519 | 3575 | rm = modrm & 7; |
3520 | 3576 | reg = ((modrm >> 3) & 7) | rex_r; |
3521 | 3577 | mod = (modrm >> 6) & 3; |
3522 | 3578 | |
3523 | - sse_op2 = sse_op_table6[b][b1]; | |
3579 | + if (s->prefix & PREFIX_REPNZ) | |
3580 | + goto crc32; | |
3581 | + | |
3582 | + sse_op2 = sse_op_table6[b].op[b1]; | |
3524 | 3583 | if (!sse_op2) |
3525 | 3584 | goto illegal_op; |
3585 | + if (!(s->cpuid_ext_features & sse_op_table6[b].ext_mask)) | |
3586 | + goto illegal_op; | |
3526 | 3587 | |
3527 | 3588 | if (b1) { |
3528 | 3589 | op1_offset = offsetof(CPUX86State,xmm_regs[reg]); |
... | ... | @@ -3531,7 +3592,32 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) |
3531 | 3592 | } else { |
3532 | 3593 | op2_offset = offsetof(CPUX86State,xmm_t0); |
3533 | 3594 | gen_lea_modrm(s, modrm, ®_addr, &offset_addr); |
3534 | - gen_ldo_env_A0(s->mem_index, op2_offset); | |
3595 | + switch (b) { | |
3596 | + case 0x20: case 0x30: /* pmovsxbw, pmovzxbw */ | |
3597 | + case 0x23: case 0x33: /* pmovsxwd, pmovzxwd */ | |
3598 | + case 0x25: case 0x35: /* pmovsxdq, pmovzxdq */ | |
3599 | + gen_ldq_env_A0(s->mem_index, op2_offset + | |
3600 | + offsetof(XMMReg, XMM_Q(0))); | |
3601 | + break; | |
3602 | + case 0x21: case 0x31: /* pmovsxbd, pmovzxbd */ | |
3603 | + case 0x24: case 0x34: /* pmovsxwq, pmovzxwq */ | |
3604 | + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0, | |
3605 | + (s->mem_index >> 2) - 1); | |
3606 | + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, op2_offset + | |
3607 | + offsetof(XMMReg, XMM_L(0))); | |
3608 | + break; | |
3609 | + case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */ | |
3610 | + tcg_gen_qemu_ld16u(cpu_tmp0, cpu_A0, | |
3611 | + (s->mem_index >> 2) - 1); | |
3612 | + tcg_gen_st16_tl(cpu_tmp0, cpu_env, op2_offset + | |
3613 | + offsetof(XMMReg, XMM_W(0))); | |
3614 | + break; | |
3615 | + case 0x2a: /* movntdqa */ | |
3616 | + gen_ldo_env_A0(s->mem_index, op1_offset); | |
3617 | + return; | |
3618 | + default: | |
3619 | + gen_ldo_env_A0(s->mem_index, op2_offset); | |
3620 | + } | |
3535 | 3621 | } |
3536 | 3622 | } else { |
3537 | 3623 | op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); |
... | ... | @@ -3543,24 +3629,177 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) |
3543 | 3629 | gen_ldq_env_A0(s->mem_index, op2_offset); |
3544 | 3630 | } |
3545 | 3631 | } |
3632 | + if (sse_op2 == SSE_SPECIAL) | |
3633 | + goto illegal_op; | |
3634 | + | |
3546 | 3635 | tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); |
3547 | 3636 | tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); |
3548 | 3637 | tcg_gen_helper_0_2(sse_op2, cpu_ptr0, cpu_ptr1); |
3638 | + | |
3639 | + if (b == 0x17) | |
3640 | + s->cc_op = CC_OP_EFLAGS; | |
3549 | 3641 | break; |
3550 | - case 0x03a: | |
3551 | - case 0x13a: | |
3552 | - if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3)) | |
3642 | + case 0x338: /* crc32 */ | |
3643 | + crc32: | |
3644 | + b = modrm; | |
3645 | + modrm = ldub_code(s->pc++); | |
3646 | + reg = ((modrm >> 3) & 7) | rex_r; | |
3647 | + | |
3648 | + if (b != 0xf0 && b != 0xf1) | |
3649 | + goto illegal_op; | |
3650 | + if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) | |
3553 | 3651 | goto illegal_op; |
3554 | 3652 | |
3653 | + if (b == 0xf0) | |
3654 | + ot = OT_BYTE; | |
3655 | + else if (b == 0xf1 && s->dflag != 2) | |
3656 | + if (s->prefix & PREFIX_DATA) | |
3657 | + ot = OT_WORD; | |
3658 | + else | |
3659 | + ot = OT_LONG; | |
3660 | + else | |
3661 | + ot = OT_QUAD; | |
3662 | + | |
3663 | + gen_op_mov_TN_reg(OT_LONG, 0, reg); | |
3664 | + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); | |
3665 | + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); | |
3666 | + tcg_gen_helper_1_3(helper_crc32, cpu_T[0], cpu_tmp2_i32, | |
3667 | + cpu_T[0], tcg_const_i32(8 << ot)); | |
3668 | + | |
3669 | + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG; | |
3670 | + gen_op_mov_reg_T0(ot, reg); | |
3671 | + break; | |
3672 | + case 0x03a: | |
3673 | + case 0x13a: | |
3555 | 3674 | b = modrm; |
3556 | 3675 | modrm = ldub_code(s->pc++); |
3557 | 3676 | rm = modrm & 7; |
3558 | 3677 | reg = ((modrm >> 3) & 7) | rex_r; |
3559 | 3678 | mod = (modrm >> 6) & 3; |
3560 | 3679 | |
3561 | - sse_op2 = sse_op_table7[b][b1]; | |
3680 | + sse_op2 = sse_op_table7[b].op[b1]; | |
3562 | 3681 | if (!sse_op2) |
3563 | 3682 | goto illegal_op; |
3683 | + if (!(s->cpuid_ext_features & sse_op_table7[b].ext_mask)) | |
3684 | + goto illegal_op; | |
3685 | + | |
3686 | + if (sse_op2 == SSE_SPECIAL) { | |
3687 | + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG; | |
3688 | + rm = (modrm & 7) | REX_B(s); | |
3689 | + if (mod != 3) | |
3690 | + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr); | |
3691 | + reg = ((modrm >> 3) & 7) | rex_r; | |
3692 | + val = ldub_code(s->pc++); | |
3693 | + switch (b) { | |
3694 | + case 0x14: /* pextrb */ | |
3695 | + tcg_gen_ld8u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, | |
3696 | + xmm_regs[reg].XMM_B(val & 15))); | |
3697 | + if (mod == 3) | |
3698 | + gen_op_mov_reg_T0(ot, rm); | |
3699 | + else | |
3700 | + tcg_gen_qemu_st8(cpu_T[0], cpu_A0, | |
3701 | + (s->mem_index >> 2) - 1); | |
3702 | + break; | |
3703 | + case 0x15: /* pextrw */ | |
3704 | + tcg_gen_ld16u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, | |
3705 | + xmm_regs[reg].XMM_W(val & 7))); | |
3706 | + if (mod == 3) | |
3707 | + gen_op_mov_reg_T0(ot, rm); | |
3708 | + else | |
3709 | + tcg_gen_qemu_st16(cpu_T[0], cpu_A0, | |
3710 | + (s->mem_index >> 2) - 1); | |
3711 | + break; | |
3712 | + case 0x16: | |
3713 | + if (ot == OT_LONG) { /* pextrd */ | |
3714 | + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env, | |
3715 | + offsetof(CPUX86State, | |
3716 | + xmm_regs[reg].XMM_L(val & 3))); | |
3717 | + if (mod == 3) | |
3718 | + gen_op_mov_reg_v(ot, rm, cpu_tmp2_i32); | |
3719 | + else | |
3720 | + tcg_gen_qemu_st32(cpu_tmp2_i32, cpu_A0, | |
3721 | + (s->mem_index >> 2) - 1); | |
3722 | + } else { /* pextrq */ | |
3723 | + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env, | |
3724 | + offsetof(CPUX86State, | |
3725 | + xmm_regs[reg].XMM_Q(val & 1))); | |
3726 | + if (mod == 3) | |
3727 | + gen_op_mov_reg_v(ot, rm, cpu_tmp1_i64); | |
3728 | + else | |
3729 | + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, | |
3730 | + (s->mem_index >> 2) - 1); | |
3731 | + } | |
3732 | + break; | |
3733 | + case 0x17: /* extractps */ | |
3734 | + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, | |
3735 | + xmm_regs[reg].XMM_L(val & 3))); | |
3736 | + if (mod == 3) | |
3737 | + gen_op_mov_reg_T0(ot, rm); | |
3738 | + else | |
3739 | + tcg_gen_qemu_st32(cpu_T[0], cpu_A0, | |
3740 | + (s->mem_index >> 2) - 1); | |
3741 | + break; | |
3742 | + case 0x20: /* pinsrb */ | |
3743 | + if (mod == 3) | |
3744 | + gen_op_mov_TN_reg(OT_LONG, 0, rm); | |
3745 | + else | |
3746 | + tcg_gen_qemu_ld8u(cpu_T[0], cpu_A0, | |
3747 | + (s->mem_index >> 2) - 1); | |
3748 | + tcg_gen_st8_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, | |
3749 | + xmm_regs[reg].XMM_B(val & 15))); | |
3750 | + break; | |
3751 | + case 0x21: /* insertps */ | |
3752 | + if (mod == 3) | |
3753 | + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env, | |
3754 | + offsetof(CPUX86State,xmm_regs[rm] | |
3755 | + .XMM_L((val >> 6) & 3))); | |
3756 | + else | |
3757 | + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0, | |
3758 | + (s->mem_index >> 2) - 1); | |
3759 | + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, | |
3760 | + offsetof(CPUX86State,xmm_regs[reg] | |
3761 | + .XMM_L((val >> 4) & 3))); | |
3762 | + if ((val >> 0) & 1) | |
3763 | + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), | |
3764 | + cpu_env, offsetof(CPUX86State, | |
3765 | + xmm_regs[reg].XMM_L(0))); | |
3766 | + if ((val >> 1) & 1) | |
3767 | + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), | |
3768 | + cpu_env, offsetof(CPUX86State, | |
3769 | + xmm_regs[reg].XMM_L(1))); | |
3770 | + if ((val >> 2) & 1) | |
3771 | + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), | |
3772 | + cpu_env, offsetof(CPUX86State, | |
3773 | + xmm_regs[reg].XMM_L(2))); | |
3774 | + if ((val >> 3) & 1) | |
3775 | + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), | |
3776 | + cpu_env, offsetof(CPUX86State, | |
3777 | + xmm_regs[reg].XMM_L(3))); | |
3778 | + break; | |
3779 | + case 0x22: | |
3780 | + if (ot == OT_LONG) { /* pinsrd */ | |
3781 | + if (mod == 3) | |
3782 | + gen_op_mov_v_reg(ot, cpu_tmp2_i32, rm); | |
3783 | + else | |
3784 | + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0, | |
3785 | + (s->mem_index >> 2) - 1); | |
3786 | + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, | |
3787 | + offsetof(CPUX86State, | |
3788 | + xmm_regs[reg].XMM_L(val & 3))); | |
3789 | + } else { /* pinsrq */ | |
3790 | + if (mod == 3) | |
3791 | + gen_op_mov_v_reg(ot, cpu_tmp1_i64, rm); | |
3792 | + else | |
3793 | + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, | |
3794 | + (s->mem_index >> 2) - 1); | |
3795 | + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, | |
3796 | + offsetof(CPUX86State, | |
3797 | + xmm_regs[reg].XMM_Q(val & 1))); | |
3798 | + } | |
3799 | + break; | |
3800 | + } | |
3801 | + return; | |
3802 | + } | |
3564 | 3803 | |
3565 | 3804 | if (b1) { |
3566 | 3805 | op1_offset = offsetof(CPUX86State,xmm_regs[reg]); |
... | ... | @@ -3583,6 +3822,14 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) |
3583 | 3822 | } |
3584 | 3823 | val = ldub_code(s->pc++); |
3585 | 3824 | |
3825 | + if ((b & 0xfc) == 0x60) { /* pcmpXstrX */ | |
3826 | + s->cc_op = CC_OP_EFLAGS; | |
3827 | + | |
3828 | + if (s->dflag == 2) | |
3829 | + /* The helper must use entire 64-bit gp registers */ | |
3830 | + val |= 1 << 8; | |
3831 | + } | |
3832 | + | |
3586 | 3833 | tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); |
3587 | 3834 | tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); |
3588 | 3835 | tcg_gen_helper_0_3(sse_op2, cpu_ptr0, cpu_ptr1, tcg_const_i32(val)); |
... | ... | @@ -7094,7 +7341,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) |
7094 | 7341 | gen_eob(s); |
7095 | 7342 | } |
7096 | 7343 | break; |
7097 | - /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3 support */ | |
7344 | + /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4 support */ | |
7098 | 7345 | case 0x1c3: /* MOVNTI reg, mem */ |
7099 | 7346 | if (!(s->cpuid_features & CPUID_SSE2)) |
7100 | 7347 | goto illegal_op; |
... | ... | @@ -7202,6 +7449,28 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) |
7202 | 7449 | tcg_gen_helper_0_0(helper_rsm); |
7203 | 7450 | gen_eob(s); |
7204 | 7451 | break; |
7452 | + case 0x1b8: /* SSE4.2 popcnt */ | |
7453 | + if ((prefixes & (PREFIX_REPZ | PREFIX_LOCK | PREFIX_REPNZ)) != | |
7454 | + PREFIX_REPZ) | |
7455 | + goto illegal_op; | |
7456 | + if (!(s->cpuid_ext_features & CPUID_EXT_POPCNT)) | |
7457 | + goto illegal_op; | |
7458 | + | |
7459 | + modrm = ldub_code(s->pc++); | |
7460 | + reg = ((modrm >> 3) & 7); | |
7461 | + | |
7462 | + if (s->prefix & PREFIX_DATA) | |
7463 | + ot = OT_WORD; | |
7464 | + else if (s->dflag != 2) | |
7465 | + ot = OT_LONG; | |
7466 | + else | |
7467 | + ot = OT_QUAD; | |
7468 | + | |
7469 | + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); | |
7470 | + tcg_gen_helper_1_2(helper_popcnt, | |
7471 | + cpu_T[0], cpu_T[0], tcg_const_i32(ot)); | |
7472 | + gen_op_mov_reg_T0(ot, reg); | |
7473 | + break; | |
7205 | 7474 | case 0x10e ... 0x10f: |
7206 | 7475 | /* 3DNow! instructions, ignore prefixes */ |
7207 | 7476 | s->prefix &= ~(PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA); | ... | ... |
tests/test-i386-ssse3.c
1 | 1 | /* See if various MMX/SSE SSSE3 instructions give expected results */ |
2 | 2 | #include <stdio.h> |
3 | 3 | #include <string.h> |
4 | +#include <stdint.h> | |
4 | 5 | |
5 | 6 | int main(int argc, char *argv[]) { |
6 | 7 | char hello[16]; |
... | ... | @@ -9,9 +10,11 @@ int main(int argc, char *argv[]) { |
9 | 10 | |
10 | 11 | uint64_t a = 0x0000000000090007; |
11 | 12 | uint64_t b = 0x0000000000000000; |
13 | + uint32_t c; | |
14 | + uint16_t d; | |
12 | 15 | |
13 | - const char c[16] = "LLOaaaaaaaaaaaaa"; | |
14 | - const char d[16] = "aaaaaaaaaaaaaaHE"; | |
16 | + const char e[16] = "LLOaaaaaaaaaaaaa"; | |
17 | + const char f[16] = "aaaaaaaaaaaaaaHE"; | |
15 | 18 | |
16 | 19 | /* pshufb mm1/xmm1, mm2/xmm2 */ |
17 | 20 | asm volatile ("movq (%0), %%mm0" : : "r" (ehlo) : "mm0", "mm1"); |
... | ... | @@ -33,10 +36,22 @@ int main(int argc, char *argv[]) { |
33 | 36 | printf("%i - %i = %i\n", 9, 7, -(int16_t) a); |
34 | 37 | |
35 | 38 | /* palignr mm1/xmm1, m64/m128, imm8 */ |
36 | - asm volatile ("movdqa (%0), %%xmm0" : : "r" (c) : "xmm0"); | |
37 | - asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (d)); | |
39 | + asm volatile ("movdqa (%0), %%xmm0" : : "r" (e) : "xmm0"); | |
40 | + asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (f)); | |
38 | 41 | asm volatile ("movdqa %%xmm0, (%0)" : : "r" (hello)); |
39 | 42 | printf("%5.5s\n", hello); |
40 | 43 | |
44 | +#if 1 /* SSE4 */ | |
45 | + /* popcnt r64, r/m64 */ | |
46 | + asm volatile ("movq $0x8421000010009c63, %%rax" : : : "rax"); | |
47 | + asm volatile ("popcnt %%ax, %%dx" : : : "dx"); | |
48 | + asm volatile ("popcnt %%eax, %%ecx" : : : "ecx"); | |
49 | + asm volatile ("popcnt %rax, %rax"); | |
50 | + asm volatile ("movq %%rax, %0" : "=m" (a)); | |
51 | + asm volatile ("movl %%ecx, %0" : "=m" (c)); | |
52 | + asm volatile ("movw %%dx, %0" : "=m" (d)); | |
53 | + printf("%i = %i\n%i = %i = %i\n", 13, (int) a, 9, c, d + 1); | |
54 | +#endif | |
55 | + | |
41 | 56 | return 0; |
42 | 57 | } | ... | ... |