Commit 222a3336ecbe177da082f9ac20f9614d6d23c721

Authored by balrog
1 parent 06adb549

Implement SSE4.1, SSE4.2 (x86).

This adds support for CPUID_EXT_SSE41, CPUID_EXT_SSE42, CPUID_EXT_POPCNT
extensions.  Most instructions haven't been tested yet.


git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@5411 c046a42c-6fe2-441c-8c8c-71466251a162
target-i386/ops_sse.h
1 1 /*
2   - * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support
  2 + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
3 3 *
4 4 * Copyright (c) 2005 Fabrice Bellard
  5 + * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com>
5 6 *
6 7 * This library is free software; you can redistribute it and/or
7 8 * modify it under the terms of the GNU Lesser General Public
... ... @@ -1420,6 +1421,621 @@ void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift)
1420 1421 *d = r;
1421 1422 }
1422 1423  
  1424 +#define XMM0 env->xmm_regs[0]
  1425 +
  1426 +#if SHIFT == 1
/* Three-operand variable blend: combine each element of d and s with the
   corresponding element of the implicit XMM0 operand through F. */
#define SSE_HELPER_V(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int i;\
\
    for (i = 0; i < num; i++) {\
        d->elem(i) = F(d->elem(i), s->elem(i), XMM0.elem(i));\
    }\
}
  1453 +
/* Immediate-controlled blend: bit i of the immediate is passed as the
   third argument of F for element i. */
#define SSE_HELPER_I(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s, uint32_t imm)\
{\
    int i;\
\
    for (i = 0; i < num; i++) {\
        d->elem(i) = F(d->elem(i), s->elem(i), ((imm >> i) & 1));\
    }\
}
  1480 +
  1481 +/* SSE4.1 op helpers */
  1482 +#define FBLENDVB(d, s, m) (m & 0x80) ? s : d
  1483 +#define FBLENDVPS(d, s, m) (m & 0x80000000) ? s : d
  1484 +#define FBLENDVPD(d, s, m) (m & 0x8000000000000000) ? s : d
  1485 +SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
  1486 +SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
  1487 +SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
  1488 +
  1489 +void glue(helper_ptest, SUFFIX) (Reg *d, Reg *s)
  1490 +{
  1491 + uint64_t zf = (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1));
  1492 + uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1));
  1493 +
  1494 + CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
  1495 +}
  1496 +
  1497 +#define SSE_HELPER_F(name, elem, num, F)\
  1498 +void glue(name, SUFFIX) (Reg *d, Reg *s)\
  1499 +{\
  1500 + d->elem(0) = F(0);\
  1501 + d->elem(1) = F(1);\
  1502 + d->elem(2) = F(2);\
  1503 + d->elem(3) = F(3);\
  1504 + if (num > 3) {\
  1505 + d->elem(4) = F(4);\
  1506 + d->elem(5) = F(5);\
  1507 + if (num > 5) {\
  1508 + d->elem(6) = F(6);\
  1509 + d->elem(7) = F(7);\
  1510 + }\
  1511 + }\
  1512 +}
  1513 +
/* pmovsx*: sign-extend, pmovzx*: zero-extend the low source elements
   (bytes/words/dwords) into the wider destination elements. */
SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L)
SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B)
SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B)
SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B)
SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W)
SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W)
SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L)
  1526 +
  1527 +void glue(helper_pmuldq, SUFFIX) (Reg *d, Reg *s)
  1528 +{
  1529 + d->Q(0) = (int64_t) (int32_t) d->L(0) * (int32_t) s->L(0);
  1530 + d->Q(1) = (int64_t) (int32_t) d->L(2) * (int32_t) s->L(2);
  1531 +}
  1532 +
  1533 +#define FCMPEQQ(d, s) d == s ? -1 : 0
  1534 +SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
  1535 +
  1536 +void glue(helper_packusdw, SUFFIX) (Reg *d, Reg *s)
  1537 +{
  1538 + d->W(0) = satuw((int32_t) d->L(0));
  1539 + d->W(1) = satuw((int32_t) d->L(1));
  1540 + d->W(2) = satuw((int32_t) d->L(2));
  1541 + d->W(3) = satuw((int32_t) d->L(3));
  1542 + d->W(4) = satuw((int32_t) s->L(0));
  1543 + d->W(5) = satuw((int32_t) s->L(1));
  1544 + d->W(6) = satuw((int32_t) s->L(2));
  1545 + d->W(7) = satuw((int32_t) s->L(3));
  1546 +}
  1547 +
  1548 +#define FMINSB(d, s) MIN((int8_t) d, (int8_t) s)
  1549 +#define FMINSD(d, s) MIN((int32_t) d, (int32_t) s)
  1550 +#define FMAXSB(d, s) MAX((int8_t) d, (int8_t) s)
  1551 +#define FMAXSD(d, s) MAX((int32_t) d, (int32_t) s)
  1552 +SSE_HELPER_B(helper_pminsb, FMINSB)
  1553 +SSE_HELPER_L(helper_pminsd, FMINSD)
  1554 +SSE_HELPER_W(helper_pminuw, MIN)
  1555 +SSE_HELPER_L(helper_pminud, MIN)
  1556 +SSE_HELPER_B(helper_pmaxsb, FMAXSB)
  1557 +SSE_HELPER_L(helper_pmaxsd, FMAXSD)
  1558 +SSE_HELPER_W(helper_pmaxuw, MAX)
  1559 +SSE_HELPER_L(helper_pmaxud, MAX)
  1560 +
  1561 +#define FMULLD(d, s) (int32_t) d * (int32_t) s
  1562 +SSE_HELPER_L(helper_pmulld, FMULLD)
  1563 +
  1564 +void glue(helper_phminposuw, SUFFIX) (Reg *d, Reg *s)
  1565 +{
  1566 + int idx = 0;
  1567 +
  1568 + if (s->W(1) < s->W(idx))
  1569 + idx = 1;
  1570 + if (s->W(2) < s->W(idx))
  1571 + idx = 2;
  1572 + if (s->W(3) < s->W(idx))
  1573 + idx = 3;
  1574 + if (s->W(4) < s->W(idx))
  1575 + idx = 4;
  1576 + if (s->W(5) < s->W(idx))
  1577 + idx = 5;
  1578 + if (s->W(6) < s->W(idx))
  1579 + idx = 6;
  1580 + if (s->W(7) < s->W(idx))
  1581 + idx = 7;
  1582 +
  1583 + d->Q(1) = 0;
  1584 + d->L(1) = 0;
  1585 + d->W(1) = idx;
  1586 + d->W(0) = s->W(idx);
  1587 +}
  1588 +
  1589 +void glue(helper_roundps, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
  1590 +{
  1591 + signed char prev_rounding_mode;
  1592 +
  1593 + prev_rounding_mode = env->sse_status.float_rounding_mode;
  1594 + if (!(mode & (1 << 2)))
  1595 + switch (mode & 3) {
  1596 + case 0:
  1597 + set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
  1598 + break;
  1599 + case 1:
  1600 + set_float_rounding_mode(float_round_down, &env->sse_status);
  1601 + break;
  1602 + case 2:
  1603 + set_float_rounding_mode(float_round_up, &env->sse_status);
  1604 + break;
  1605 + case 3:
  1606 + set_float_rounding_mode(float_round_to_zero, &env->sse_status);
  1607 + break;
  1608 + }
  1609 +
  1610 + d->L(0) = float64_round_to_int(s->L(0), &env->sse_status);
  1611 + d->L(1) = float64_round_to_int(s->L(1), &env->sse_status);
  1612 + d->L(2) = float64_round_to_int(s->L(2), &env->sse_status);
  1613 + d->L(3) = float64_round_to_int(s->L(3), &env->sse_status);
  1614 +
  1615 +#if 0 /* TODO */
  1616 + if (mode & (1 << 3))
  1617 + set_float_exception_flags(
  1618 + get_float_exception_flags(&env->sse_status) &
  1619 + ~float_flag_inexact,
  1620 + &env->sse_status);
  1621 +#endif
  1622 + env->sse_status.float_rounding_mode = prev_rounding_mode;
  1623 +}
  1624 +
  1625 +void glue(helper_roundpd, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
  1626 +{
  1627 + signed char prev_rounding_mode;
  1628 +
  1629 + prev_rounding_mode = env->sse_status.float_rounding_mode;
  1630 + if (!(mode & (1 << 2)))
  1631 + switch (mode & 3) {
  1632 + case 0:
  1633 + set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
  1634 + break;
  1635 + case 1:
  1636 + set_float_rounding_mode(float_round_down, &env->sse_status);
  1637 + break;
  1638 + case 2:
  1639 + set_float_rounding_mode(float_round_up, &env->sse_status);
  1640 + break;
  1641 + case 3:
  1642 + set_float_rounding_mode(float_round_to_zero, &env->sse_status);
  1643 + break;
  1644 + }
  1645 +
  1646 + d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status);
  1647 + d->Q(1) = float64_round_to_int(s->Q(1), &env->sse_status);
  1648 +
  1649 +#if 0 /* TODO */
  1650 + if (mode & (1 << 3))
  1651 + set_float_exception_flags(
  1652 + get_float_exception_flags(&env->sse_status) &
  1653 + ~float_flag_inexact,
  1654 + &env->sse_status);
  1655 +#endif
  1656 + env->sse_status.float_rounding_mode = prev_rounding_mode;
  1657 +}
  1658 +
  1659 +void glue(helper_roundss, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
  1660 +{
  1661 + signed char prev_rounding_mode;
  1662 +
  1663 + prev_rounding_mode = env->sse_status.float_rounding_mode;
  1664 + if (!(mode & (1 << 2)))
  1665 + switch (mode & 3) {
  1666 + case 0:
  1667 + set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
  1668 + break;
  1669 + case 1:
  1670 + set_float_rounding_mode(float_round_down, &env->sse_status);
  1671 + break;
  1672 + case 2:
  1673 + set_float_rounding_mode(float_round_up, &env->sse_status);
  1674 + break;
  1675 + case 3:
  1676 + set_float_rounding_mode(float_round_to_zero, &env->sse_status);
  1677 + break;
  1678 + }
  1679 +
  1680 + d->L(0) = float64_round_to_int(s->L(0), &env->sse_status);
  1681 +
  1682 +#if 0 /* TODO */
  1683 + if (mode & (1 << 3))
  1684 + set_float_exception_flags(
  1685 + get_float_exception_flags(&env->sse_status) &
  1686 + ~float_flag_inexact,
  1687 + &env->sse_status);
  1688 +#endif
  1689 + env->sse_status.float_rounding_mode = prev_rounding_mode;
  1690 +}
  1691 +
  1692 +void glue(helper_roundsd, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
  1693 +{
  1694 + signed char prev_rounding_mode;
  1695 +
  1696 + prev_rounding_mode = env->sse_status.float_rounding_mode;
  1697 + if (!(mode & (1 << 2)))
  1698 + switch (mode & 3) {
  1699 + case 0:
  1700 + set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
  1701 + break;
  1702 + case 1:
  1703 + set_float_rounding_mode(float_round_down, &env->sse_status);
  1704 + break;
  1705 + case 2:
  1706 + set_float_rounding_mode(float_round_up, &env->sse_status);
  1707 + break;
  1708 + case 3:
  1709 + set_float_rounding_mode(float_round_to_zero, &env->sse_status);
  1710 + break;
  1711 + }
  1712 +
  1713 + d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status);
  1714 +
  1715 +#if 0 /* TODO */
  1716 + if (mode & (1 << 3))
  1717 + set_float_exception_flags(
  1718 + get_float_exception_flags(&env->sse_status) &
  1719 + ~float_flag_inexact,
  1720 + &env->sse_status);
  1721 +#endif
  1722 + env->sse_status.float_rounding_mode = prev_rounding_mode;
  1723 +}
  1724 +
  1725 +#define FBLENDP(d, s, m) m ? s : d
  1726 +SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
  1727 +SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
  1728 +SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
  1729 +
  1730 +void glue(helper_dpps, SUFFIX) (Reg *d, Reg *s, uint32_t mask)
  1731 +{
  1732 + float32 iresult = 0 /*float32_zero*/;
  1733 +
  1734 + if (mask & (1 << 4))
  1735 + iresult = float32_add(iresult,
  1736 + float32_mul(d->L(0), s->L(0), &env->sse_status),
  1737 + &env->sse_status);
  1738 + if (mask & (1 << 5))
  1739 + iresult = float32_add(iresult,
  1740 + float32_mul(d->L(1), s->L(1), &env->sse_status),
  1741 + &env->sse_status);
  1742 + if (mask & (1 << 6))
  1743 + iresult = float32_add(iresult,
  1744 + float32_mul(d->L(2), s->L(2), &env->sse_status),
  1745 + &env->sse_status);
  1746 + if (mask & (1 << 7))
  1747 + iresult = float32_add(iresult,
  1748 + float32_mul(d->L(3), s->L(3), &env->sse_status),
  1749 + &env->sse_status);
  1750 + d->L(0) = (mask & (1 << 0)) ? iresult : 0 /*float32_zero*/;
  1751 + d->L(1) = (mask & (1 << 1)) ? iresult : 0 /*float32_zero*/;
  1752 + d->L(2) = (mask & (1 << 2)) ? iresult : 0 /*float32_zero*/;
  1753 + d->L(3) = (mask & (1 << 3)) ? iresult : 0 /*float32_zero*/;
  1754 +}
  1755 +
  1756 +void glue(helper_dppd, SUFFIX) (Reg *d, Reg *s, uint32_t mask)
  1757 +{
  1758 + float64 iresult = 0 /*float64_zero*/;
  1759 +
  1760 + if (mask & (1 << 4))
  1761 + iresult = float64_add(iresult,
  1762 + float64_mul(d->Q(0), s->Q(0), &env->sse_status),
  1763 + &env->sse_status);
  1764 + if (mask & (1 << 5))
  1765 + iresult = float64_add(iresult,
  1766 + float64_mul(d->Q(1), s->Q(1), &env->sse_status),
  1767 + &env->sse_status);
  1768 + d->Q(0) = (mask & (1 << 0)) ? iresult : 0 /*float64_zero*/;
  1769 + d->Q(1) = (mask & (1 << 1)) ? iresult : 0 /*float64_zero*/;
  1770 +}
  1771 +
  1772 +void glue(helper_mpsadbw, SUFFIX) (Reg *d, Reg *s, uint32_t offset)
  1773 +{
  1774 + int s0 = (offset & 3) << 2;
  1775 + int d0 = (offset & 4) << 0;
  1776 + int i;
  1777 + Reg r;
  1778 +
  1779 + for (i = 0; i < 8; i++, d0++) {
  1780 + r.W(i) = 0;
  1781 + r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0));
  1782 + r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1));
  1783 + r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2));
  1784 + r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3));
  1785 + }
  1786 +
  1787 + *d = r;
  1788 +}
  1789 +
  1790 +/* SSE4.2 op helpers */
  1791 +/* it's unclear whether signed or unsigned */
  1792 +#define FCMPGTQ(d, s) d > s ? -1 : 0
  1793 +SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
  1794 +
  1795 +static inline int pcmp_elen(int reg, uint32_t ctrl)
  1796 +{
  1797 + int val;
  1798 +
  1799 + /* Presence of REX.W is indicated by a bit higher than 7 set */
  1800 + if (ctrl >> 8)
  1801 + val = abs1((int64_t) env->regs[reg]);
  1802 + else
  1803 + val = abs1((int32_t) env->regs[reg]);
  1804 +
  1805 + if (ctrl & 1) {
  1806 + if (val > 8)
  1807 + return 8;
  1808 + } else
  1809 + if (val > 16)
  1810 + return 16;
  1811 +
  1812 + return val;
  1813 +}
  1814 +
  1815 +static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
  1816 +{
  1817 + int val = 0;
  1818 +
  1819 + if (ctrl & 1) {
  1820 + while (val < 8 && r->W(val))
  1821 + val++;
  1822 + } else
  1823 + while (val < 16 && r->B(val))
  1824 + val++;
  1825 +
  1826 + return val;
  1827 +}
  1828 +
  1829 +static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
  1830 +{
  1831 + switch ((ctrl >> 0) & 3) {
  1832 + case 0:
  1833 + return r->B(i);
  1834 + case 1:
  1835 + return r->W(i);
  1836 + case 2:
  1837 + return (int8_t) r->B(i);
  1838 + case 3:
  1839 + default:
  1840 + return (int16_t) r->W(i);
  1841 + }
  1842 +}
  1843 +
/* Core of the PCMP[EI]STR* family: aggregate the valid elements of d
 * and s according to ctrl[3:2], post-process the mask per ctrl[5:4]
 * (polarity), and set ZF/SF/CF/OF in CC_SRC.  'valids'/'validd' are
 * the element counts of s and d; the return value is the result bit
 * mask, with bit i corresponding to element i.
 */
static inline unsigned pcmpxstrx(Reg *d, Reg *s,
                int8_t ctrl, int valids, int validd)
{
    unsigned int res = 0;
    int v;
    int j, i;
    int upper = (ctrl & 1) ? 7 : 15;   /* index of the last element */

    /* convert the counts to last-valid-element indices */
    valids--;
    validd--;

    /* ZF: s shorter than the register; SF: d shorter than the register */
    CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);

    switch ((ctrl >> 2) & 3) {
    case 0:
        /* "equal any": bit j set if s[j] equals any valid d element */
        for (j = valids; j >= 0; j--) {
            res <<= 1;
            v = pcmp_val(s, ctrl, j);
            for (i = validd; i >= 0; i--)
                res |= (v == pcmp_val(d, ctrl, i));
        }
        break;
    case 1:
        /* "ranges": d holds (low, high) element pairs; bit j set if
           s[j] falls inside any pair */
        for (j = valids; j >= 0; j--) {
            res <<= 1;
            v = pcmp_val(s, ctrl, j);
            for (i = ((validd - 1) | 1); i >= 0; i -= 2)
                res |= (pcmp_val(d, ctrl, i - 0) <= v &&
                                pcmp_val(d, ctrl, i - 1) >= v);
        }
        break;
    case 2:
        /* "equal each": element-wise compare; the two seeding lines
           presumably implement Intel's rules for lanes past the end of
           either string (equal beyond both, unequal beyond one) —
           confirm against the SDM */
        res = (2 << (upper - MAX(valids, validd))) - 1;
        res <<= MAX(valids, validd) - MIN(valids, validd);
        for (i = MIN(valids, validd); i >= 0; i--) {
            res <<= 1;
            v = pcmp_val(s, ctrl, i);
            res |= (v == pcmp_val(d, ctrl, i));
        }
        break;
    case 3:
        /* "equal ordered": bit j set if d occurs in s at offset j */
        for (j = valids - validd; j >= 0; j--) {
            res <<= 1;
            res |= 1;
            for (i = MIN(upper - j, validd); i >= 0; i--)
                res &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
        }
        break;
    }

    /* polarity: 1 negates all lanes, 3 negates only the valid s lanes */
    switch ((ctrl >> 4) & 3) {
    case 1:
        res ^= (2 << upper) - 1;
        break;
    case 3:
        res ^= (2 << valids) - 1;
        break;
    }

    /* CF: any bit set; OF: bit 0 of the result */
    if (res)
        CC_SRC |= CC_C;
    if (res & 1)
        CC_SRC |= CC_O;

    return res;
}
  1910 +
/* Return 1 + the index of the most significant set bit of val
   ("reverse find first set", 1-based); returns 1 for val == 0. */
static inline int rffs1(unsigned int val)
{
    int pos = 1;
    int shift;

    for (shift = sizeof(val) * 4; shift != 0; shift /= 2) {
        if (val >> shift) {
            val >>= shift;
            pos += shift;
        }
    }

    return pos;
}
  1923 +
/* Return 1 + the index of the least significant set bit of val, like
 * the C library ffs().  Fix: the previous version shifted val *left*
 * and accumulated on a non-zero result, which computed
 * 32 - lsb_index instead of lsb_index + 1, so PCMP[EI]STRI in
 * "least significant index" mode put the wrong index in ECX.  Here we
 * add 'hi' only when the low 'hi' bits are all clear.  Callers only
 * use the result for val != 0.
 */
static inline int ffs1(unsigned int val)
{
    int ret = 1, hi;

    for (hi = sizeof(val) * 4; hi; hi /= 2)
        if (!(val & ((1u << hi) - 1))) {
            val >>= hi;
            ret += hi;
        }

    return ret;
}
  1936 +
/* PCMPESTRI: explicit-length string compare, result index in ECX. */
void glue(helper_pcmpestri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
{
    unsigned int res = pcmpxstrx(d, s, ctrl,
                    pcmp_elen(R_EDX, ctrl),
                    pcmp_elen(R_EAX, ctrl));

    /* ctrl bit 6 selects the most- vs least-significant set bit index;
       with no match ECX gets the element count (16 bytes or 8 words) */
    if (res)
        env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1;
    else
        env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
}
  1948 +
  1949 +void glue(helper_pcmpestrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
  1950 +{
  1951 + int i;
  1952 + unsigned int res = pcmpxstrx(d, s, ctrl,
  1953 + pcmp_elen(R_EDX, ctrl),
  1954 + pcmp_elen(R_EAX, ctrl));
  1955 +
  1956 + if ((ctrl >> 6) & 1) {
  1957 + if (ctrl & 1)
  1958 + for (i = 0; i <= 8; i--, res >>= 1)
  1959 + d->W(i) = (res & 1) ? ~0 : 0;
  1960 + else
  1961 + for (i = 0; i <= 16; i--, res >>= 1)
  1962 + d->B(i) = (res & 1) ? ~0 : 0;
  1963 + } else {
  1964 + d->Q(1) = 0;
  1965 + d->Q(0) = res;
  1966 + }
  1967 +}
  1968 +
/* PCMPISTRI: implicit-length (zero-terminated) string compare, result
   index in ECX. */
void glue(helper_pcmpistri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
{
    unsigned int res = pcmpxstrx(d, s, ctrl,
                    pcmp_ilen(s, ctrl),
                    pcmp_ilen(d, ctrl));

    /* ctrl bit 6 selects the most- vs least-significant set bit index;
       with no match ECX gets the element count (16 bytes or 8 words) */
    if (res)
        env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1;
    else
        env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
}
  1980 +
  1981 +void glue(helper_pcmpistrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
  1982 +{
  1983 + int i;
  1984 + unsigned int res = pcmpxstrx(d, s, ctrl,
  1985 + pcmp_ilen(s, ctrl),
  1986 + pcmp_ilen(d, ctrl));
  1987 +
  1988 + if ((ctrl >> 6) & 1) {
  1989 + if (ctrl & 1)
  1990 + for (i = 0; i <= 8; i--, res >>= 1)
  1991 + d->W(i) = (res & 1) ? ~0 : 0;
  1992 + else
  1993 + for (i = 0; i <= 16; i--, res >>= 1)
  1994 + d->B(i) = (res & 1) ? ~0 : 0;
  1995 + } else {
  1996 + d->Q(1) = 0;
  1997 + d->Q(0) = res;
  1998 + }
  1999 +}
  2000 +
#define CRCPOLY 0x1edc6f41
#define CRCPOLY_BITREV 0x82f63b78
/* CRC32 (SSE4.2): fold 'len' bits of 'msg' into crc1 using the
 * bit-reflected CRC-32C polynomial.
 * NOTE(review): assumes 0 < len <= TARGET_LONG_BITS — len == 0 would
 * shift by TARGET_LONG_BITS, which is undefined behavior; confirm the
 * translator always passes the operand size in bits.
 */
target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
{
    /* keep only the low 'len' message bits before mixing with the crc */
    target_ulong crc = (msg & ((target_ulong) -1 >>
                    (TARGET_LONG_BITS - len))) ^ crc1;

    /* standard reflected bit-serial CRC loop */
    while (len--)
        crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);

    return crc;
}
  2013 +
  2014 +#define POPMASK(i) ((target_ulong) -1 / ((1LL << (1 << i)) + 1))
  2015 +#define POPCOUNT(n, i) (n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i))
  2016 +target_ulong helper_popcnt(target_ulong n, uint32_t type)
  2017 +{
  2018 + CC_SRC = n ? 0 : CC_Z;
  2019 +
  2020 + n = POPCOUNT(n, 0);
  2021 + n = POPCOUNT(n, 1);
  2022 + n = POPCOUNT(n, 2);
  2023 + n = POPCOUNT(n, 3);
  2024 + if (type == 1)
  2025 + return n & 0xff;
  2026 +
  2027 + n = POPCOUNT(n, 4);
  2028 +#ifndef TARGET_X86_64
  2029 + return n;
  2030 +#else
  2031 + if (type == 2)
  2032 + return n & 0xff;
  2033 +
  2034 + return POPCOUNT(n, 5);
  2035 +#endif
  2036 +}
  2037 +#endif
  2038 +
1423 2039 #undef SHIFT
1424 2040 #undef XMM_ONLY
1425 2041 #undef Reg
... ...
target-i386/ops_sse_header.h
1 1 /*
2   - * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support
  2 + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
3 3 *
4 4 * Copyright (c) 2005 Fabrice Bellard
5 5 *
... ... @@ -269,6 +269,61 @@ DEF_HELPER(void, glue(helper_psignw, SUFFIX), (Reg *d, Reg *s))
269 269 DEF_HELPER(void, glue(helper_psignd, SUFFIX), (Reg *d, Reg *s))
270 270 DEF_HELPER(void, glue(helper_palignr, SUFFIX), (Reg *d, Reg *s, int32_t shift))
271 271  
  272 +/* SSE4.1 op helpers */
  273 +#if SHIFT == 1
  274 +DEF_HELPER(void, glue(helper_pblendvb, SUFFIX), (Reg *d, Reg *s))
  275 +DEF_HELPER(void, glue(helper_blendvps, SUFFIX), (Reg *d, Reg *s))
  276 +DEF_HELPER(void, glue(helper_blendvpd, SUFFIX), (Reg *d, Reg *s))
  277 +DEF_HELPER(void, glue(helper_ptest, SUFFIX), (Reg *d, Reg *s))
  278 +DEF_HELPER(void, glue(helper_pmovsxbw, SUFFIX), (Reg *d, Reg *s))
  279 +DEF_HELPER(void, glue(helper_pmovsxbd, SUFFIX), (Reg *d, Reg *s))
  280 +DEF_HELPER(void, glue(helper_pmovsxbq, SUFFIX), (Reg *d, Reg *s))
  281 +DEF_HELPER(void, glue(helper_pmovsxwd, SUFFIX), (Reg *d, Reg *s))
  282 +DEF_HELPER(void, glue(helper_pmovsxwq, SUFFIX), (Reg *d, Reg *s))
  283 +DEF_HELPER(void, glue(helper_pmovsxdq, SUFFIX), (Reg *d, Reg *s))
  284 +DEF_HELPER(void, glue(helper_pmovzxbw, SUFFIX), (Reg *d, Reg *s))
  285 +DEF_HELPER(void, glue(helper_pmovzxbd, SUFFIX), (Reg *d, Reg *s))
  286 +DEF_HELPER(void, glue(helper_pmovzxbq, SUFFIX), (Reg *d, Reg *s))
  287 +DEF_HELPER(void, glue(helper_pmovzxwd, SUFFIX), (Reg *d, Reg *s))
  288 +DEF_HELPER(void, glue(helper_pmovzxwq, SUFFIX), (Reg *d, Reg *s))
  289 +DEF_HELPER(void, glue(helper_pmovzxdq, SUFFIX), (Reg *d, Reg *s))
  290 +DEF_HELPER(void, glue(helper_pmuldq, SUFFIX), (Reg *d, Reg *s))
  291 +DEF_HELPER(void, glue(helper_pcmpeqq, SUFFIX), (Reg *d, Reg *s))
  292 +DEF_HELPER(void, glue(helper_packusdw, SUFFIX), (Reg *d, Reg *s))
  293 +DEF_HELPER(void, glue(helper_pminsb, SUFFIX), (Reg *d, Reg *s))
  294 +DEF_HELPER(void, glue(helper_pminsd, SUFFIX), (Reg *d, Reg *s))
  295 +DEF_HELPER(void, glue(helper_pminuw, SUFFIX), (Reg *d, Reg *s))
  296 +DEF_HELPER(void, glue(helper_pminud, SUFFIX), (Reg *d, Reg *s))
  297 +DEF_HELPER(void, glue(helper_pmaxsb, SUFFIX), (Reg *d, Reg *s))
  298 +DEF_HELPER(void, glue(helper_pmaxsd, SUFFIX), (Reg *d, Reg *s))
  299 +DEF_HELPER(void, glue(helper_pmaxuw, SUFFIX), (Reg *d, Reg *s))
  300 +DEF_HELPER(void, glue(helper_pmaxud, SUFFIX), (Reg *d, Reg *s))
  301 +DEF_HELPER(void, glue(helper_pmulld, SUFFIX), (Reg *d, Reg *s))
  302 +DEF_HELPER(void, glue(helper_phminposuw, SUFFIX), (Reg *d, Reg *s))
  303 +DEF_HELPER(void, glue(helper_roundps, SUFFIX), (Reg *d, Reg *s, uint32_t mode))
  304 +DEF_HELPER(void, glue(helper_roundpd, SUFFIX), (Reg *d, Reg *s, uint32_t mode))
  305 +DEF_HELPER(void, glue(helper_roundss, SUFFIX), (Reg *d, Reg *s, uint32_t mode))
  306 +DEF_HELPER(void, glue(helper_roundsd, SUFFIX), (Reg *d, Reg *s, uint32_t mode))
  307 +DEF_HELPER(void, glue(helper_blendps, SUFFIX), (Reg *d, Reg *s, uint32_t imm))
  308 +DEF_HELPER(void, glue(helper_blendpd, SUFFIX), (Reg *d, Reg *s, uint32_t imm))
  309 +DEF_HELPER(void, glue(helper_pblendw, SUFFIX), (Reg *d, Reg *s, uint32_t imm))
  310 +DEF_HELPER(void, glue(helper_dpps, SUFFIX), (Reg *d, Reg *s, uint32_t mask))
  311 +DEF_HELPER(void, glue(helper_dppd, SUFFIX), (Reg *d, Reg *s, uint32_t mask))
  312 +DEF_HELPER(void, glue(helper_mpsadbw, SUFFIX), (Reg *d, Reg *s, uint32_t off))
  313 +#endif
  314 +
  315 +/* SSE4.2 op helpers */
  316 +#if SHIFT == 1
  317 +DEF_HELPER(void, glue(helper_pcmpgtq, SUFFIX), (Reg *d, Reg *s))
  318 +DEF_HELPER(void, glue(helper_pcmpestri, SUFFIX), (Reg *d, Reg *s, uint32_t ctl))
  319 +DEF_HELPER(void, glue(helper_pcmpestrm, SUFFIX), (Reg *d, Reg *s, uint32_t ctl))
  320 +DEF_HELPER(void, glue(helper_pcmpistri, SUFFIX), (Reg *d, Reg *s, uint32_t ctl))
  321 +DEF_HELPER(void, glue(helper_pcmpistrm, SUFFIX), (Reg *d, Reg *s, uint32_t ctl))
  322 +DEF_HELPER(target_ulong, helper_crc32,
  323 + (uint32_t crc1, target_ulong msg, uint32_t len))
  324 +DEF_HELPER(target_ulong, helper_popcnt, (target_ulong n, uint32_t type))
  325 +#endif
  326 +
272 327 #undef SHIFT
273 328 #undef Reg
274 329 #undef SUFFIX
... ...
target-i386/translate.c
... ... @@ -2140,7 +2140,7 @@ static void gen_add_A0_ds_seg(DisasContext *s)
2140 2140 }
2141 2141 }
2142 2142  
2143   -/* generate modrm memory load or store of 'reg'. TMP0 is used if reg !=
  2143 +/* generate modrm memory load or store of 'reg'. TMP0 is used if reg ==
2144 2144 OR_TMP0 */
2145 2145 static void gen_ldst_modrm(DisasContext *s, int modrm, int ot, int reg, int is_store)
2146 2146 {
... ... @@ -2770,8 +2770,8 @@ static void *sse_op_table1[256][4] = {
2770 2770 [0xc2] = SSE_FOP(cmpeq),
2771 2771 [0xc6] = { helper_shufps, helper_shufpd },
2772 2772  
2773   - [0x38] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3 */
2774   - [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3 */
  2773 + [0x38] = { SSE_SPECIAL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* SSSE3/SSE4 */
  2774 + [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3/SSE4 */
2775 2775  
2776 2776 /* MMX ops and their SSE extensions */
2777 2777 [0x60] = MMX_OP2(punpcklbw),
... ... @@ -2924,26 +2924,85 @@ static void *sse_op_table5[256] = {
2924 2924 [0xbf] = helper_pavgb_mmx /* pavgusb */
2925 2925 };
2926 2926  
2927   -static void *sse_op_table6[256][2] = {
2928   - [0x00] = MMX_OP2(pshufb),
2929   - [0x01] = MMX_OP2(phaddw),
2930   - [0x02] = MMX_OP2(phaddd),
2931   - [0x03] = MMX_OP2(phaddsw),
2932   - [0x04] = MMX_OP2(pmaddubsw),
2933   - [0x05] = MMX_OP2(phsubw),
2934   - [0x06] = MMX_OP2(phsubd),
2935   - [0x07] = MMX_OP2(phsubsw),
2936   - [0x08] = MMX_OP2(psignb),
2937   - [0x09] = MMX_OP2(psignw),
2938   - [0x0a] = MMX_OP2(psignd),
2939   - [0x0b] = MMX_OP2(pmulhrsw),
2940   - [0x1c] = MMX_OP2(pabsb),
2941   - [0x1d] = MMX_OP2(pabsw),
2942   - [0x1e] = MMX_OP2(pabsd),
  2927 +struct sse_op_helper_s {
  2928 + void *op[2]; uint32_t ext_mask;
  2929 +};
  2930 +#define SSSE3_OP(x) { MMX_OP2(x), CPUID_EXT_SSSE3 }
  2931 +#define SSE41_OP(x) { { NULL, helper_ ## x ## _xmm }, CPUID_EXT_SSE41 }
  2932 +#define SSE42_OP(x) { { NULL, helper_ ## x ## _xmm }, CPUID_EXT_SSE42 }
  2933 +#define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 }
  2934 +static struct sse_op_helper_s sse_op_table6[256] = {
  2935 + [0x00] = SSSE3_OP(pshufb),
  2936 + [0x01] = SSSE3_OP(phaddw),
  2937 + [0x02] = SSSE3_OP(phaddd),
  2938 + [0x03] = SSSE3_OP(phaddsw),
  2939 + [0x04] = SSSE3_OP(pmaddubsw),
  2940 + [0x05] = SSSE3_OP(phsubw),
  2941 + [0x06] = SSSE3_OP(phsubd),
  2942 + [0x07] = SSSE3_OP(phsubsw),
  2943 + [0x08] = SSSE3_OP(psignb),
  2944 + [0x09] = SSSE3_OP(psignw),
  2945 + [0x0a] = SSSE3_OP(psignd),
  2946 + [0x0b] = SSSE3_OP(pmulhrsw),
  2947 + [0x10] = SSE41_OP(pblendvb),
  2948 + [0x14] = SSE41_OP(blendvps),
  2949 + [0x15] = SSE41_OP(blendvpd),
  2950 + [0x17] = SSE41_OP(ptest),
  2951 + [0x1c] = SSSE3_OP(pabsb),
  2952 + [0x1d] = SSSE3_OP(pabsw),
  2953 + [0x1e] = SSSE3_OP(pabsd),
  2954 + [0x20] = SSE41_OP(pmovsxbw),
  2955 + [0x21] = SSE41_OP(pmovsxbd),
  2956 + [0x22] = SSE41_OP(pmovsxbq),
  2957 + [0x23] = SSE41_OP(pmovsxwd),
  2958 + [0x24] = SSE41_OP(pmovsxwq),
  2959 + [0x25] = SSE41_OP(pmovsxdq),
  2960 + [0x28] = SSE41_OP(pmuldq),
  2961 + [0x29] = SSE41_OP(pcmpeqq),
  2962 + [0x2a] = SSE41_SPECIAL, /* movntqda */
  2963 + [0x2b] = SSE41_OP(packusdw),
  2964 + [0x30] = SSE41_OP(pmovzxbw),
  2965 + [0x31] = SSE41_OP(pmovzxbd),
  2966 + [0x32] = SSE41_OP(pmovzxbq),
  2967 + [0x33] = SSE41_OP(pmovzxwd),
  2968 + [0x34] = SSE41_OP(pmovzxwq),
  2969 + [0x35] = SSE41_OP(pmovzxdq),
  2970 + [0x37] = SSE42_OP(pcmpgtq),
  2971 + [0x38] = SSE41_OP(pminsb),
  2972 + [0x39] = SSE41_OP(pminsd),
  2973 + [0x3a] = SSE41_OP(pminuw),
  2974 + [0x3b] = SSE41_OP(pminud),
  2975 + [0x3c] = SSE41_OP(pmaxsb),
  2976 + [0x3d] = SSE41_OP(pmaxsd),
  2977 + [0x3e] = SSE41_OP(pmaxuw),
  2978 + [0x3f] = SSE41_OP(pmaxud),
  2979 + [0x40] = SSE41_OP(pmulld),
  2980 + [0x41] = SSE41_OP(phminposuw),
2943 2981 };
2944 2982  
2945   -static void *sse_op_table7[256][2] = {
2946   - [0x0f] = MMX_OP2(palignr),
  2983 +static struct sse_op_helper_s sse_op_table7[256] = {
  2984 + [0x08] = SSE41_OP(roundps),
  2985 + [0x09] = SSE41_OP(roundpd),
  2986 + [0x0a] = SSE41_OP(roundss),
  2987 + [0x0b] = SSE41_OP(roundsd),
  2988 + [0x0c] = SSE41_OP(blendps),
  2989 + [0x0d] = SSE41_OP(blendpd),
  2990 + [0x0e] = SSE41_OP(pblendw),
  2991 + [0x0f] = SSSE3_OP(palignr),
  2992 + [0x14] = SSE41_SPECIAL, /* pextrb */
  2993 + [0x15] = SSE41_SPECIAL, /* pextrw */
  2994 + [0x16] = SSE41_SPECIAL, /* pextrd/pextrq */
  2995 + [0x17] = SSE41_SPECIAL, /* extractps */
  2996 + [0x20] = SSE41_SPECIAL, /* pinsrb */
  2997 + [0x21] = SSE41_SPECIAL, /* insertps */
  2998 + [0x22] = SSE41_SPECIAL, /* pinsrd/pinsrq */
  2999 + [0x40] = SSE41_OP(dpps),
  3000 + [0x41] = SSE41_OP(dppd),
  3001 + [0x42] = SSE41_OP(mpsadbw),
  3002 + [0x60] = SSE42_OP(pcmpestrm),
  3003 + [0x61] = SSE42_OP(pcmpestri),
  3004 + [0x62] = SSE42_OP(pcmpistrm),
  3005 + [0x63] = SSE42_OP(pcmpistri),
2947 3006 };
2948 3007  
2949 3008 static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
... ... @@ -3511,18 +3570,20 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
3511 3570 break;
3512 3571 case 0x038:
3513 3572 case 0x138:
3514   - if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3))
3515   - goto illegal_op;
3516   -
3517 3573 b = modrm;
3518 3574 modrm = ldub_code(s->pc++);
3519 3575 rm = modrm & 7;
3520 3576 reg = ((modrm >> 3) & 7) | rex_r;
3521 3577 mod = (modrm >> 6) & 3;
3522 3578  
3523   - sse_op2 = sse_op_table6[b][b1];
  3579 + if (s->prefix & PREFIX_REPNZ)
  3580 + goto crc32;
  3581 +
  3582 + sse_op2 = sse_op_table6[b].op[b1];
3524 3583 if (!sse_op2)
3525 3584 goto illegal_op;
  3585 + if (!(s->cpuid_ext_features & sse_op_table6[b].ext_mask))
  3586 + goto illegal_op;
3526 3587  
3527 3588 if (b1) {
3528 3589 op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
... ... @@ -3531,7 +3592,32 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
3531 3592 } else {
3532 3593 op2_offset = offsetof(CPUX86State,xmm_t0);
3533 3594 gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
3534   - gen_ldo_env_A0(s->mem_index, op2_offset);
  3595 + switch (b) {
  3596 + case 0x20: case 0x30: /* pmovsxbw, pmovzxbw */
  3597 + case 0x23: case 0x33: /* pmovsxwd, pmovzxwd */
  3598 + case 0x25: case 0x35: /* pmovsxdq, pmovzxdq */
  3599 + gen_ldq_env_A0(s->mem_index, op2_offset +
  3600 + offsetof(XMMReg, XMM_Q(0)));
  3601 + break;
  3602 + case 0x21: case 0x31: /* pmovsxbd, pmovzxbd */
  3603 + case 0x24: case 0x34: /* pmovsxwq, pmovzxwq */
  3604 + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0,
  3605 + (s->mem_index >> 2) - 1);
  3606 + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, op2_offset +
  3607 + offsetof(XMMReg, XMM_L(0)));
  3608 + break;
  3609 + case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */
  3610 + tcg_gen_qemu_ld16u(cpu_tmp0, cpu_A0,
  3611 + (s->mem_index >> 2) - 1);
  3612 + tcg_gen_st16_tl(cpu_tmp0, cpu_env, op2_offset +
  3613 + offsetof(XMMReg, XMM_W(0)));
  3614 + break;
  3615 + case 0x2a: /* movntqda */
  3616 + gen_ldo_env_A0(s->mem_index, op1_offset);
  3617 + return;
  3618 + default:
  3619 + gen_ldo_env_A0(s->mem_index, op2_offset);
  3620 + }
3535 3621 }
3536 3622 } else {
3537 3623 op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
... ... @@ -3543,24 +3629,177 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
3543 3629 gen_ldq_env_A0(s->mem_index, op2_offset);
3544 3630 }
3545 3631 }
  3632 + if (sse_op2 == SSE_SPECIAL)
  3633 + goto illegal_op;
  3634 +
3546 3635 tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
3547 3636 tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
3548 3637 tcg_gen_helper_0_2(sse_op2, cpu_ptr0, cpu_ptr1);
  3638 +
  3639 + if (b == 0x17)
  3640 + s->cc_op = CC_OP_EFLAGS;
3549 3641 break;
3550   - case 0x03a:
3551   - case 0x13a:
3552   - if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3))
  3642 + case 0x338: /* crc32 */
  3643 + crc32:
  3644 + b = modrm;
  3645 + modrm = ldub_code(s->pc++);
  3646 + reg = ((modrm >> 3) & 7) | rex_r;
  3647 +
  3648 + if (b != 0xf0 && b != 0xf1)
  3649 + goto illegal_op;
  3650 + if (!(s->cpuid_ext_features & CPUID_EXT_SSE42))
3553 3651 goto illegal_op;
3554 3652  
  3653 + if (b == 0xf0)
  3654 + ot = OT_BYTE;
  3655 + else if (b == 0xf1 && s->dflag != 2)
  3656 + if (s->prefix & PREFIX_DATA)
  3657 + ot = OT_WORD;
  3658 + else
  3659 + ot = OT_LONG;
  3660 + else
  3661 + ot = OT_QUAD;
  3662 +
  3663 + gen_op_mov_TN_reg(OT_LONG, 0, reg);
  3664 + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
  3665 + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
  3666 + tcg_gen_helper_1_3(helper_crc32, cpu_T[0], cpu_tmp2_i32,
  3667 + cpu_T[0], tcg_const_i32(8 << ot));
  3668 +
  3669 + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
  3670 + gen_op_mov_reg_T0(ot, reg);
  3671 + break;
  3672 + case 0x03a:
  3673 + case 0x13a:
3555 3674 b = modrm;
3556 3675 modrm = ldub_code(s->pc++);
3557 3676 rm = modrm & 7;
3558 3677 reg = ((modrm >> 3) & 7) | rex_r;
3559 3678 mod = (modrm >> 6) & 3;
3560 3679  
3561   - sse_op2 = sse_op_table7[b][b1];
  3680 + sse_op2 = sse_op_table7[b].op[b1];
3562 3681 if (!sse_op2)
3563 3682 goto illegal_op;
  3683 + if (!(s->cpuid_ext_features & sse_op_table7[b].ext_mask))
  3684 + goto illegal_op;
  3685 +
  3686 + if (sse_op2 == SSE_SPECIAL) {
  3687 + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
  3688 + rm = (modrm & 7) | REX_B(s);
  3689 + if (mod != 3)
  3690 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  3691 + reg = ((modrm >> 3) & 7) | rex_r;
  3692 + val = ldub_code(s->pc++);
  3693 + switch (b) {
  3694 + case 0x14: /* pextrb */
  3695 + tcg_gen_ld8u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
  3696 + xmm_regs[reg].XMM_B(val & 15)));
  3697 + if (mod == 3)
  3698 + gen_op_mov_reg_T0(ot, rm);
  3699 + else
  3700 + tcg_gen_qemu_st8(cpu_T[0], cpu_A0,
  3701 + (s->mem_index >> 2) - 1);
  3702 + break;
  3703 + case 0x15: /* pextrw */
  3704 + tcg_gen_ld16u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
  3705 + xmm_regs[reg].XMM_W(val & 7)));
  3706 + if (mod == 3)
  3707 + gen_op_mov_reg_T0(ot, rm);
  3708 + else
  3709 + tcg_gen_qemu_st16(cpu_T[0], cpu_A0,
  3710 + (s->mem_index >> 2) - 1);
  3711 + break;
  3712 + case 0x16:
  3713 + if (ot == OT_LONG) { /* pextrd */
  3714 + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env,
  3715 + offsetof(CPUX86State,
  3716 + xmm_regs[reg].XMM_L(val & 3)));
  3717 + if (mod == 3)
  3718 + gen_op_mov_reg_v(ot, rm, cpu_tmp2_i32);
  3719 + else
  3720 + tcg_gen_qemu_st32(cpu_tmp2_i32, cpu_A0,
  3721 + (s->mem_index >> 2) - 1);
  3722 + } else { /* pextrq */
  3723 + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env,
  3724 + offsetof(CPUX86State,
  3725 + xmm_regs[reg].XMM_Q(val & 1)));
  3726 + if (mod == 3)
  3727 + gen_op_mov_reg_v(ot, rm, cpu_tmp1_i64);
  3728 + else
  3729 + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0,
  3730 + (s->mem_index >> 2) - 1);
  3731 + }
  3732 + break;
  3733 + case 0x17: /* extractps */
  3734 + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
  3735 + xmm_regs[reg].XMM_L(val & 3)));
  3736 + if (mod == 3)
  3737 + gen_op_mov_reg_T0(ot, rm);
  3738 + else
  3739 + tcg_gen_qemu_st32(cpu_T[0], cpu_A0,
  3740 + (s->mem_index >> 2) - 1);
  3741 + break;
  3742 + case 0x20: /* pinsrb */
  3743 + if (mod == 3)
  3744 + gen_op_mov_TN_reg(OT_LONG, 0, rm);
  3745 + else
  3746 + tcg_gen_qemu_ld8u(cpu_T[0], cpu_A0,
  3747 + (s->mem_index >> 2) - 1);
  3748 + tcg_gen_st8_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
  3749 + xmm_regs[reg].XMM_B(val & 15)));
  3750 + break;
  3751 + case 0x21: /* insertps */
  3752 + if (mod == 3)
  3753 + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env,
  3754 + offsetof(CPUX86State,xmm_regs[rm]
  3755 + .XMM_L((val >> 6) & 3)));
  3756 + else
  3757 + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0,
  3758 + (s->mem_index >> 2) - 1);
  3759 + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env,
  3760 + offsetof(CPUX86State,xmm_regs[reg]
  3761 + .XMM_L((val >> 4) & 3)));
  3762 + if ((val >> 0) & 1)
  3763 + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/),
  3764 + cpu_env, offsetof(CPUX86State,
  3765 + xmm_regs[reg].XMM_L(0)));
  3766 + if ((val >> 1) & 1)
  3767 + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/),
  3768 + cpu_env, offsetof(CPUX86State,
  3769 + xmm_regs[reg].XMM_L(1)));
  3770 + if ((val >> 2) & 1)
  3771 + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/),
  3772 + cpu_env, offsetof(CPUX86State,
  3773 + xmm_regs[reg].XMM_L(2)));
  3774 + if ((val >> 3) & 1)
  3775 + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/),
  3776 + cpu_env, offsetof(CPUX86State,
  3777 + xmm_regs[reg].XMM_L(3)));
  3778 + break;
  3779 + case 0x22:
  3780 + if (ot == OT_LONG) { /* pinsrd */
  3781 + if (mod == 3)
  3782 + gen_op_mov_v_reg(ot, cpu_tmp2_i32, rm);
  3783 + else
  3784 + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0,
  3785 + (s->mem_index >> 2) - 1);
  3786 + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env,
  3787 + offsetof(CPUX86State,
  3788 + xmm_regs[reg].XMM_L(val & 3)));
  3789 + } else { /* pinsrq */
  3790 + if (mod == 3)
  3791 + gen_op_mov_v_reg(ot, cpu_tmp1_i64, rm);
  3792 + else
  3793 + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0,
  3794 + (s->mem_index >> 2) - 1);
  3795 + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env,
  3796 + offsetof(CPUX86State,
  3797 + xmm_regs[reg].XMM_Q(val & 1)));
  3798 + }
  3799 + break;
  3800 + }
  3801 + return;
  3802 + }
3564 3803  
3565 3804 if (b1) {
3566 3805 op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
... ... @@ -3583,6 +3822,14 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
3583 3822 }
3584 3823 val = ldub_code(s->pc++);
3585 3824  
  3825 + if ((b & 0xfc) == 0x60) { /* pcmpXstrX */
  3826 + s->cc_op = CC_OP_EFLAGS;
  3827 +
  3828 + if (s->dflag == 2)
  3829 + /* The helper must use entire 64-bit gp registers */
  3830 + val |= 1 << 8;
  3831 + }
  3832 +
3586 3833 tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
3587 3834 tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
3588 3835 tcg_gen_helper_0_3(sse_op2, cpu_ptr0, cpu_ptr1, tcg_const_i32(val));
... ... @@ -7094,7 +7341,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
7094 7341 gen_eob(s);
7095 7342 }
7096 7343 break;
7097   - /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3 support */
  7344 + /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4 support */
7098 7345 case 0x1c3: /* MOVNTI reg, mem */
7099 7346 if (!(s->cpuid_features & CPUID_SSE2))
7100 7347 goto illegal_op;
... ... @@ -7202,6 +7449,28 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
7202 7449 tcg_gen_helper_0_0(helper_rsm);
7203 7450 gen_eob(s);
7204 7451 break;
  7452 + case 0x1b8: /* SSE4.2 popcnt */
  7453 + if ((prefixes & (PREFIX_REPZ | PREFIX_LOCK | PREFIX_REPNZ)) !=
  7454 + PREFIX_REPZ)
  7455 + goto illegal_op;
  7456 + if (!(s->cpuid_ext_features & CPUID_EXT_POPCNT))
  7457 + goto illegal_op;
  7458 +
  7459 + modrm = ldub_code(s->pc++);
  7460 + reg = ((modrm >> 3) & 7);
  7461 +
  7462 + if (s->prefix & PREFIX_DATA)
  7463 + ot = OT_WORD;
  7464 + else if (s->dflag != 2)
  7465 + ot = OT_LONG;
  7466 + else
  7467 + ot = OT_QUAD;
  7468 +
  7469 + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
  7470 + tcg_gen_helper_1_2(helper_popcnt,
  7471 + cpu_T[0], cpu_T[0], tcg_const_i32(ot));
  7472 + gen_op_mov_reg_T0(ot, reg);
  7473 + break;
7205 7474 case 0x10e ... 0x10f:
7206 7475 /* 3DNow! instructions, ignore prefixes */
7207 7476 s->prefix &= ~(PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA);
... ...
tests/test-i386-ssse3.c
1 1 /* See if various MMX/SSE SSSE3 instructions give expected results */
2 2 #include <stdio.h>
3 3 #include <string.h>
  4 +#include <stdint.h>
4 5  
5 6 int main(int argc, char *argv[]) {
6 7 char hello[16];
... ... @@ -9,9 +10,11 @@ int main(int argc, char *argv[]) {
9 10  
10 11 uint64_t a = 0x0000000000090007;
11 12 uint64_t b = 0x0000000000000000;
  13 + uint32_t c;
  14 + uint16_t d;
12 15  
13   - const char c[16] = "LLOaaaaaaaaaaaaa";
14   - const char d[16] = "aaaaaaaaaaaaaaHE";
  16 + const char e[16] = "LLOaaaaaaaaaaaaa";
  17 + const char f[16] = "aaaaaaaaaaaaaaHE";
15 18  
16 19 /* pshufb mm1/xmm1, mm2/xmm2 */
17 20 asm volatile ("movq (%0), %%mm0" : : "r" (ehlo) : "mm0", "mm1");
... ... @@ -33,10 +36,22 @@ int main(int argc, char *argv[]) {
33 36 printf("%i - %i = %i\n", 9, 7, -(int16_t) a);
34 37  
35 38 /* palignr mm1/xmm1, m64/m128, imm8 */
36   - asm volatile ("movdqa (%0), %%xmm0" : : "r" (c) : "xmm0");
37   - asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (d));
  39 + asm volatile ("movdqa (%0), %%xmm0" : : "r" (e) : "xmm0");
  40 + asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (f));
38 41 asm volatile ("movdqa %%xmm0, (%0)" : : "r" (hello));
39 42 printf("%5.5s\n", hello);
40 43  
  44 +#if 1 /* SSE4 */
  45 + /* popcnt r64, r/m64 */
  46 + asm volatile ("movq $0x8421000010009c63, %%rax" : : : "rax");
  47 + asm volatile ("popcnt %%ax, %%dx" : : : "dx");
  48 + asm volatile ("popcnt %%eax, %%ecx" : : : "ecx");
  49 + asm volatile ("popcnt %rax, %rax");
  50 + asm volatile ("movq %%rax, %0" : "=m" (a));
  51 + asm volatile ("movl %%ecx, %0" : "=m" (c));
  52 + asm volatile ("movw %%dx, %0" : "=m" (d));
  53 + printf("%i = %i\n%i = %i = %i\n", 13, (int) a, 9, c, d + 1);
  54 +#endif
  55 +
41 56 return 0;
42 57 }
... ...