Commit 222a3336ecbe177da082f9ac20f9614d6d23c721
1 parent
06adb549
Implement SSE4.1, SSE4.2 (x86).
This adds support for CPUID_EXT_SSE41, CPUID_EXT_SSE42, CPUID_EXT_POPCNT extensions. Most instructions haven't been tested yet. git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@5411 c046a42c-6fe2-441c-8c8c-71466251a162
Showing
4 changed files
with
992 additions
and
37 deletions
target-i386/ops_sse.h
1 | /* | 1 | /* |
2 | - * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support | 2 | + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support |
3 | * | 3 | * |
4 | * Copyright (c) 2005 Fabrice Bellard | 4 | * Copyright (c) 2005 Fabrice Bellard |
5 | + * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com> | ||
5 | * | 6 | * |
6 | * This library is free software; you can redistribute it and/or | 7 | * This library is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Lesser General Public | 8 | * modify it under the terms of the GNU Lesser General Public |
@@ -1420,6 +1421,621 @@ void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift) | @@ -1420,6 +1421,621 @@ void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift) | ||
1420 | *d = r; | 1421 | *d = r; |
1421 | } | 1422 | } |
1422 | 1423 | ||
1424 | +#define XMM0 env->xmm_regs[0] | ||
1425 | + | ||
1426 | +#if SHIFT == 1 | ||
1427 | +#define SSE_HELPER_V(name, elem, num, F)\ | ||
1428 | +void glue(name, SUFFIX) (Reg *d, Reg *s)\ | ||
1429 | +{\ | ||
1430 | + d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0));\ | ||
1431 | + d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1));\ | ||
1432 | + if (num > 2) {\ | ||
1433 | + d->elem(2) = F(d->elem(2), s->elem(2), XMM0.elem(2));\ | ||
1434 | + d->elem(3) = F(d->elem(3), s->elem(3), XMM0.elem(3));\ | ||
1435 | + if (num > 4) {\ | ||
1436 | + d->elem(4) = F(d->elem(4), s->elem(4), XMM0.elem(4));\ | ||
1437 | + d->elem(5) = F(d->elem(5), s->elem(5), XMM0.elem(5));\ | ||
1438 | + d->elem(6) = F(d->elem(6), s->elem(6), XMM0.elem(6));\ | ||
1439 | + d->elem(7) = F(d->elem(7), s->elem(7), XMM0.elem(7));\ | ||
1440 | + if (num > 8) {\ | ||
1441 | + d->elem(8) = F(d->elem(8), s->elem(8), XMM0.elem(8));\ | ||
1442 | + d->elem(9) = F(d->elem(9), s->elem(9), XMM0.elem(9));\ | ||
1443 | + d->elem(10) = F(d->elem(10), s->elem(10), XMM0.elem(10));\ | ||
1444 | + d->elem(11) = F(d->elem(11), s->elem(11), XMM0.elem(11));\ | ||
1445 | + d->elem(12) = F(d->elem(12), s->elem(12), XMM0.elem(12));\ | ||
1446 | + d->elem(13) = F(d->elem(13), s->elem(13), XMM0.elem(13));\ | ||
1447 | + d->elem(14) = F(d->elem(14), s->elem(14), XMM0.elem(14));\ | ||
1448 | + d->elem(15) = F(d->elem(15), s->elem(15), XMM0.elem(15));\ | ||
1449 | + }\ | ||
1450 | + }\ | ||
1451 | + }\ | ||
1452 | +} | ||
1453 | + | ||
1454 | +#define SSE_HELPER_I(name, elem, num, F)\ | ||
1455 | +void glue(name, SUFFIX) (Reg *d, Reg *s, uint32_t imm)\ | ||
1456 | +{\ | ||
1457 | + d->elem(0) = F(d->elem(0), s->elem(0), ((imm >> 0) & 1));\ | ||
1458 | + d->elem(1) = F(d->elem(1), s->elem(1), ((imm >> 1) & 1));\ | ||
1459 | + if (num > 2) {\ | ||
1460 | + d->elem(2) = F(d->elem(2), s->elem(2), ((imm >> 2) & 1));\ | ||
1461 | + d->elem(3) = F(d->elem(3), s->elem(3), ((imm >> 3) & 1));\ | ||
1462 | + if (num > 4) {\ | ||
1463 | + d->elem(4) = F(d->elem(4), s->elem(4), ((imm >> 4) & 1));\ | ||
1464 | + d->elem(5) = F(d->elem(5), s->elem(5), ((imm >> 5) & 1));\ | ||
1465 | + d->elem(6) = F(d->elem(6), s->elem(6), ((imm >> 6) & 1));\ | ||
1466 | + d->elem(7) = F(d->elem(7), s->elem(7), ((imm >> 7) & 1));\ | ||
1467 | + if (num > 8) {\ | ||
1468 | + d->elem(8) = F(d->elem(8), s->elem(8), ((imm >> 8) & 1));\ | ||
1469 | + d->elem(9) = F(d->elem(9), s->elem(9), ((imm >> 9) & 1));\ | ||
1470 | + d->elem(10) = F(d->elem(10), s->elem(10), ((imm >> 10) & 1));\ | ||
1471 | + d->elem(11) = F(d->elem(11), s->elem(11), ((imm >> 11) & 1));\ | ||
1472 | + d->elem(12) = F(d->elem(12), s->elem(12), ((imm >> 12) & 1));\ | ||
1473 | + d->elem(13) = F(d->elem(13), s->elem(13), ((imm >> 13) & 1));\ | ||
1474 | + d->elem(14) = F(d->elem(14), s->elem(14), ((imm >> 14) & 1));\ | ||
1475 | + d->elem(15) = F(d->elem(15), s->elem(15), ((imm >> 15) & 1));\ | ||
1476 | + }\ | ||
1477 | + }\ | ||
1478 | + }\ | ||
1479 | +} | ||
1480 | + | ||
1481 | +/* SSE4.1 op helpers */ | ||
1482 | +#define FBLENDVB(d, s, m) (m & 0x80) ? s : d | ||
1483 | +#define FBLENDVPS(d, s, m) (m & 0x80000000) ? s : d | ||
1484 | +#define FBLENDVPD(d, s, m) (m & 0x8000000000000000) ? s : d | ||
1485 | +SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB) | ||
1486 | +SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS) | ||
1487 | +SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD) | ||
1488 | + | ||
1489 | +void glue(helper_ptest, SUFFIX) (Reg *d, Reg *s) | ||
1490 | +{ | ||
1491 | + uint64_t zf = (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1)); | ||
1492 | + uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1)); | ||
1493 | + | ||
1494 | + CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C); | ||
1495 | +} | ||
1496 | + | ||
/*
 * SSE_HELPER_F(name, elem, num, F): define helper `name` which stores F(i)
 * into element i of *d for i in [0, num), where `elem` is the element
 * accessor (W, L or Q) and `num` is the number of such elements in the
 * register (8, 4 or 2).
 *
 * Only the elements that actually exist are written: elements 2-3 when
 * num > 2 and elements 4-7 when num > 4.
 *
 * Elements are written from the highest index down.  The instantiations
 * widen narrow source elements (F(i) reads s element i, which sits at a
 * lower byte offset than d element i), so descending order keeps the
 * not-yet-read source elements intact when d and s are the same register.
 */
#define SSE_HELPER_F(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    if (num > 2) {\
        if (num > 4) {\
            d->elem(7) = F(7);\
            d->elem(6) = F(6);\
            d->elem(5) = F(5);\
            d->elem(4) = F(4);\
        }\
        d->elem(3) = F(3);\
        d->elem(2) = F(2);\
    }\
    d->elem(1) = F(1);\
    d->elem(0) = F(0);\
}
1513 | + | ||
1514 | +SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B) | ||
1515 | +SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B) | ||
1516 | +SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B) | ||
1517 | +SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W) | ||
1518 | +SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W) | ||
1519 | +SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L) | ||
1520 | +SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B) | ||
1521 | +SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B) | ||
1522 | +SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B) | ||
1523 | +SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W) | ||
1524 | +SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W) | ||
1525 | +SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L) | ||
1526 | + | ||
1527 | +void glue(helper_pmuldq, SUFFIX) (Reg *d, Reg *s) | ||
1528 | +{ | ||
1529 | + d->Q(0) = (int64_t) (int32_t) d->L(0) * (int32_t) s->L(0); | ||
1530 | + d->Q(1) = (int64_t) (int32_t) d->L(2) * (int32_t) s->L(2); | ||
1531 | +} | ||
1532 | + | ||
1533 | +#define FCMPEQQ(d, s) d == s ? -1 : 0 | ||
1534 | +SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ) | ||
1535 | + | ||
1536 | +void glue(helper_packusdw, SUFFIX) (Reg *d, Reg *s) | ||
1537 | +{ | ||
1538 | + d->W(0) = satuw((int32_t) d->L(0)); | ||
1539 | + d->W(1) = satuw((int32_t) d->L(1)); | ||
1540 | + d->W(2) = satuw((int32_t) d->L(2)); | ||
1541 | + d->W(3) = satuw((int32_t) d->L(3)); | ||
1542 | + d->W(4) = satuw((int32_t) s->L(0)); | ||
1543 | + d->W(5) = satuw((int32_t) s->L(1)); | ||
1544 | + d->W(6) = satuw((int32_t) s->L(2)); | ||
1545 | + d->W(7) = satuw((int32_t) s->L(3)); | ||
1546 | +} | ||
1547 | + | ||
1548 | +#define FMINSB(d, s) MIN((int8_t) d, (int8_t) s) | ||
1549 | +#define FMINSD(d, s) MIN((int32_t) d, (int32_t) s) | ||
1550 | +#define FMAXSB(d, s) MAX((int8_t) d, (int8_t) s) | ||
1551 | +#define FMAXSD(d, s) MAX((int32_t) d, (int32_t) s) | ||
1552 | +SSE_HELPER_B(helper_pminsb, FMINSB) | ||
1553 | +SSE_HELPER_L(helper_pminsd, FMINSD) | ||
1554 | +SSE_HELPER_W(helper_pminuw, MIN) | ||
1555 | +SSE_HELPER_L(helper_pminud, MIN) | ||
1556 | +SSE_HELPER_B(helper_pmaxsb, FMAXSB) | ||
1557 | +SSE_HELPER_L(helper_pmaxsd, FMAXSD) | ||
1558 | +SSE_HELPER_W(helper_pmaxuw, MAX) | ||
1559 | +SSE_HELPER_L(helper_pmaxud, MAX) | ||
1560 | + | ||
1561 | +#define FMULLD(d, s) (int32_t) d * (int32_t) s | ||
1562 | +SSE_HELPER_L(helper_pmulld, FMULLD) | ||
1563 | + | ||
1564 | +void glue(helper_phminposuw, SUFFIX) (Reg *d, Reg *s) | ||
1565 | +{ | ||
1566 | + int idx = 0; | ||
1567 | + | ||
1568 | + if (s->W(1) < s->W(idx)) | ||
1569 | + idx = 1; | ||
1570 | + if (s->W(2) < s->W(idx)) | ||
1571 | + idx = 2; | ||
1572 | + if (s->W(3) < s->W(idx)) | ||
1573 | + idx = 3; | ||
1574 | + if (s->W(4) < s->W(idx)) | ||
1575 | + idx = 4; | ||
1576 | + if (s->W(5) < s->W(idx)) | ||
1577 | + idx = 5; | ||
1578 | + if (s->W(6) < s->W(idx)) | ||
1579 | + idx = 6; | ||
1580 | + if (s->W(7) < s->W(idx)) | ||
1581 | + idx = 7; | ||
1582 | + | ||
1583 | + d->Q(1) = 0; | ||
1584 | + d->L(1) = 0; | ||
1585 | + d->W(1) = idx; | ||
1586 | + d->W(0) = s->W(idx); | ||
1587 | +} | ||
1588 | + | ||
1589 | +void glue(helper_roundps, SUFFIX) (Reg *d, Reg *s, uint32_t mode) | ||
1590 | +{ | ||
1591 | + signed char prev_rounding_mode; | ||
1592 | + | ||
1593 | + prev_rounding_mode = env->sse_status.float_rounding_mode; | ||
1594 | + if (!(mode & (1 << 2))) | ||
1595 | + switch (mode & 3) { | ||
1596 | + case 0: | ||
1597 | + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); | ||
1598 | + break; | ||
1599 | + case 1: | ||
1600 | + set_float_rounding_mode(float_round_down, &env->sse_status); | ||
1601 | + break; | ||
1602 | + case 2: | ||
1603 | + set_float_rounding_mode(float_round_up, &env->sse_status); | ||
1604 | + break; | ||
1605 | + case 3: | ||
1606 | + set_float_rounding_mode(float_round_to_zero, &env->sse_status); | ||
1607 | + break; | ||
1608 | + } | ||
1609 | + | ||
1610 | + d->L(0) = float64_round_to_int(s->L(0), &env->sse_status); | ||
1611 | + d->L(1) = float64_round_to_int(s->L(1), &env->sse_status); | ||
1612 | + d->L(2) = float64_round_to_int(s->L(2), &env->sse_status); | ||
1613 | + d->L(3) = float64_round_to_int(s->L(3), &env->sse_status); | ||
1614 | + | ||
1615 | +#if 0 /* TODO */ | ||
1616 | + if (mode & (1 << 3)) | ||
1617 | + set_float_exception_flags( | ||
1618 | + get_float_exception_flags(&env->sse_status) & | ||
1619 | + ~float_flag_inexact, | ||
1620 | + &env->sse_status); | ||
1621 | +#endif | ||
1622 | + env->sse_status.float_rounding_mode = prev_rounding_mode; | ||
1623 | +} | ||
1624 | + | ||
1625 | +void glue(helper_roundpd, SUFFIX) (Reg *d, Reg *s, uint32_t mode) | ||
1626 | +{ | ||
1627 | + signed char prev_rounding_mode; | ||
1628 | + | ||
1629 | + prev_rounding_mode = env->sse_status.float_rounding_mode; | ||
1630 | + if (!(mode & (1 << 2))) | ||
1631 | + switch (mode & 3) { | ||
1632 | + case 0: | ||
1633 | + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); | ||
1634 | + break; | ||
1635 | + case 1: | ||
1636 | + set_float_rounding_mode(float_round_down, &env->sse_status); | ||
1637 | + break; | ||
1638 | + case 2: | ||
1639 | + set_float_rounding_mode(float_round_up, &env->sse_status); | ||
1640 | + break; | ||
1641 | + case 3: | ||
1642 | + set_float_rounding_mode(float_round_to_zero, &env->sse_status); | ||
1643 | + break; | ||
1644 | + } | ||
1645 | + | ||
1646 | + d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status); | ||
1647 | + d->Q(1) = float64_round_to_int(s->Q(1), &env->sse_status); | ||
1648 | + | ||
1649 | +#if 0 /* TODO */ | ||
1650 | + if (mode & (1 << 3)) | ||
1651 | + set_float_exception_flags( | ||
1652 | + get_float_exception_flags(&env->sse_status) & | ||
1653 | + ~float_flag_inexact, | ||
1654 | + &env->sse_status); | ||
1655 | +#endif | ||
1656 | + env->sse_status.float_rounding_mode = prev_rounding_mode; | ||
1657 | +} | ||
1658 | + | ||
1659 | +void glue(helper_roundss, SUFFIX) (Reg *d, Reg *s, uint32_t mode) | ||
1660 | +{ | ||
1661 | + signed char prev_rounding_mode; | ||
1662 | + | ||
1663 | + prev_rounding_mode = env->sse_status.float_rounding_mode; | ||
1664 | + if (!(mode & (1 << 2))) | ||
1665 | + switch (mode & 3) { | ||
1666 | + case 0: | ||
1667 | + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); | ||
1668 | + break; | ||
1669 | + case 1: | ||
1670 | + set_float_rounding_mode(float_round_down, &env->sse_status); | ||
1671 | + break; | ||
1672 | + case 2: | ||
1673 | + set_float_rounding_mode(float_round_up, &env->sse_status); | ||
1674 | + break; | ||
1675 | + case 3: | ||
1676 | + set_float_rounding_mode(float_round_to_zero, &env->sse_status); | ||
1677 | + break; | ||
1678 | + } | ||
1679 | + | ||
1680 | + d->L(0) = float64_round_to_int(s->L(0), &env->sse_status); | ||
1681 | + | ||
1682 | +#if 0 /* TODO */ | ||
1683 | + if (mode & (1 << 3)) | ||
1684 | + set_float_exception_flags( | ||
1685 | + get_float_exception_flags(&env->sse_status) & | ||
1686 | + ~float_flag_inexact, | ||
1687 | + &env->sse_status); | ||
1688 | +#endif | ||
1689 | + env->sse_status.float_rounding_mode = prev_rounding_mode; | ||
1690 | +} | ||
1691 | + | ||
1692 | +void glue(helper_roundsd, SUFFIX) (Reg *d, Reg *s, uint32_t mode) | ||
1693 | +{ | ||
1694 | + signed char prev_rounding_mode; | ||
1695 | + | ||
1696 | + prev_rounding_mode = env->sse_status.float_rounding_mode; | ||
1697 | + if (!(mode & (1 << 2))) | ||
1698 | + switch (mode & 3) { | ||
1699 | + case 0: | ||
1700 | + set_float_rounding_mode(float_round_nearest_even, &env->sse_status); | ||
1701 | + break; | ||
1702 | + case 1: | ||
1703 | + set_float_rounding_mode(float_round_down, &env->sse_status); | ||
1704 | + break; | ||
1705 | + case 2: | ||
1706 | + set_float_rounding_mode(float_round_up, &env->sse_status); | ||
1707 | + break; | ||
1708 | + case 3: | ||
1709 | + set_float_rounding_mode(float_round_to_zero, &env->sse_status); | ||
1710 | + break; | ||
1711 | + } | ||
1712 | + | ||
1713 | + d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status); | ||
1714 | + | ||
1715 | +#if 0 /* TODO */ | ||
1716 | + if (mode & (1 << 3)) | ||
1717 | + set_float_exception_flags( | ||
1718 | + get_float_exception_flags(&env->sse_status) & | ||
1719 | + ~float_flag_inexact, | ||
1720 | + &env->sse_status); | ||
1721 | +#endif | ||
1722 | + env->sse_status.float_rounding_mode = prev_rounding_mode; | ||
1723 | +} | ||
1724 | + | ||
1725 | +#define FBLENDP(d, s, m) m ? s : d | ||
1726 | +SSE_HELPER_I(helper_blendps, L, 4, FBLENDP) | ||
1727 | +SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP) | ||
1728 | +SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP) | ||
1729 | + | ||
1730 | +void glue(helper_dpps, SUFFIX) (Reg *d, Reg *s, uint32_t mask) | ||
1731 | +{ | ||
1732 | + float32 iresult = 0 /*float32_zero*/; | ||
1733 | + | ||
1734 | + if (mask & (1 << 4)) | ||
1735 | + iresult = float32_add(iresult, | ||
1736 | + float32_mul(d->L(0), s->L(0), &env->sse_status), | ||
1737 | + &env->sse_status); | ||
1738 | + if (mask & (1 << 5)) | ||
1739 | + iresult = float32_add(iresult, | ||
1740 | + float32_mul(d->L(1), s->L(1), &env->sse_status), | ||
1741 | + &env->sse_status); | ||
1742 | + if (mask & (1 << 6)) | ||
1743 | + iresult = float32_add(iresult, | ||
1744 | + float32_mul(d->L(2), s->L(2), &env->sse_status), | ||
1745 | + &env->sse_status); | ||
1746 | + if (mask & (1 << 7)) | ||
1747 | + iresult = float32_add(iresult, | ||
1748 | + float32_mul(d->L(3), s->L(3), &env->sse_status), | ||
1749 | + &env->sse_status); | ||
1750 | + d->L(0) = (mask & (1 << 0)) ? iresult : 0 /*float32_zero*/; | ||
1751 | + d->L(1) = (mask & (1 << 1)) ? iresult : 0 /*float32_zero*/; | ||
1752 | + d->L(2) = (mask & (1 << 2)) ? iresult : 0 /*float32_zero*/; | ||
1753 | + d->L(3) = (mask & (1 << 3)) ? iresult : 0 /*float32_zero*/; | ||
1754 | +} | ||
1755 | + | ||
1756 | +void glue(helper_dppd, SUFFIX) (Reg *d, Reg *s, uint32_t mask) | ||
1757 | +{ | ||
1758 | + float64 iresult = 0 /*float64_zero*/; | ||
1759 | + | ||
1760 | + if (mask & (1 << 4)) | ||
1761 | + iresult = float64_add(iresult, | ||
1762 | + float64_mul(d->Q(0), s->Q(0), &env->sse_status), | ||
1763 | + &env->sse_status); | ||
1764 | + if (mask & (1 << 5)) | ||
1765 | + iresult = float64_add(iresult, | ||
1766 | + float64_mul(d->Q(1), s->Q(1), &env->sse_status), | ||
1767 | + &env->sse_status); | ||
1768 | + d->Q(0) = (mask & (1 << 0)) ? iresult : 0 /*float64_zero*/; | ||
1769 | + d->Q(1) = (mask & (1 << 1)) ? iresult : 0 /*float64_zero*/; | ||
1770 | +} | ||
1771 | + | ||
1772 | +void glue(helper_mpsadbw, SUFFIX) (Reg *d, Reg *s, uint32_t offset) | ||
1773 | +{ | ||
1774 | + int s0 = (offset & 3) << 2; | ||
1775 | + int d0 = (offset & 4) << 0; | ||
1776 | + int i; | ||
1777 | + Reg r; | ||
1778 | + | ||
1779 | + for (i = 0; i < 8; i++, d0++) { | ||
1780 | + r.W(i) = 0; | ||
1781 | + r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0)); | ||
1782 | + r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1)); | ||
1783 | + r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2)); | ||
1784 | + r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3)); | ||
1785 | + } | ||
1786 | + | ||
1787 | + *d = r; | ||
1788 | +} | ||
1789 | + | ||
1790 | +/* SSE4.2 op helpers */ | ||
1791 | +/* it's unclear whether signed or unsigned */ | ||
1792 | +#define FCMPGTQ(d, s) d > s ? -1 : 0 | ||
1793 | +SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ) | ||
1794 | + | ||
1795 | +static inline int pcmp_elen(int reg, uint32_t ctrl) | ||
1796 | +{ | ||
1797 | + int val; | ||
1798 | + | ||
1799 | + /* Presence of REX.W is indicated by a bit higher than 7 set */ | ||
1800 | + if (ctrl >> 8) | ||
1801 | + val = abs1((int64_t) env->regs[reg]); | ||
1802 | + else | ||
1803 | + val = abs1((int32_t) env->regs[reg]); | ||
1804 | + | ||
1805 | + if (ctrl & 1) { | ||
1806 | + if (val > 8) | ||
1807 | + return 8; | ||
1808 | + } else | ||
1809 | + if (val > 16) | ||
1810 | + return 16; | ||
1811 | + | ||
1812 | + return val; | ||
1813 | +} | ||
1814 | + | ||
1815 | +static inline int pcmp_ilen(Reg *r, uint8_t ctrl) | ||
1816 | +{ | ||
1817 | + int val = 0; | ||
1818 | + | ||
1819 | + if (ctrl & 1) { | ||
1820 | + while (val < 8 && r->W(val)) | ||
1821 | + val++; | ||
1822 | + } else | ||
1823 | + while (val < 16 && r->B(val)) | ||
1824 | + val++; | ||
1825 | + | ||
1826 | + return val; | ||
1827 | +} | ||
1828 | + | ||
1829 | +static inline int pcmp_val(Reg *r, uint8_t ctrl, int i) | ||
1830 | +{ | ||
1831 | + switch ((ctrl >> 0) & 3) { | ||
1832 | + case 0: | ||
1833 | + return r->B(i); | ||
1834 | + case 1: | ||
1835 | + return r->W(i); | ||
1836 | + case 2: | ||
1837 | + return (int8_t) r->B(i); | ||
1838 | + case 3: | ||
1839 | + default: | ||
1840 | + return (int16_t) r->W(i); | ||
1841 | + } | ||
1842 | +} | ||
1843 | + | ||
1844 | +static inline unsigned pcmpxstrx(Reg *d, Reg *s, | ||
1845 | + int8_t ctrl, int valids, int validd) | ||
1846 | +{ | ||
1847 | + unsigned int res = 0; | ||
1848 | + int v; | ||
1849 | + int j, i; | ||
1850 | + int upper = (ctrl & 1) ? 7 : 15; | ||
1851 | + | ||
1852 | + valids--; | ||
1853 | + validd--; | ||
1854 | + | ||
1855 | + CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0); | ||
1856 | + | ||
1857 | + switch ((ctrl >> 2) & 3) { | ||
1858 | + case 0: | ||
1859 | + for (j = valids; j >= 0; j--) { | ||
1860 | + res <<= 1; | ||
1861 | + v = pcmp_val(s, ctrl, j); | ||
1862 | + for (i = validd; i >= 0; i--) | ||
1863 | + res |= (v == pcmp_val(d, ctrl, i)); | ||
1864 | + } | ||
1865 | + break; | ||
1866 | + case 1: | ||
1867 | + for (j = valids; j >= 0; j--) { | ||
1868 | + res <<= 1; | ||
1869 | + v = pcmp_val(s, ctrl, j); | ||
1870 | + for (i = ((validd - 1) | 1); i >= 0; i -= 2) | ||
1871 | + res |= (pcmp_val(d, ctrl, i - 0) <= v && | ||
1872 | + pcmp_val(d, ctrl, i - 1) >= v); | ||
1873 | + } | ||
1874 | + break; | ||
1875 | + case 2: | ||
1876 | + res = (2 << (upper - MAX(valids, validd))) - 1; | ||
1877 | + res <<= MAX(valids, validd) - MIN(valids, validd); | ||
1878 | + for (i = MIN(valids, validd); i >= 0; i--) { | ||
1879 | + res <<= 1; | ||
1880 | + v = pcmp_val(s, ctrl, i); | ||
1881 | + res |= (v == pcmp_val(d, ctrl, i)); | ||
1882 | + } | ||
1883 | + break; | ||
1884 | + case 3: | ||
1885 | + for (j = valids - validd; j >= 0; j--) { | ||
1886 | + res <<= 1; | ||
1887 | + res |= 1; | ||
1888 | + for (i = MIN(upper - j, validd); i >= 0; i--) | ||
1889 | + res &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i)); | ||
1890 | + } | ||
1891 | + break; | ||
1892 | + } | ||
1893 | + | ||
1894 | + switch ((ctrl >> 4) & 3) { | ||
1895 | + case 1: | ||
1896 | + res ^= (2 << upper) - 1; | ||
1897 | + break; | ||
1898 | + case 3: | ||
1899 | + res ^= (2 << valids) - 1; | ||
1900 | + break; | ||
1901 | + } | ||
1902 | + | ||
1903 | + if (res) | ||
1904 | + CC_SRC |= CC_C; | ||
1905 | + if (res & 1) | ||
1906 | + CC_SRC |= CC_O; | ||
1907 | + | ||
1908 | + return res; | ||
1909 | +} | ||
1910 | + | ||
/*
 * Return the 1-based position of the most significant set bit of val
 * (1 for bit 0).  A zero argument yields 1.
 */
static inline int rffs1(unsigned int val)
{
    int pos = 1;
    int step = sizeof(val) * 4;

    /* Binary search: halve the window each round. */
    while (step) {
        if (val >> step) {
            val >>= step;
            pos += step;
        }
        step /= 2;
    }

    return pos;
}
1923 | + | ||
/*
 * Return the 1-based position of the least significant set bit of val
 * (1 for bit 0, 32 for bit 31 of a 32-bit value).
 *
 * val must be non-zero; the callers only use this on a non-zero mask.
 */
static inline int ffs1(unsigned int val)
{
    int ret = 1, hi;

    /*
     * Binary search: whenever the low `hi` bits are all clear, the lowest
     * set bit lies above them, so drop them and account for the skip.
     */
    for (hi = sizeof(val) * 4; hi; hi /= 2)
        if (!(val & ((1u << hi) - 1))) {
            val >>= hi;
            ret += hi;
        }

    return ret;
}
1936 | + | ||
1937 | +void glue(helper_pcmpestri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) | ||
1938 | +{ | ||
1939 | + unsigned int res = pcmpxstrx(d, s, ctrl, | ||
1940 | + pcmp_elen(R_EDX, ctrl), | ||
1941 | + pcmp_elen(R_EAX, ctrl)); | ||
1942 | + | ||
1943 | + if (res) | ||
1944 | + env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1; | ||
1945 | + else | ||
1946 | + env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); | ||
1947 | +} | ||
1948 | + | ||
1949 | +void glue(helper_pcmpestrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) | ||
1950 | +{ | ||
1951 | + int i; | ||
1952 | + unsigned int res = pcmpxstrx(d, s, ctrl, | ||
1953 | + pcmp_elen(R_EDX, ctrl), | ||
1954 | + pcmp_elen(R_EAX, ctrl)); | ||
1955 | + | ||
1956 | + if ((ctrl >> 6) & 1) { | ||
1957 | + if (ctrl & 1) | ||
1958 | + for (i = 0; i <= 8; i--, res >>= 1) | ||
1959 | + d->W(i) = (res & 1) ? ~0 : 0; | ||
1960 | + else | ||
1961 | + for (i = 0; i <= 16; i--, res >>= 1) | ||
1962 | + d->B(i) = (res & 1) ? ~0 : 0; | ||
1963 | + } else { | ||
1964 | + d->Q(1) = 0; | ||
1965 | + d->Q(0) = res; | ||
1966 | + } | ||
1967 | +} | ||
1968 | + | ||
1969 | +void glue(helper_pcmpistri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) | ||
1970 | +{ | ||
1971 | + unsigned int res = pcmpxstrx(d, s, ctrl, | ||
1972 | + pcmp_ilen(s, ctrl), | ||
1973 | + pcmp_ilen(d, ctrl)); | ||
1974 | + | ||
1975 | + if (res) | ||
1976 | + env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1; | ||
1977 | + else | ||
1978 | + env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); | ||
1979 | +} | ||
1980 | + | ||
1981 | +void glue(helper_pcmpistrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) | ||
1982 | +{ | ||
1983 | + int i; | ||
1984 | + unsigned int res = pcmpxstrx(d, s, ctrl, | ||
1985 | + pcmp_ilen(s, ctrl), | ||
1986 | + pcmp_ilen(d, ctrl)); | ||
1987 | + | ||
1988 | + if ((ctrl >> 6) & 1) { | ||
1989 | + if (ctrl & 1) | ||
1990 | + for (i = 0; i <= 8; i--, res >>= 1) | ||
1991 | + d->W(i) = (res & 1) ? ~0 : 0; | ||
1992 | + else | ||
1993 | + for (i = 0; i <= 16; i--, res >>= 1) | ||
1994 | + d->B(i) = (res & 1) ? ~0 : 0; | ||
1995 | + } else { | ||
1996 | + d->Q(1) = 0; | ||
1997 | + d->Q(0) = res; | ||
1998 | + } | ||
1999 | +} | ||
2000 | + | ||
2001 | +#define CRCPOLY 0x1edc6f41 | ||
2002 | +#define CRCPOLY_BITREV 0x82f63b78 | ||
2003 | +target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) | ||
2004 | +{ | ||
2005 | + target_ulong crc = (msg & ((target_ulong) -1 >> | ||
2006 | + (TARGET_LONG_BITS - len))) ^ crc1; | ||
2007 | + | ||
2008 | + while (len--) | ||
2009 | + crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0); | ||
2010 | + | ||
2011 | + return crc; | ||
2012 | +} | ||
2013 | + | ||
2014 | +#define POPMASK(i) ((target_ulong) -1 / ((1LL << (1 << i)) + 1)) | ||
2015 | +#define POPCOUNT(n, i) (n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i)) | ||
2016 | +target_ulong helper_popcnt(target_ulong n, uint32_t type) | ||
2017 | +{ | ||
2018 | + CC_SRC = n ? 0 : CC_Z; | ||
2019 | + | ||
2020 | + n = POPCOUNT(n, 0); | ||
2021 | + n = POPCOUNT(n, 1); | ||
2022 | + n = POPCOUNT(n, 2); | ||
2023 | + n = POPCOUNT(n, 3); | ||
2024 | + if (type == 1) | ||
2025 | + return n & 0xff; | ||
2026 | + | ||
2027 | + n = POPCOUNT(n, 4); | ||
2028 | +#ifndef TARGET_X86_64 | ||
2029 | + return n; | ||
2030 | +#else | ||
2031 | + if (type == 2) | ||
2032 | + return n & 0xff; | ||
2033 | + | ||
2034 | + return POPCOUNT(n, 5); | ||
2035 | +#endif | ||
2036 | +} | ||
2037 | +#endif | ||
2038 | + | ||
1423 | #undef SHIFT | 2039 | #undef SHIFT |
1424 | #undef XMM_ONLY | 2040 | #undef XMM_ONLY |
1425 | #undef Reg | 2041 | #undef Reg |
target-i386/ops_sse_header.h
1 | /* | 1 | /* |
2 | - * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/PNI support | 2 | + * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support |
3 | * | 3 | * |
4 | * Copyright (c) 2005 Fabrice Bellard | 4 | * Copyright (c) 2005 Fabrice Bellard |
5 | * | 5 | * |
@@ -269,6 +269,61 @@ DEF_HELPER(void, glue(helper_psignw, SUFFIX), (Reg *d, Reg *s)) | @@ -269,6 +269,61 @@ DEF_HELPER(void, glue(helper_psignw, SUFFIX), (Reg *d, Reg *s)) | ||
269 | DEF_HELPER(void, glue(helper_psignd, SUFFIX), (Reg *d, Reg *s)) | 269 | DEF_HELPER(void, glue(helper_psignd, SUFFIX), (Reg *d, Reg *s)) |
270 | DEF_HELPER(void, glue(helper_palignr, SUFFIX), (Reg *d, Reg *s, int32_t shift)) | 270 | DEF_HELPER(void, glue(helper_palignr, SUFFIX), (Reg *d, Reg *s, int32_t shift)) |
271 | 271 | ||
272 | +/* SSE4.1 op helpers */ | ||
273 | +#if SHIFT == 1 | ||
274 | +DEF_HELPER(void, glue(helper_pblendvb, SUFFIX), (Reg *d, Reg *s)) | ||
275 | +DEF_HELPER(void, glue(helper_blendvps, SUFFIX), (Reg *d, Reg *s)) | ||
276 | +DEF_HELPER(void, glue(helper_blendvpd, SUFFIX), (Reg *d, Reg *s)) | ||
277 | +DEF_HELPER(void, glue(helper_ptest, SUFFIX), (Reg *d, Reg *s)) | ||
278 | +DEF_HELPER(void, glue(helper_pmovsxbw, SUFFIX), (Reg *d, Reg *s)) | ||
279 | +DEF_HELPER(void, glue(helper_pmovsxbd, SUFFIX), (Reg *d, Reg *s)) | ||
280 | +DEF_HELPER(void, glue(helper_pmovsxbq, SUFFIX), (Reg *d, Reg *s)) | ||
281 | +DEF_HELPER(void, glue(helper_pmovsxwd, SUFFIX), (Reg *d, Reg *s)) | ||
282 | +DEF_HELPER(void, glue(helper_pmovsxwq, SUFFIX), (Reg *d, Reg *s)) | ||
283 | +DEF_HELPER(void, glue(helper_pmovsxdq, SUFFIX), (Reg *d, Reg *s)) | ||
284 | +DEF_HELPER(void, glue(helper_pmovzxbw, SUFFIX), (Reg *d, Reg *s)) | ||
285 | +DEF_HELPER(void, glue(helper_pmovzxbd, SUFFIX), (Reg *d, Reg *s)) | ||
286 | +DEF_HELPER(void, glue(helper_pmovzxbq, SUFFIX), (Reg *d, Reg *s)) | ||
287 | +DEF_HELPER(void, glue(helper_pmovzxwd, SUFFIX), (Reg *d, Reg *s)) | ||
288 | +DEF_HELPER(void, glue(helper_pmovzxwq, SUFFIX), (Reg *d, Reg *s)) | ||
289 | +DEF_HELPER(void, glue(helper_pmovzxdq, SUFFIX), (Reg *d, Reg *s)) | ||
290 | +DEF_HELPER(void, glue(helper_pmuldq, SUFFIX), (Reg *d, Reg *s)) | ||
291 | +DEF_HELPER(void, glue(helper_pcmpeqq, SUFFIX), (Reg *d, Reg *s)) | ||
292 | +DEF_HELPER(void, glue(helper_packusdw, SUFFIX), (Reg *d, Reg *s)) | ||
293 | +DEF_HELPER(void, glue(helper_pminsb, SUFFIX), (Reg *d, Reg *s)) | ||
294 | +DEF_HELPER(void, glue(helper_pminsd, SUFFIX), (Reg *d, Reg *s)) | ||
295 | +DEF_HELPER(void, glue(helper_pminuw, SUFFIX), (Reg *d, Reg *s)) | ||
296 | +DEF_HELPER(void, glue(helper_pminud, SUFFIX), (Reg *d, Reg *s)) | ||
297 | +DEF_HELPER(void, glue(helper_pmaxsb, SUFFIX), (Reg *d, Reg *s)) | ||
298 | +DEF_HELPER(void, glue(helper_pmaxsd, SUFFIX), (Reg *d, Reg *s)) | ||
299 | +DEF_HELPER(void, glue(helper_pmaxuw, SUFFIX), (Reg *d, Reg *s)) | ||
300 | +DEF_HELPER(void, glue(helper_pmaxud, SUFFIX), (Reg *d, Reg *s)) | ||
301 | +DEF_HELPER(void, glue(helper_pmulld, SUFFIX), (Reg *d, Reg *s)) | ||
302 | +DEF_HELPER(void, glue(helper_phminposuw, SUFFIX), (Reg *d, Reg *s)) | ||
303 | +DEF_HELPER(void, glue(helper_roundps, SUFFIX), (Reg *d, Reg *s, uint32_t mode)) | ||
304 | +DEF_HELPER(void, glue(helper_roundpd, SUFFIX), (Reg *d, Reg *s, uint32_t mode)) | ||
305 | +DEF_HELPER(void, glue(helper_roundss, SUFFIX), (Reg *d, Reg *s, uint32_t mode)) | ||
306 | +DEF_HELPER(void, glue(helper_roundsd, SUFFIX), (Reg *d, Reg *s, uint32_t mode)) | ||
307 | +DEF_HELPER(void, glue(helper_blendps, SUFFIX), (Reg *d, Reg *s, uint32_t imm)) | ||
308 | +DEF_HELPER(void, glue(helper_blendpd, SUFFIX), (Reg *d, Reg *s, uint32_t imm)) | ||
309 | +DEF_HELPER(void, glue(helper_pblendw, SUFFIX), (Reg *d, Reg *s, uint32_t imm)) | ||
310 | +DEF_HELPER(void, glue(helper_dpps, SUFFIX), (Reg *d, Reg *s, uint32_t mask)) | ||
311 | +DEF_HELPER(void, glue(helper_dppd, SUFFIX), (Reg *d, Reg *s, uint32_t mask)) | ||
312 | +DEF_HELPER(void, glue(helper_mpsadbw, SUFFIX), (Reg *d, Reg *s, uint32_t off)) | ||
313 | +#endif | ||
314 | + | ||
315 | +/* SSE4.2 op helpers */ | ||
316 | +#if SHIFT == 1 | ||
317 | +DEF_HELPER(void, glue(helper_pcmpgtq, SUFFIX), (Reg *d, Reg *s)) | ||
318 | +DEF_HELPER(void, glue(helper_pcmpestri, SUFFIX), (Reg *d, Reg *s, uint32_t ctl)) | ||
319 | +DEF_HELPER(void, glue(helper_pcmpestrm, SUFFIX), (Reg *d, Reg *s, uint32_t ctl)) | ||
320 | +DEF_HELPER(void, glue(helper_pcmpistri, SUFFIX), (Reg *d, Reg *s, uint32_t ctl)) | ||
321 | +DEF_HELPER(void, glue(helper_pcmpistrm, SUFFIX), (Reg *d, Reg *s, uint32_t ctl)) | ||
322 | +DEF_HELPER(target_ulong, helper_crc32, | ||
323 | + (uint32_t crc1, target_ulong msg, uint32_t len)) | ||
324 | +DEF_HELPER(target_ulong, helper_popcnt, (target_ulong n, uint32_t type)) | ||
325 | +#endif | ||
326 | + | ||
272 | #undef SHIFT | 327 | #undef SHIFT |
273 | #undef Reg | 328 | #undef Reg |
274 | #undef SUFFIX | 329 | #undef SUFFIX |
target-i386/translate.c
@@ -2140,7 +2140,7 @@ static void gen_add_A0_ds_seg(DisasContext *s) | @@ -2140,7 +2140,7 @@ static void gen_add_A0_ds_seg(DisasContext *s) | ||
2140 | } | 2140 | } |
2141 | } | 2141 | } |
2142 | 2142 | ||
2143 | -/* generate modrm memory load or store of 'reg'. TMP0 is used if reg != | 2143 | +/* generate modrm memory load or store of 'reg'. TMP0 is used if reg == |
2144 | OR_TMP0 */ | 2144 | OR_TMP0 */ |
2145 | static void gen_ldst_modrm(DisasContext *s, int modrm, int ot, int reg, int is_store) | 2145 | static void gen_ldst_modrm(DisasContext *s, int modrm, int ot, int reg, int is_store) |
2146 | { | 2146 | { |
@@ -2770,8 +2770,8 @@ static void *sse_op_table1[256][4] = { | @@ -2770,8 +2770,8 @@ static void *sse_op_table1[256][4] = { | ||
2770 | [0xc2] = SSE_FOP(cmpeq), | 2770 | [0xc2] = SSE_FOP(cmpeq), |
2771 | [0xc6] = { helper_shufps, helper_shufpd }, | 2771 | [0xc6] = { helper_shufps, helper_shufpd }, |
2772 | 2772 | ||
2773 | - [0x38] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3 */ | ||
2774 | - [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3 */ | 2773 | + [0x38] = { SSE_SPECIAL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* SSSE3/SSE4 */ |
2774 | + [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3/SSE4 */ | ||
2775 | 2775 | ||
2776 | /* MMX ops and their SSE extensions */ | 2776 | /* MMX ops and their SSE extensions */ |
2777 | [0x60] = MMX_OP2(punpcklbw), | 2777 | [0x60] = MMX_OP2(punpcklbw), |
@@ -2924,26 +2924,85 @@ static void *sse_op_table5[256] = { | @@ -2924,26 +2924,85 @@ static void *sse_op_table5[256] = { | ||
2924 | [0xbf] = helper_pavgb_mmx /* pavgusb */ | 2924 | [0xbf] = helper_pavgb_mmx /* pavgusb */ |
2925 | }; | 2925 | }; |
2926 | 2926 | ||
2927 | -static void *sse_op_table6[256][2] = { | ||
2928 | - [0x00] = MMX_OP2(pshufb), | ||
2929 | - [0x01] = MMX_OP2(phaddw), | ||
2930 | - [0x02] = MMX_OP2(phaddd), | ||
2931 | - [0x03] = MMX_OP2(phaddsw), | ||
2932 | - [0x04] = MMX_OP2(pmaddubsw), | ||
2933 | - [0x05] = MMX_OP2(phsubw), | ||
2934 | - [0x06] = MMX_OP2(phsubd), | ||
2935 | - [0x07] = MMX_OP2(phsubsw), | ||
2936 | - [0x08] = MMX_OP2(psignb), | ||
2937 | - [0x09] = MMX_OP2(psignw), | ||
2938 | - [0x0a] = MMX_OP2(psignd), | ||
2939 | - [0x0b] = MMX_OP2(pmulhrsw), | ||
2940 | - [0x1c] = MMX_OP2(pabsb), | ||
2941 | - [0x1d] = MMX_OP2(pabsw), | ||
2942 | - [0x1e] = MMX_OP2(pabsd), | 2927 | +struct sse_op_helper_s { |
2928 | + void *op[2]; uint32_t ext_mask; | ||
2929 | +}; | ||
2930 | +#define SSSE3_OP(x) { MMX_OP2(x), CPUID_EXT_SSSE3 } | ||
2931 | +#define SSE41_OP(x) { { NULL, helper_ ## x ## _xmm }, CPUID_EXT_SSE41 } | ||
2932 | +#define SSE42_OP(x) { { NULL, helper_ ## x ## _xmm }, CPUID_EXT_SSE42 } | ||
2933 | +#define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 } | ||
2934 | +static struct sse_op_helper_s sse_op_table6[256] = { | ||
2935 | + [0x00] = SSSE3_OP(pshufb), | ||
2936 | + [0x01] = SSSE3_OP(phaddw), | ||
2937 | + [0x02] = SSSE3_OP(phaddd), | ||
2938 | + [0x03] = SSSE3_OP(phaddsw), | ||
2939 | + [0x04] = SSSE3_OP(pmaddubsw), | ||
2940 | + [0x05] = SSSE3_OP(phsubw), | ||
2941 | + [0x06] = SSSE3_OP(phsubd), | ||
2942 | + [0x07] = SSSE3_OP(phsubsw), | ||
2943 | + [0x08] = SSSE3_OP(psignb), | ||
2944 | + [0x09] = SSSE3_OP(psignw), | ||
2945 | + [0x0a] = SSSE3_OP(psignd), | ||
2946 | + [0x0b] = SSSE3_OP(pmulhrsw), | ||
2947 | + [0x10] = SSE41_OP(pblendvb), | ||
2948 | + [0x14] = SSE41_OP(blendvps), | ||
2949 | + [0x15] = SSE41_OP(blendvpd), | ||
2950 | + [0x17] = SSE41_OP(ptest), | ||
2951 | + [0x1c] = SSSE3_OP(pabsb), | ||
2952 | + [0x1d] = SSSE3_OP(pabsw), | ||
2953 | + [0x1e] = SSSE3_OP(pabsd), | ||
2954 | + [0x20] = SSE41_OP(pmovsxbw), | ||
2955 | + [0x21] = SSE41_OP(pmovsxbd), | ||
2956 | + [0x22] = SSE41_OP(pmovsxbq), | ||
2957 | + [0x23] = SSE41_OP(pmovsxwd), | ||
2958 | + [0x24] = SSE41_OP(pmovsxwq), | ||
2959 | + [0x25] = SSE41_OP(pmovsxdq), | ||
2960 | + [0x28] = SSE41_OP(pmuldq), | ||
2961 | + [0x29] = SSE41_OP(pcmpeqq), | ||
2962 | + [0x2a] = SSE41_SPECIAL, /* movntqda */ | ||
2963 | + [0x2b] = SSE41_OP(packusdw), | ||
2964 | + [0x30] = SSE41_OP(pmovzxbw), | ||
2965 | + [0x31] = SSE41_OP(pmovzxbd), | ||
2966 | + [0x32] = SSE41_OP(pmovzxbq), | ||
2967 | + [0x33] = SSE41_OP(pmovzxwd), | ||
2968 | + [0x34] = SSE41_OP(pmovzxwq), | ||
2969 | + [0x35] = SSE41_OP(pmovzxdq), | ||
2970 | + [0x37] = SSE42_OP(pcmpgtq), | ||
2971 | + [0x38] = SSE41_OP(pminsb), | ||
2972 | + [0x39] = SSE41_OP(pminsd), | ||
2973 | + [0x3a] = SSE41_OP(pminuw), | ||
2974 | + [0x3b] = SSE41_OP(pminud), | ||
2975 | + [0x3c] = SSE41_OP(pmaxsb), | ||
2976 | + [0x3d] = SSE41_OP(pmaxsd), | ||
2977 | + [0x3e] = SSE41_OP(pmaxuw), | ||
2978 | + [0x3f] = SSE41_OP(pmaxud), | ||
2979 | + [0x40] = SSE41_OP(pmulld), | ||
2980 | + [0x41] = SSE41_OP(phminposuw), | ||
2943 | }; | 2981 | }; |
2944 | 2982 | ||
2945 | -static void *sse_op_table7[256][2] = { | ||
2946 | - [0x0f] = MMX_OP2(palignr), | 2983 | +static struct sse_op_helper_s sse_op_table7[256] = { |
2984 | + [0x08] = SSE41_OP(roundps), | ||
2985 | + [0x09] = SSE41_OP(roundpd), | ||
2986 | + [0x0a] = SSE41_OP(roundss), | ||
2987 | + [0x0b] = SSE41_OP(roundsd), | ||
2988 | + [0x0c] = SSE41_OP(blendps), | ||
2989 | + [0x0d] = SSE41_OP(blendpd), | ||
2990 | + [0x0e] = SSE41_OP(pblendw), | ||
2991 | + [0x0f] = SSSE3_OP(palignr), | ||
2992 | + [0x14] = SSE41_SPECIAL, /* pextrb */ | ||
2993 | + [0x15] = SSE41_SPECIAL, /* pextrw */ | ||
2994 | + [0x16] = SSE41_SPECIAL, /* pextrd/pextrq */ | ||
2995 | + [0x17] = SSE41_SPECIAL, /* extractps */ | ||
2996 | + [0x20] = SSE41_SPECIAL, /* pinsrb */ | ||
2997 | + [0x21] = SSE41_SPECIAL, /* insertps */ | ||
2998 | + [0x22] = SSE41_SPECIAL, /* pinsrd/pinsrq */ | ||
2999 | + [0x40] = SSE41_OP(dpps), | ||
3000 | + [0x41] = SSE41_OP(dppd), | ||
3001 | + [0x42] = SSE41_OP(mpsadbw), | ||
3002 | + [0x60] = SSE42_OP(pcmpestrm), | ||
3003 | + [0x61] = SSE42_OP(pcmpestri), | ||
3004 | + [0x62] = SSE42_OP(pcmpistrm), | ||
3005 | + [0x63] = SSE42_OP(pcmpistri), | ||
2947 | }; | 3006 | }; |
2948 | 3007 | ||
2949 | static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) | 3008 | static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) |
@@ -3511,18 +3570,20 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) | @@ -3511,18 +3570,20 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) | ||
3511 | break; | 3570 | break; |
3512 | case 0x038: | 3571 | case 0x038: |
3513 | case 0x138: | 3572 | case 0x138: |
3514 | - if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3)) | ||
3515 | - goto illegal_op; | ||
3516 | - | ||
3517 | b = modrm; | 3573 | b = modrm; |
3518 | modrm = ldub_code(s->pc++); | 3574 | modrm = ldub_code(s->pc++); |
3519 | rm = modrm & 7; | 3575 | rm = modrm & 7; |
3520 | reg = ((modrm >> 3) & 7) | rex_r; | 3576 | reg = ((modrm >> 3) & 7) | rex_r; |
3521 | mod = (modrm >> 6) & 3; | 3577 | mod = (modrm >> 6) & 3; |
3522 | 3578 | ||
3523 | - sse_op2 = sse_op_table6[b][b1]; | 3579 | + if (s->prefix & PREFIX_REPNZ) |
3580 | + goto crc32; | ||
3581 | + | ||
3582 | + sse_op2 = sse_op_table6[b].op[b1]; | ||
3524 | if (!sse_op2) | 3583 | if (!sse_op2) |
3525 | goto illegal_op; | 3584 | goto illegal_op; |
3585 | + if (!(s->cpuid_ext_features & sse_op_table6[b].ext_mask)) | ||
3586 | + goto illegal_op; | ||
3526 | 3587 | ||
3527 | if (b1) { | 3588 | if (b1) { |
3528 | op1_offset = offsetof(CPUX86State,xmm_regs[reg]); | 3589 | op1_offset = offsetof(CPUX86State,xmm_regs[reg]); |
@@ -3531,7 +3592,32 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) | @@ -3531,7 +3592,32 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) | ||
3531 | } else { | 3592 | } else { |
3532 | op2_offset = offsetof(CPUX86State,xmm_t0); | 3593 | op2_offset = offsetof(CPUX86State,xmm_t0); |
3533 | gen_lea_modrm(s, modrm, &reg_addr, &offset_addr); | 3594 | gen_lea_modrm(s, modrm, &reg_addr, &offset_addr); |
3534 | - gen_ldo_env_A0(s->mem_index, op2_offset); | 3595 | + switch (b) { |
3596 | + case 0x20: case 0x30: /* pmovsxbw, pmovzxbw */ | ||
3597 | + case 0x23: case 0x33: /* pmovsxwd, pmovzxwd */ | ||
3598 | + case 0x25: case 0x35: /* pmovsxdq, pmovzxdq */ | ||
3599 | + gen_ldq_env_A0(s->mem_index, op2_offset + | ||
3600 | + offsetof(XMMReg, XMM_Q(0))); | ||
3601 | + break; | ||
3602 | + case 0x21: case 0x31: /* pmovsxbd, pmovzxbd */ | ||
3603 | + case 0x24: case 0x34: /* pmovsxwq, pmovzxwq */ | ||
3604 | + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0, | ||
3605 | + (s->mem_index >> 2) - 1); | ||
3606 | + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, op2_offset + | ||
3607 | + offsetof(XMMReg, XMM_L(0))); | ||
3608 | + break; | ||
3609 | + case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */ | ||
3610 | + tcg_gen_qemu_ld16u(cpu_tmp0, cpu_A0, | ||
3611 | + (s->mem_index >> 2) - 1); | ||
3612 | + tcg_gen_st16_tl(cpu_tmp0, cpu_env, op2_offset + | ||
3613 | + offsetof(XMMReg, XMM_W(0))); | ||
3614 | + break; | ||
3615 | + case 0x2a: /* movntqda */ | ||
3616 | + gen_ldo_env_A0(s->mem_index, op1_offset); | ||
3617 | + return; | ||
3618 | + default: | ||
3619 | + gen_ldo_env_A0(s->mem_index, op2_offset); | ||
3620 | + } | ||
3535 | } | 3621 | } |
3536 | } else { | 3622 | } else { |
3537 | op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); | 3623 | op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); |
@@ -3543,24 +3629,177 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) | @@ -3543,24 +3629,177 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) | ||
3543 | gen_ldq_env_A0(s->mem_index, op2_offset); | 3629 | gen_ldq_env_A0(s->mem_index, op2_offset); |
3544 | } | 3630 | } |
3545 | } | 3631 | } |
3632 | + if (sse_op2 == SSE_SPECIAL) | ||
3633 | + goto illegal_op; | ||
3634 | + | ||
3546 | tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); | 3635 | tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); |
3547 | tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); | 3636 | tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); |
3548 | tcg_gen_helper_0_2(sse_op2, cpu_ptr0, cpu_ptr1); | 3637 | tcg_gen_helper_0_2(sse_op2, cpu_ptr0, cpu_ptr1); |
3638 | + | ||
3639 | + if (b == 0x17) | ||
3640 | + s->cc_op = CC_OP_EFLAGS; | ||
3549 | break; | 3641 | break; |
3550 | - case 0x03a: | ||
3551 | - case 0x13a: | ||
3552 | - if (!(s->cpuid_ext_features & CPUID_EXT_SSSE3)) | 3642 | + case 0x338: /* crc32 */ |
3643 | + crc32: | ||
3644 | + b = modrm; | ||
3645 | + modrm = ldub_code(s->pc++); | ||
3646 | + reg = ((modrm >> 3) & 7) | rex_r; | ||
3647 | + | ||
3648 | + if (b != 0xf0 && b != 0xf1) | ||
3649 | + goto illegal_op; | ||
3650 | + if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) | ||
3553 | goto illegal_op; | 3651 | goto illegal_op; |
3554 | 3652 | ||
3653 | + if (b == 0xf0) | ||
3654 | + ot = OT_BYTE; | ||
3655 | + else if (b == 0xf1 && s->dflag != 2) | ||
3656 | + if (s->prefix & PREFIX_DATA) | ||
3657 | + ot = OT_WORD; | ||
3658 | + else | ||
3659 | + ot = OT_LONG; | ||
3660 | + else | ||
3661 | + ot = OT_QUAD; | ||
3662 | + | ||
3663 | + gen_op_mov_TN_reg(OT_LONG, 0, reg); | ||
3664 | + tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); | ||
3665 | + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); | ||
3666 | + tcg_gen_helper_1_3(helper_crc32, cpu_T[0], cpu_tmp2_i32, | ||
3667 | + cpu_T[0], tcg_const_i32(8 << ot)); | ||
3668 | + | ||
3669 | + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG; | ||
3670 | + gen_op_mov_reg_T0(ot, reg); | ||
3671 | + break; | ||
3672 | + case 0x03a: | ||
3673 | + case 0x13a: | ||
3555 | b = modrm; | 3674 | b = modrm; |
3556 | modrm = ldub_code(s->pc++); | 3675 | modrm = ldub_code(s->pc++); |
3557 | rm = modrm & 7; | 3676 | rm = modrm & 7; |
3558 | reg = ((modrm >> 3) & 7) | rex_r; | 3677 | reg = ((modrm >> 3) & 7) | rex_r; |
3559 | mod = (modrm >> 6) & 3; | 3678 | mod = (modrm >> 6) & 3; |
3560 | 3679 | ||
3561 | - sse_op2 = sse_op_table7[b][b1]; | 3680 | + sse_op2 = sse_op_table7[b].op[b1]; |
3562 | if (!sse_op2) | 3681 | if (!sse_op2) |
3563 | goto illegal_op; | 3682 | goto illegal_op; |
3683 | + if (!(s->cpuid_ext_features & sse_op_table7[b].ext_mask)) | ||
3684 | + goto illegal_op; | ||
3685 | + | ||
3686 | + if (sse_op2 == SSE_SPECIAL) { | ||
3687 | + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG; | ||
3688 | + rm = (modrm & 7) | REX_B(s); | ||
3689 | + if (mod != 3) | ||
3690 | + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr); | ||
3691 | + reg = ((modrm >> 3) & 7) | rex_r; | ||
3692 | + val = ldub_code(s->pc++); | ||
3693 | + switch (b) { | ||
3694 | + case 0x14: /* pextrb */ | ||
3695 | + tcg_gen_ld8u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, | ||
3696 | + xmm_regs[reg].XMM_B(val & 15))); | ||
3697 | + if (mod == 3) | ||
3698 | + gen_op_mov_reg_T0(ot, rm); | ||
3699 | + else | ||
3700 | + tcg_gen_qemu_st8(cpu_T[0], cpu_A0, | ||
3701 | + (s->mem_index >> 2) - 1); | ||
3702 | + break; | ||
3703 | + case 0x15: /* pextrw */ | ||
3704 | + tcg_gen_ld16u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, | ||
3705 | + xmm_regs[reg].XMM_W(val & 7))); | ||
3706 | + if (mod == 3) | ||
3707 | + gen_op_mov_reg_T0(ot, rm); | ||
3708 | + else | ||
3709 | + tcg_gen_qemu_st16(cpu_T[0], cpu_A0, | ||
3710 | + (s->mem_index >> 2) - 1); | ||
3711 | + break; | ||
3712 | + case 0x16: | ||
3713 | + if (ot == OT_LONG) { /* pextrd */ | ||
3714 | + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env, | ||
3715 | + offsetof(CPUX86State, | ||
3716 | + xmm_regs[reg].XMM_L(val & 3))); | ||
3717 | + if (mod == 3) | ||
3718 | + gen_op_mov_reg_v(ot, rm, cpu_tmp2_i32); | ||
3719 | + else | ||
3720 | + tcg_gen_qemu_st32(cpu_tmp2_i32, cpu_A0, | ||
3721 | + (s->mem_index >> 2) - 1); | ||
3722 | + } else { /* pextrq */ | ||
3723 | + tcg_gen_ld_i64(cpu_tmp1_i64, cpu_env, | ||
3724 | + offsetof(CPUX86State, | ||
3725 | + xmm_regs[reg].XMM_Q(val & 1))); | ||
3726 | + if (mod == 3) | ||
3727 | + gen_op_mov_reg_v(ot, rm, cpu_tmp1_i64); | ||
3728 | + else | ||
3729 | + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, | ||
3730 | + (s->mem_index >> 2) - 1); | ||
3731 | + } | ||
3732 | + break; | ||
3733 | + case 0x17: /* extractps */ | ||
3734 | + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, | ||
3735 | + xmm_regs[reg].XMM_L(val & 3))); | ||
3736 | + if (mod == 3) | ||
3737 | + gen_op_mov_reg_T0(ot, rm); | ||
3738 | + else | ||
3739 | + tcg_gen_qemu_st32(cpu_T[0], cpu_A0, | ||
3740 | + (s->mem_index >> 2) - 1); | ||
3741 | + break; | ||
3742 | + case 0x20: /* pinsrb */ | ||
3743 | + if (mod == 3) | ||
3744 | + gen_op_mov_TN_reg(OT_LONG, 0, rm); | ||
3745 | + else | ||
3746 | + tcg_gen_qemu_ld8u(cpu_T[0], cpu_A0, | ||
3747 | + (s->mem_index >> 2) - 1); | ||
3748 | + tcg_gen_st8_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, | ||
3749 | + xmm_regs[reg].XMM_B(val & 15))); | ||
3750 | + break; | ||
3751 | + case 0x21: /* insertps */ | ||
3752 | + if (mod == 3) | ||
3753 | + tcg_gen_ld_i32(cpu_tmp2_i32, cpu_env, | ||
3754 | + offsetof(CPUX86State,xmm_regs[rm] | ||
3755 | + .XMM_L((val >> 6) & 3))); | ||
3756 | + else | ||
3757 | + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0, | ||
3758 | + (s->mem_index >> 2) - 1); | ||
3759 | + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, | ||
3760 | + offsetof(CPUX86State,xmm_regs[reg] | ||
3761 | + .XMM_L((val >> 4) & 3))); | ||
3762 | + if ((val >> 0) & 1) | ||
3763 | + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), | ||
3764 | + cpu_env, offsetof(CPUX86State, | ||
3765 | + xmm_regs[reg].XMM_L(0))); | ||
3766 | + if ((val >> 1) & 1) | ||
3767 | + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), | ||
3768 | + cpu_env, offsetof(CPUX86State, | ||
3769 | + xmm_regs[reg].XMM_L(1))); | ||
3770 | + if ((val >> 2) & 1) | ||
3771 | + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), | ||
3772 | + cpu_env, offsetof(CPUX86State, | ||
3773 | + xmm_regs[reg].XMM_L(2))); | ||
3774 | + if ((val >> 3) & 1) | ||
3775 | + tcg_gen_st_i32(tcg_const_i32(0 /*float32_zero*/), | ||
3776 | + cpu_env, offsetof(CPUX86State, | ||
3777 | + xmm_regs[reg].XMM_L(3))); | ||
3778 | + break; | ||
3779 | + case 0x22: | ||
3780 | + if (ot == OT_LONG) { /* pinsrd */ | ||
3781 | + if (mod == 3) | ||
3782 | + gen_op_mov_v_reg(ot, cpu_tmp2_i32, rm); | ||
3783 | + else | ||
3784 | + tcg_gen_qemu_ld32u(cpu_tmp2_i32, cpu_A0, | ||
3785 | + (s->mem_index >> 2) - 1); | ||
3786 | + tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, | ||
3787 | + offsetof(CPUX86State, | ||
3788 | + xmm_regs[reg].XMM_L(val & 3))); | ||
3789 | + } else { /* pinsrq */ | ||
3790 | + if (mod == 3) | ||
3791 | + gen_op_mov_v_reg(ot, cpu_tmp1_i64, rm); | ||
3792 | + else | ||
3793 | + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, | ||
3794 | + (s->mem_index >> 2) - 1); | ||
3795 | + tcg_gen_st_i64(cpu_tmp1_i64, cpu_env, | ||
3796 | + offsetof(CPUX86State, | ||
3797 | + xmm_regs[reg].XMM_Q(val & 1))); | ||
3798 | + } | ||
3799 | + break; | ||
3800 | + } | ||
3801 | + return; | ||
3802 | + } | ||
3564 | 3803 | ||
3565 | if (b1) { | 3804 | if (b1) { |
3566 | op1_offset = offsetof(CPUX86State,xmm_regs[reg]); | 3805 | op1_offset = offsetof(CPUX86State,xmm_regs[reg]); |
@@ -3583,6 +3822,14 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) | @@ -3583,6 +3822,14 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) | ||
3583 | } | 3822 | } |
3584 | val = ldub_code(s->pc++); | 3823 | val = ldub_code(s->pc++); |
3585 | 3824 | ||
3825 | + if ((b & 0xfc) == 0x60) { /* pcmpXstrX */ | ||
3826 | + s->cc_op = CC_OP_EFLAGS; | ||
3827 | + | ||
3828 | + if (s->dflag == 2) | ||
3829 | + /* The helper must use entire 64-bit gp registers */ | ||
3830 | + val |= 1 << 8; | ||
3831 | + } | ||
3832 | + | ||
3586 | tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); | 3833 | tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset); |
3587 | tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); | 3834 | tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset); |
3588 | tcg_gen_helper_0_3(sse_op2, cpu_ptr0, cpu_ptr1, tcg_const_i32(val)); | 3835 | tcg_gen_helper_0_3(sse_op2, cpu_ptr0, cpu_ptr1, tcg_const_i32(val)); |
@@ -7094,7 +7341,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) | @@ -7094,7 +7341,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) | ||
7094 | gen_eob(s); | 7341 | gen_eob(s); |
7095 | } | 7342 | } |
7096 | break; | 7343 | break; |
7097 | - /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3 support */ | 7344 | + /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4 support */ |
7098 | case 0x1c3: /* MOVNTI reg, mem */ | 7345 | case 0x1c3: /* MOVNTI reg, mem */ |
7099 | if (!(s->cpuid_features & CPUID_SSE2)) | 7346 | if (!(s->cpuid_features & CPUID_SSE2)) |
7100 | goto illegal_op; | 7347 | goto illegal_op; |
@@ -7202,6 +7449,28 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) | @@ -7202,6 +7449,28 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) | ||
7202 | tcg_gen_helper_0_0(helper_rsm); | 7449 | tcg_gen_helper_0_0(helper_rsm); |
7203 | gen_eob(s); | 7450 | gen_eob(s); |
7204 | break; | 7451 | break; |
7452 | + case 0x1b8: /* SSE4.2 popcnt */ | ||
7453 | + if ((prefixes & (PREFIX_REPZ | PREFIX_LOCK | PREFIX_REPNZ)) != | ||
7454 | + PREFIX_REPZ) | ||
7455 | + goto illegal_op; | ||
7456 | + if (!(s->cpuid_ext_features & CPUID_EXT_POPCNT)) | ||
7457 | + goto illegal_op; | ||
7458 | + | ||
7459 | + modrm = ldub_code(s->pc++); | ||
7460 | + reg = ((modrm >> 3) & 7); | ||
7461 | + | ||
7462 | + if (s->prefix & PREFIX_DATA) | ||
7463 | + ot = OT_WORD; | ||
7464 | + else if (s->dflag != 2) | ||
7465 | + ot = OT_LONG; | ||
7466 | + else | ||
7467 | + ot = OT_QUAD; | ||
7468 | + | ||
7469 | + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0); | ||
7470 | + tcg_gen_helper_1_2(helper_popcnt, | ||
7471 | + cpu_T[0], cpu_T[0], tcg_const_i32(ot)); | ||
7472 | + gen_op_mov_reg_T0(ot, reg); | ||
7473 | + break; | ||
7205 | case 0x10e ... 0x10f: | 7474 | case 0x10e ... 0x10f: |
7206 | /* 3DNow! instructions, ignore prefixes */ | 7475 | /* 3DNow! instructions, ignore prefixes */ |
7207 | s->prefix &= ~(PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA); | 7476 | s->prefix &= ~(PREFIX_REPZ | PREFIX_REPNZ | PREFIX_DATA); |
tests/test-i386-ssse3.c
1 | /* See if various MMX/SSE SSSE3 instructions give expected results */ | 1 | /* See if various MMX/SSE SSSE3 instructions give expected results */ |
2 | #include <stdio.h> | 2 | #include <stdio.h> |
3 | #include <string.h> | 3 | #include <string.h> |
4 | +#include <stdint.h> | ||
4 | 5 | ||
5 | int main(int argc, char *argv[]) { | 6 | int main(int argc, char *argv[]) { |
6 | char hello[16]; | 7 | char hello[16]; |
@@ -9,9 +10,11 @@ int main(int argc, char *argv[]) { | @@ -9,9 +10,11 @@ int main(int argc, char *argv[]) { | ||
9 | 10 | ||
10 | uint64_t a = 0x0000000000090007; | 11 | uint64_t a = 0x0000000000090007; |
11 | uint64_t b = 0x0000000000000000; | 12 | uint64_t b = 0x0000000000000000; |
13 | + uint32_t c; | ||
14 | + uint16_t d; | ||
12 | 15 | ||
13 | - const char c[16] = "LLOaaaaaaaaaaaaa"; | ||
14 | - const char d[16] = "aaaaaaaaaaaaaaHE"; | 16 | + const char e[16] = "LLOaaaaaaaaaaaaa"; |
17 | + const char f[16] = "aaaaaaaaaaaaaaHE"; | ||
15 | 18 | ||
16 | /* pshufb mm1/xmm1, mm2/xmm2 */ | 19 | /* pshufb mm1/xmm1, mm2/xmm2 */ |
17 | asm volatile ("movq (%0), %%mm0" : : "r" (ehlo) : "mm0", "mm1"); | 20 | asm volatile ("movq (%0), %%mm0" : : "r" (ehlo) : "mm0", "mm1"); |
@@ -33,10 +36,22 @@ int main(int argc, char *argv[]) { | @@ -33,10 +36,22 @@ int main(int argc, char *argv[]) { | ||
33 | printf("%i - %i = %i\n", 9, 7, -(int16_t) a); | 36 | printf("%i - %i = %i\n", 9, 7, -(int16_t) a); |
34 | 37 | ||
35 | /* palignr mm1/xmm1, m64/m128, imm8 */ | 38 | /* palignr mm1/xmm1, m64/m128, imm8 */ |
36 | - asm volatile ("movdqa (%0), %%xmm0" : : "r" (c) : "xmm0"); | ||
37 | - asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (d)); | 39 | + asm volatile ("movdqa (%0), %%xmm0" : : "r" (e) : "xmm0"); |
40 | + asm volatile ("palignr $14, (%0), %%xmm0" : : "r" (f)); | ||
38 | asm volatile ("movdqa %%xmm0, (%0)" : : "r" (hello)); | 41 | asm volatile ("movdqa %%xmm0, (%0)" : : "r" (hello)); |
39 | printf("%5.5s\n", hello); | 42 | printf("%5.5s\n", hello); |
40 | 43 | ||
44 | +#if 1 /* SSE4 */ | ||
45 | + /* popcnt r64, r/m64 */ | ||
46 | + asm volatile ("movq $0x8421000010009c63, %%rax" : : : "rax"); | ||
47 | + asm volatile ("popcnt %%ax, %%dx" : : : "dx"); | ||
48 | + asm volatile ("popcnt %%eax, %%ecx" : : : "ecx"); | ||
49 | + asm volatile ("popcnt %rax, %rax"); | ||
50 | + asm volatile ("movq %%rax, %0" : "=m" (a)); | ||
51 | + asm volatile ("movl %%ecx, %0" : "=m" (c)); | ||
52 | + asm volatile ("movw %%dx, %0" : "=m" (d)); | ||
53 | + printf("%i = %i\n%i = %i = %i\n", 13, (int) a, 9, c, d + 1); | ||
54 | +#endif | ||
55 | + | ||
41 | return 0; | 56 | return 0; |
42 | } | 57 | } |