Commit 664e0f195adda3cf01b40d8d1aa79bbc24ad5fab

Authored by bellard
1 parent 085339a1

MMX/SSE support


git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@1205 c046a42c-6fe2-441c-8c8c-71466251a162
Changelog
... ... @@ -11,6 +11,7 @@ version 0.6.2:
11 11 - added generic 64 bit target support
12 12 - initial x86_64 target support
13 13 - initial APIC support
  14 + - MMX/SSE/SSE2/PNI support
14 15  
15 16 version 0.6.1:
16 17  
... ...
Makefile.target
... ... @@ -392,7 +392,7 @@ helper.o: helper.c
392 392 $(CC) $(HELPER_CFLAGS) $(DEFINES) -c -o $@ $<
393 393  
394 394 ifeq ($(TARGET_BASE_ARCH), i386)
395   -op.o: op.c opreg_template.h ops_template.h ops_template_mem.h ops_mem.h
  395 +op.o: op.c opreg_template.h ops_template.h ops_template_mem.h ops_mem.h ops_sse.h
396 396 endif
397 397  
398 398 ifeq ($(TARGET_ARCH), arm)
... ...
linux-user/main.c
... ... @@ -1052,8 +1052,8 @@ int main(int argc, char **argv)
1052 1052 cpu_x86_set_cpl(env, 3);
1053 1053  
1054 1054 env->cr[0] = CR0_PG_MASK | CR0_WP_MASK | CR0_PE_MASK;
1055   - env->hflags |= HF_PE_MASK;
1056   -
  1055 + env->hflags |= HF_PE_MASK | HF_OSFXSR_MASK;
  1056 +
1057 1057 /* flags setup : we activate the IRQs by default as in user mode */
1058 1058 env->eflags |= IF_MASK;
1059 1059  
... ...
target-i386/cpu.h
... ... @@ -135,6 +135,7 @@
135 135 #define HF_IOPL_SHIFT 12 /* must be same as eflags */
136 136 #define HF_LMA_SHIFT 14 /* only used on x86_64: long mode active */
137 137 #define HF_CS64_SHIFT 15 /* only used on x86_64: 64 bit code segment */
  138 +#define HF_OSFXSR_SHIFT 16 /* CR4.OSFXSR */
138 139 #define HF_VM_SHIFT 17 /* must be same as eflags */
139 140  
140 141 #define HF_CPL_MASK (3 << HF_CPL_SHIFT)
... ... @@ -150,6 +151,7 @@
150 151 #define HF_TS_MASK (1 << HF_TS_SHIFT)
151 152 #define HF_LMA_MASK (1 << HF_LMA_SHIFT)
152 153 #define HF_CS64_MASK (1 << HF_CS64_SHIFT)
  154 +#define HF_OSFXSR_MASK (1 << HF_OSFXSR_SHIFT)
153 155  
154 156 #define CR0_PE_MASK (1 << 0)
155 157 #define CR0_MP_MASK (1 << 1)
... ... @@ -340,10 +342,12 @@ typedef struct SegmentCache {
340 342 } SegmentCache;
341 343  
342 344 typedef union {
343   - uint8_t _b[16];
344   - uint16_t _w[8];
345   - uint32_t _l[4];
346   - uint64_t _q[2];
  345 + uint8_t _b[16];
  346 + uint16_t _w[8];
  347 + uint32_t _l[4];
  348 + uint64_t _q[2];
  349 + float _s[4];
  350 + double _d[2];
347 351 } XMMReg;
348 352  
349 353 typedef union {
... ... @@ -357,7 +361,9 @@ typedef union {
357 361 #define XMM_B(n) _b[15 - (n)]
358 362 #define XMM_W(n) _w[7 - (n)]
359 363 #define XMM_L(n) _l[3 - (n)]
  364 +#define XMM_S(n) _s[3 - (n)]
360 365 #define XMM_Q(n) _q[1 - (n)]
  366 +#define XMM_D(n) _d[1 - (n)]
361 367  
362 368 #define MMX_B(n) _b[7 - (n)]
363 369 #define MMX_W(n) _w[3 - (n)]
... ... @@ -366,12 +372,15 @@ typedef union {
366 372 #define XMM_B(n) _b[n]
367 373 #define XMM_W(n) _w[n]
368 374 #define XMM_L(n) _l[n]
  375 +#define XMM_S(n) _s[n]
369 376 #define XMM_Q(n) _q[n]
  377 +#define XMM_D(n) _d[n]
370 378  
371 379 #define MMX_B(n) _b[n]
372 380 #define MMX_W(n) _w[n]
373 381 #define MMX_L(n) _l[n]
374 382 #endif
  383 +#define MMX_Q(n) q
375 384  
376 385 #ifdef TARGET_X86_64
377 386 #define CPU_NB_REGS 16
... ... @@ -404,7 +413,14 @@ typedef struct CPUX86State {
404 413 unsigned int fpus;
405 414 unsigned int fpuc;
406 415 uint8_t fptags[8]; /* 0 = valid, 1 = empty */
407   - CPU86_LDouble fpregs[8];
  416 + union {
  417 +#ifdef USE_X86LDOUBLE
  418 + CPU86_LDouble d __attribute__((aligned(16)));
  419 +#else
  420 + CPU86_LDouble d;
  421 +#endif
  422 + MMXReg mmx;
  423 + } fpregs[8];
408 424  
409 425 /* emulator internal variables */
410 426 CPU86_LDouble ft0;
... ... @@ -421,9 +437,11 @@ typedef struct CPUX86State {
421 437 SegmentCache tr;
422 438 SegmentCache gdt; /* only base and limit are used */
423 439 SegmentCache idt; /* only base and limit are used */
424   -
  440 +
  441 + uint32_t mxcsr;
425 442 XMMReg xmm_regs[CPU_NB_REGS];
426 443 XMMReg xmm_t0;
  444 + MMXReg mmx_t0;
427 445  
428 446 /* sysenter registers */
429 447 uint32_t sysenter_cs;
... ...
target-i386/exec.h
... ... @@ -131,8 +131,8 @@ extern int loglevel;
131 131  
132 132 /* float macros */
133 133 #define FT0 (env->ft0)
134   -#define ST0 (env->fpregs[env->fpstt])
135   -#define ST(n) (env->fpregs[(env->fpstt + (n)) & 7])
  134 +#define ST0 (env->fpregs[env->fpstt].d)
  135 +#define ST(n) (env->fpregs[(env->fpstt + (n)) & 7].d)
136 136 #define ST1 ST(1)
137 137  
138 138 #ifdef USE_FP_CONVERT
... ... @@ -459,7 +459,7 @@ static inline CPU86_LDouble helper_fldt(target_ulong ptr)
459 459 return temp.d;
460 460 }
461 461  
462   -static inline void helper_fstt(CPU86_LDouble f, uint8_t *ptr)
  462 +static inline void helper_fstt(CPU86_LDouble f, target_ulong ptr)
463 463 {
464 464 CPU86_LDoubleU temp;
465 465 int e;
... ... @@ -557,6 +557,9 @@ void helper_fxsave(target_ulong ptr, int data64);
557 557 void helper_fxrstor(target_ulong ptr, int data64);
558 558 void restore_native_fp_state(CPUState *env);
559 559 void save_native_fp_state(CPUState *env);
  560 +float approx_rsqrt(float a);
  561 +float approx_rcp(float a);
  562 +int fpu_isnan(double a);
560 563  
561 564 extern const uint8_t parity_table[256];
562 565 extern const uint8_t rclw_table[32];
... ...
target-i386/helper.c
... ... @@ -2444,7 +2444,7 @@ void helper_fldt_ST0_A0(void)
2444 2444 {
2445 2445 int new_fpstt;
2446 2446 new_fpstt = (env->fpstt - 1) & 7;
2447   - env->fpregs[new_fpstt] = helper_fldt(A0);
  2447 + env->fpregs[new_fpstt].d = helper_fldt(A0);
2448 2448 env->fpstt = new_fpstt;
2449 2449 env->fptags[new_fpstt] = 0; /* validate stack entry */
2450 2450 }
... ... @@ -2804,9 +2804,10 @@ void helper_fstenv(target_ulong ptr, int data32)
2804 2804 if (env->fptags[i]) {
2805 2805 fptag |= 3;
2806 2806 } else {
2807   - tmp.d = env->fpregs[i];
  2807 + tmp.d = env->fpregs[i].d;
2808 2808 exp = EXPD(tmp);
2809 2809 mant = MANTD(tmp);
  2810 + printf("mant=%llx exp=%x\n", mant, exp);
2810 2811 if (exp == 0 && mant == 0) {
2811 2812 /* zero */
2812 2813 fptag |= 1;
... ... @@ -2930,7 +2931,7 @@ void helper_fxsave(target_ulong ptr, int data64)
2930 2931  
2931 2932 if (env->cr[4] & CR4_OSFXSR_MASK) {
2932 2933 /* XXX: finish it */
2933   - stl(ptr + 0x18, 0); /* mxcsr */
  2934 + stl(ptr + 0x18, env->mxcsr); /* mxcsr */
2934 2935 stl(ptr + 0x1c, 0); /* mxcsr_mask */
2935 2936 nb_xmm_regs = 8 << data64;
2936 2937 addr = ptr + 0xa0;
... ... @@ -2967,7 +2968,7 @@ void helper_fxrstor(target_ulong ptr, int data64)
2967 2968  
2968 2969 if (env->cr[4] & CR4_OSFXSR_MASK) {
2969 2970 /* XXX: finish it, endianness */
2970   - //ldl(ptr + 0x18);
  2971 + env->mxcsr = ldl(ptr + 0x18);
2971 2972 //ldl(ptr + 0x1c);
2972 2973 nb_xmm_regs = 8 << data64;
2973 2974 addr = ptr + 0xa0;
... ... @@ -3209,6 +3210,23 @@ void helper_idivq_EAX_T0(void)
3209 3210  
3210 3211 #endif
3211 3212  
/*
 * Report whether 'a' is a NaN.
 *
 * Returns 1 when 'a' is a NaN and 0 for every other value, including
 * infinities and zeroes.  A NaN is the only floating point value that
 * compares unequal to itself, so a self-comparison is sufficient and
 * needs nothing from <math.h>.
 */
int fpu_isnan(double a)
{
    return a != a;
}
  3218 +
  3219 +float approx_rsqrt(float a)
  3220 +{
  3221 + return 1.0 / sqrt(a);
  3222 +}
  3223 +
  3224 +float approx_rcp(float a)
  3225 +{
  3226 + return 1.0 / a;
  3227 +}
  3228 +
  3229 +
3212 3230 #if !defined(CONFIG_USER_ONLY)
3213 3231  
3214 3232 #define MMUSUFFIX _mmu
... ...
target-i386/helper2.c
... ... @@ -158,6 +158,8 @@ void cpu_reset(CPUX86State *env)
158 158 for(i = 0;i < 8; i++)
159 159 env->fptags[i] = 1;
160 160 env->fpuc = 0x37f;
  161 +
  162 + env->mxcsr = 0x1f80;
161 163 }
162 164  
163 165 void cpu_x86_close(CPUX86State *env)
... ... @@ -376,15 +378,15 @@ void cpu_dump_state(CPUState *env, FILE *f,
376 378 }
377 379 if (flags & X86_DUMP_FPU) {
378 380 cpu_fprintf(f, "ST0=%f ST1=%f ST2=%f ST3=%f\n",
379   - (double)env->fpregs[0],
380   - (double)env->fpregs[1],
381   - (double)env->fpregs[2],
382   - (double)env->fpregs[3]);
  381 + (double)env->fpregs[0].d,
  382 + (double)env->fpregs[1].d,
  383 + (double)env->fpregs[2].d,
  384 + (double)env->fpregs[3].d);
383 385 cpu_fprintf(f, "ST4=%f ST5=%f ST6=%f ST7=%f\n",
384   - (double)env->fpregs[4],
385   - (double)env->fpregs[5],
386   - (double)env->fpregs[7],
387   - (double)env->fpregs[8]);
  386 + (double)env->fpregs[4].d,
  387 + (double)env->fpregs[5].d,
  388 + (double)env->fpregs[7].d,
  389 + (double)env->fpregs[8].d);
388 390 }
389 391 }
390 392  
... ... @@ -471,6 +473,14 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4)
471 473 (env->cr[4] & (CR4_PGE_MASK | CR4_PAE_MASK | CR4_PSE_MASK))) {
472 474 tlb_flush(env, 1);
473 475 }
  476 + /* SSE handling */
  477 + if (!(env->cpuid_features & CPUID_SSE))
  478 + new_cr4 &= ~CR4_OSFXSR_MASK;
  479 + if (new_cr4 & CR4_OSFXSR_MASK)
  480 + env->hflags |= HF_OSFXSR_MASK;
  481 + else
  482 + env->hflags &= ~HF_OSFXSR_MASK;
  483 +
474 484 env->cr[4] = new_cr4;
475 485 }
476 486  
... ... @@ -800,7 +810,7 @@ void restore_native_fp_state(CPUState *env)
800 810 fp->fptag = fptag;
801 811 j = env->fpstt;
802 812 for(i = 0;i < 8; i++) {
803   - memcpy(&fp->fpregs1[i * 10], &env->fpregs[j], 10);
  813 + memcpy(&fp->fpregs1[i * 10], &env->fpregs[j].d, 10);
804 814 j = (j + 1) & 7;
805 815 }
806 816 asm volatile ("frstor %0" : "=m" (*fp));
... ... @@ -824,7 +834,7 @@ void save_native_fp_state(CPUState *env)
824 834 }
825 835 j = env->fpstt;
826 836 for(i = 0;i < 8; i++) {
827   - memcpy(&env->fpregs[j], &fp->fpregs1[i * 10], 10);
  837 + memcpy(&env->fpregs[j].d, &fp->fpregs1[i * 10], 10);
828 838 j = (j + 1) & 7;
829 839 }
830 840 /* we must restore the default rounding state */
... ...
target-i386/op.c
... ... @@ -752,11 +752,6 @@ void OPPROTO op_movswl_T0_T0(void)
752 752 T0 = (int16_t)T0;
753 753 }
754 754  
755   -void OPPROTO op_movslq_T0_T0(void)
756   -{
757   - T0 = (int32_t)T0;
758   -}
759   -
760 755 void OPPROTO op_movzwl_T0_T0(void)
761 756 {
762 757 T0 = (uint16_t)T0;
... ... @@ -768,6 +763,11 @@ void OPPROTO op_movswl_EAX_AX(void)
768 763 }
769 764  
770 765 #ifdef TARGET_X86_64
  766 +void OPPROTO op_movslq_T0_T0(void)
  767 +{
  768 + T0 = (int32_t)T0;
  769 +}
  770 +
771 771 void OPPROTO op_movslq_RAX_EAX(void)
772 772 {
773 773 EAX = (int32_t)EAX;
... ... @@ -1695,9 +1695,9 @@ void OPPROTO op_flds_ST0_A0(void)
1695 1695 new_fpstt = (env->fpstt - 1) & 7;
1696 1696 #ifdef USE_FP_CONVERT
1697 1697 FP_CONVERT.i32 = ldl(A0);
1698   - env->fpregs[new_fpstt] = FP_CONVERT.f;
  1698 + env->fpregs[new_fpstt].d = FP_CONVERT.f;
1699 1699 #else
1700   - env->fpregs[new_fpstt] = ldfl(A0);
  1700 + env->fpregs[new_fpstt].d = ldfl(A0);
1701 1701 #endif
1702 1702 env->fpstt = new_fpstt;
1703 1703 env->fptags[new_fpstt] = 0; /* validate stack entry */
... ... @@ -1709,9 +1709,9 @@ void OPPROTO op_fldl_ST0_A0(void)
1709 1709 new_fpstt = (env->fpstt - 1) & 7;
1710 1710 #ifdef USE_FP_CONVERT
1711 1711 FP_CONVERT.i64 = ldq(A0);
1712   - env->fpregs[new_fpstt] = FP_CONVERT.d;
  1712 + env->fpregs[new_fpstt].d = FP_CONVERT.d;
1713 1713 #else
1714   - env->fpregs[new_fpstt] = ldfq(A0);
  1714 + env->fpregs[new_fpstt].d = ldfq(A0);
1715 1715 #endif
1716 1716 env->fpstt = new_fpstt;
1717 1717 env->fptags[new_fpstt] = 0; /* validate stack entry */
... ... @@ -1729,7 +1729,7 @@ void helper_fild_ST0_A0(void)
1729 1729 {
1730 1730 int new_fpstt;
1731 1731 new_fpstt = (env->fpstt - 1) & 7;
1732   - env->fpregs[new_fpstt] = (CPU86_LDouble)ldsw(A0);
  1732 + env->fpregs[new_fpstt].d = (CPU86_LDouble)ldsw(A0);
1733 1733 env->fpstt = new_fpstt;
1734 1734 env->fptags[new_fpstt] = 0; /* validate stack entry */
1735 1735 }
... ... @@ -1738,7 +1738,7 @@ void helper_fildl_ST0_A0(void)
1738 1738 {
1739 1739 int new_fpstt;
1740 1740 new_fpstt = (env->fpstt - 1) & 7;
1741   - env->fpregs[new_fpstt] = (CPU86_LDouble)((int32_t)ldl(A0));
  1741 + env->fpregs[new_fpstt].d = (CPU86_LDouble)((int32_t)ldl(A0));
1742 1742 env->fpstt = new_fpstt;
1743 1743 env->fptags[new_fpstt] = 0; /* validate stack entry */
1744 1744 }
... ... @@ -1747,7 +1747,7 @@ void helper_fildll_ST0_A0(void)
1747 1747 {
1748 1748 int new_fpstt;
1749 1749 new_fpstt = (env->fpstt - 1) & 7;
1750   - env->fpregs[new_fpstt] = (CPU86_LDouble)((int64_t)ldq(A0));
  1750 + env->fpregs[new_fpstt].d = (CPU86_LDouble)((int64_t)ldq(A0));
1751 1751 env->fpstt = new_fpstt;
1752 1752 env->fptags[new_fpstt] = 0; /* validate stack entry */
1753 1753 }
... ... @@ -1775,9 +1775,9 @@ void OPPROTO op_fild_ST0_A0(void)
1775 1775 new_fpstt = (env->fpstt - 1) & 7;
1776 1776 #ifdef USE_FP_CONVERT
1777 1777 FP_CONVERT.i32 = ldsw(A0);
1778   - env->fpregs[new_fpstt] = (CPU86_LDouble)FP_CONVERT.i32;
  1778 + env->fpregs[new_fpstt].d = (CPU86_LDouble)FP_CONVERT.i32;
1779 1779 #else
1780   - env->fpregs[new_fpstt] = (CPU86_LDouble)ldsw(A0);
  1780 + env->fpregs[new_fpstt].d = (CPU86_LDouble)ldsw(A0);
1781 1781 #endif
1782 1782 env->fpstt = new_fpstt;
1783 1783 env->fptags[new_fpstt] = 0; /* validate stack entry */
... ... @@ -1789,9 +1789,9 @@ void OPPROTO op_fildl_ST0_A0(void)
1789 1789 new_fpstt = (env->fpstt - 1) & 7;
1790 1790 #ifdef USE_FP_CONVERT
1791 1791 FP_CONVERT.i32 = (int32_t) ldl(A0);
1792   - env->fpregs[new_fpstt] = (CPU86_LDouble)FP_CONVERT.i32;
  1792 + env->fpregs[new_fpstt].d = (CPU86_LDouble)FP_CONVERT.i32;
1793 1793 #else
1794   - env->fpregs[new_fpstt] = (CPU86_LDouble)((int32_t)ldl(A0));
  1794 + env->fpregs[new_fpstt].d = (CPU86_LDouble)((int32_t)ldl(A0));
1795 1795 #endif
1796 1796 env->fpstt = new_fpstt;
1797 1797 env->fptags[new_fpstt] = 0; /* validate stack entry */
... ... @@ -1803,9 +1803,9 @@ void OPPROTO op_fildll_ST0_A0(void)
1803 1803 new_fpstt = (env->fpstt - 1) & 7;
1804 1804 #ifdef USE_FP_CONVERT
1805 1805 FP_CONVERT.i64 = (int64_t) ldq(A0);
1806   - env->fpregs[new_fpstt] = (CPU86_LDouble)FP_CONVERT.i64;
  1806 + env->fpregs[new_fpstt].d = (CPU86_LDouble)FP_CONVERT.i64;
1807 1807 #else
1808   - env->fpregs[new_fpstt] = (CPU86_LDouble)((int64_t)ldq(A0));
  1808 + env->fpregs[new_fpstt].d = (CPU86_LDouble)((int64_t)ldq(A0));
1809 1809 #endif
1810 1810 env->fpstt = new_fpstt;
1811 1811 env->fptags[new_fpstt] = 0; /* validate stack entry */
... ... @@ -2322,6 +2322,29 @@ void OPPROTO op_movo(void)
2322 2322 memcpy16(d, s);
2323 2323 }
2324 2324  
  2325 +void OPPROTO op_movq(void)
  2326 +{
  2327 + uint64_t *d, *s;
  2328 + d = (uint64_t *)((char *)env + PARAM1);
  2329 + s = (uint64_t *)((char *)env + PARAM2);
  2330 + *d = *s;
  2331 +}
  2332 +
  2333 +void OPPROTO op_movl(void)
  2334 +{
  2335 + uint32_t *d, *s;
  2336 + d = (uint32_t *)((char *)env + PARAM1);
  2337 + s = (uint32_t *)((char *)env + PARAM2);
  2338 + *d = *s;
  2339 +}
  2340 +
  2341 +void OPPROTO op_movq_env_0(void)
  2342 +{
  2343 + uint64_t *d;
  2344 + d = (uint64_t *)((char *)env + PARAM1);
  2345 + *d = 0;
  2346 +}
  2347 +
2325 2348 void OPPROTO op_fxsave_A0(void)
2326 2349 {
2327 2350 helper_fxsave(A0, PARAM1);
... ... @@ -2331,3 +2354,24 @@ void OPPROTO op_fxrstor_A0(void)
2331 2354 {
2332 2355 helper_fxrstor(A0, PARAM1);
2333 2356 }
  2357 +
  2358 +/* XXX: optimize by storing fptt and fptags in the static cpu state */
  2359 +void OPPROTO op_enter_mmx(void)
  2360 +{
  2361 + env->fpstt = 0;
  2362 + *(uint32_t *)(env->fptags) = 0;
  2363 + *(uint32_t *)(env->fptags + 4) = 0;
  2364 +}
  2365 +
  2366 +void OPPROTO op_emms(void)
  2367 +{
  2368 + /* set to empty state */
  2369 + *(uint32_t *)(env->fptags) = 0x01010101;
  2370 + *(uint32_t *)(env->fptags + 4) = 0x01010101;
  2371 +}
  2372 +
  2373 +#define SHIFT 0
  2374 +#include "ops_sse.h"
  2375 +
  2376 +#define SHIFT 1
  2377 +#include "ops_sse.h"
... ...
target-i386/ops_mem.h
... ... @@ -80,7 +80,21 @@ void OPPROTO glue(glue(op_stl, MEMSUFFIX), _T1_A0)(void)
80 80 glue(stl, MEMSUFFIX)(A0, T1);
81 81 }
82 82  
83   -/* SSE support */
  83 +/* SSE/MMX support */
  84 +void OPPROTO glue(glue(op_ldq, MEMSUFFIX), _env_A0)(void)
  85 +{
  86 + uint64_t *p;
  87 + p = (uint64_t *)((char *)env + PARAM1);
  88 + *p = glue(ldq, MEMSUFFIX)(A0);
  89 +}
  90 +
  91 +void OPPROTO glue(glue(op_stq, MEMSUFFIX), _env_A0)(void)
  92 +{
  93 + uint64_t *p;
  94 + p = (uint64_t *)((char *)env + PARAM1);
  95 + glue(stq, MEMSUFFIX)(A0, *p);
  96 +}
  97 +
84 98 void OPPROTO glue(glue(op_ldo, MEMSUFFIX), _env_A0)(void)
85 99 {
86 100 XMMReg *p;
... ...
target-i386/ops_sse.h 0 → 100644
  1 +/*
  2 + * MMX/SSE/SSE2/PNI support
  3 + *
  4 + * Copyright (c) 2005 Fabrice Bellard
  5 + *
  6 + * This library is free software; you can redistribute it and/or
  7 + * modify it under the terms of the GNU Lesser General Public
  8 + * License as published by the Free Software Foundation; either
  9 + * version 2 of the License, or (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14 + * Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public
  17 + * License along with this library; if not, write to the Free Software
  18 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19 + */
  20 +#if SHIFT == 0
  21 +#define Reg MMXReg
  22 +#define XMM_ONLY(x...)
  23 +#define B(n) MMX_B(n)
  24 +#define W(n) MMX_W(n)
  25 +#define L(n) MMX_L(n)
  26 +#define Q(n) q
  27 +#define SUFFIX _mmx
  28 +#else
  29 +#define Reg XMMReg
  30 +#define XMM_ONLY(x...) x
  31 +#define B(n) XMM_B(n)
  32 +#define W(n) XMM_W(n)
  33 +#define L(n) XMM_L(n)
  34 +#define Q(n) XMM_Q(n)
  35 +#define SUFFIX _xmm
  36 +#endif
  37 +
  38 +void OPPROTO glue(op_psrlw, SUFFIX)(void)
  39 +{
  40 + Reg *d, *s;
  41 + int shift;
  42 +
  43 + d = (Reg *)((char *)env + PARAM1);
  44 + s = (Reg *)((char *)env + PARAM2);
  45 +
  46 + if (s->Q(0) > 15) {
  47 + d->Q(0) = 0;
  48 +#if SHIFT == 1
  49 + d->Q(1) = 0;
  50 +#endif
  51 + } else {
  52 + shift = s->B(0);
  53 + d->W(0) >>= shift;
  54 + d->W(1) >>= shift;
  55 + d->W(2) >>= shift;
  56 + d->W(3) >>= shift;
  57 +#if SHIFT == 1
  58 + d->W(4) >>= shift;
  59 + d->W(5) >>= shift;
  60 + d->W(6) >>= shift;
  61 + d->W(7) >>= shift;
  62 +#endif
  63 + }
  64 +}
  65 +
  66 +void OPPROTO glue(op_psraw, SUFFIX)(void)
  67 +{
  68 + Reg *d, *s;
  69 + int shift;
  70 +
  71 + d = (Reg *)((char *)env + PARAM1);
  72 + s = (Reg *)((char *)env + PARAM2);
  73 +
  74 + if (s->Q(0) > 15) {
  75 + shift = 15;
  76 + } else {
  77 + shift = s->B(0);
  78 + }
  79 + d->W(0) = (int16_t)d->W(0) >> shift;
  80 + d->W(1) = (int16_t)d->W(1) >> shift;
  81 + d->W(2) = (int16_t)d->W(2) >> shift;
  82 + d->W(3) = (int16_t)d->W(3) >> shift;
  83 +#if SHIFT == 1
  84 + d->W(4) = (int16_t)d->W(4) >> shift;
  85 + d->W(5) = (int16_t)d->W(5) >> shift;
  86 + d->W(6) = (int16_t)d->W(6) >> shift;
  87 + d->W(7) = (int16_t)d->W(7) >> shift;
  88 +#endif
  89 +}
  90 +
  91 +void OPPROTO glue(op_psllw, SUFFIX)(void)
  92 +{
  93 + Reg *d, *s;
  94 + int shift;
  95 +
  96 + d = (Reg *)((char *)env + PARAM1);
  97 + s = (Reg *)((char *)env + PARAM2);
  98 +
  99 + if (s->Q(0) > 15) {
  100 + d->Q(0) = 0;
  101 +#if SHIFT == 1
  102 + d->Q(1) = 0;
  103 +#endif
  104 + } else {
  105 + shift = s->B(0);
  106 + d->W(0) <<= shift;
  107 + d->W(1) <<= shift;
  108 + d->W(2) <<= shift;
  109 + d->W(3) <<= shift;
  110 +#if SHIFT == 1
  111 + d->W(4) <<= shift;
  112 + d->W(5) <<= shift;
  113 + d->W(6) <<= shift;
  114 + d->W(7) <<= shift;
  115 +#endif
  116 + }
  117 +}
  118 +
  119 +void OPPROTO glue(op_psrld, SUFFIX)(void)
  120 +{
  121 + Reg *d, *s;
  122 + int shift;
  123 +
  124 + d = (Reg *)((char *)env + PARAM1);
  125 + s = (Reg *)((char *)env + PARAM2);
  126 +
  127 + if (s->Q(0) > 31) {
  128 + d->Q(0) = 0;
  129 +#if SHIFT == 1
  130 + d->Q(1) = 0;
  131 +#endif
  132 + } else {
  133 + shift = s->B(0);
  134 + d->L(0) >>= shift;
  135 + d->L(1) >>= shift;
  136 +#if SHIFT == 1
  137 + d->L(2) >>= shift;
  138 + d->L(3) >>= shift;
  139 +#endif
  140 + }
  141 +}
  142 +
  143 +void OPPROTO glue(op_psrad, SUFFIX)(void)
  144 +{
  145 + Reg *d, *s;
  146 + int shift;
  147 +
  148 + d = (Reg *)((char *)env + PARAM1);
  149 + s = (Reg *)((char *)env + PARAM2);
  150 +
  151 + if (s->Q(0) > 31) {
  152 + shift = 31;
  153 + } else {
  154 + shift = s->B(0);
  155 + }
  156 + d->L(0) = (int32_t)d->L(0) >> shift;
  157 + d->L(1) = (int32_t)d->L(1) >> shift;
  158 +#if SHIFT == 1
  159 + d->L(2) = (int32_t)d->L(2) >> shift;
  160 + d->L(3) = (int32_t)d->L(3) >> shift;
  161 +#endif
  162 +}
  163 +
  164 +void OPPROTO glue(op_pslld, SUFFIX)(void)
  165 +{
  166 + Reg *d, *s;
  167 + int shift;
  168 +
  169 + d = (Reg *)((char *)env + PARAM1);
  170 + s = (Reg *)((char *)env + PARAM2);
  171 +
  172 + if (s->Q(0) > 31) {
  173 + d->Q(0) = 0;
  174 +#if SHIFT == 1
  175 + d->Q(1) = 0;
  176 +#endif
  177 + } else {
  178 + shift = s->B(0);
  179 + d->L(0) <<= shift;
  180 + d->L(1) <<= shift;
  181 +#if SHIFT == 1
  182 + d->L(2) <<= shift;
  183 + d->L(3) <<= shift;
  184 +#endif
  185 + }
  186 +}
  187 +
  188 +void OPPROTO glue(op_psrlq, SUFFIX)(void)
  189 +{
  190 + Reg *d, *s;
  191 + int shift;
  192 +
  193 + d = (Reg *)((char *)env + PARAM1);
  194 + s = (Reg *)((char *)env + PARAM2);
  195 +
  196 + if (s->Q(0) > 63) {
  197 + d->Q(0) = 0;
  198 +#if SHIFT == 1
  199 + d->Q(1) = 0;
  200 +#endif
  201 + } else {
  202 + shift = s->B(0);
  203 + d->Q(0) >>= shift;
  204 +#if SHIFT == 1
  205 + d->Q(1) >>= shift;
  206 +#endif
  207 + }
  208 +}
  209 +
  210 +void OPPROTO glue(op_psllq, SUFFIX)(void)
  211 +{
  212 + Reg *d, *s;
  213 + int shift;
  214 +
  215 + d = (Reg *)((char *)env + PARAM1);
  216 + s = (Reg *)((char *)env + PARAM2);
  217 +
  218 + if (s->Q(0) > 63) {
  219 + d->Q(0) = 0;
  220 +#if SHIFT == 1
  221 + d->Q(1) = 0;
  222 +#endif
  223 + } else {
  224 + shift = s->B(0);
  225 + d->Q(0) <<= shift;
  226 +#if SHIFT == 1
  227 + d->Q(1) <<= shift;
  228 +#endif
  229 + }
  230 +}
  231 +
  232 +#if SHIFT == 1
  233 +void OPPROTO glue(op_psrldq, SUFFIX)(void)
  234 +{
  235 + Reg *d, *s;
  236 + int shift, i;
  237 +
  238 + d = (Reg *)((char *)env + PARAM1);
  239 + s = (Reg *)((char *)env + PARAM2);
  240 + shift = s->L(0);
  241 + if (shift > 16)
  242 + shift = 16;
  243 + for(i = 0; i < 16 - shift; i++)
  244 + d->B(i) = d->B(i + shift);
  245 + for(i = 16 - shift; i < 16; i++)
  246 + d->B(i) = 0;
  247 + FORCE_RET();
  248 +}
  249 +
  250 +void OPPROTO glue(op_pslldq, SUFFIX)(void)
  251 +{
  252 + Reg *d, *s;
  253 + int shift, i;
  254 +
  255 + d = (Reg *)((char *)env + PARAM1);
  256 + s = (Reg *)((char *)env + PARAM2);
  257 + shift = s->L(0);
  258 + if (shift > 16)
  259 + shift = 16;
  260 + for(i = 15; i >= shift; i--)
  261 + d->B(i) = d->B(i - shift);
  262 + for(i = 0; i < shift; i++)
  263 + d->B(i) = 0;
  264 + FORCE_RET();
  265 +}
  266 +#endif
  267 +
  268 +#define SSE_OP_B(name, F)\
  269 +void OPPROTO glue(name, SUFFIX) (void)\
  270 +{\
  271 + Reg *d, *s;\
  272 + d = (Reg *)((char *)env + PARAM1);\
  273 + s = (Reg *)((char *)env + PARAM2);\
  274 + d->B(0) = F(d->B(0), s->B(0));\
  275 + d->B(1) = F(d->B(1), s->B(1));\
  276 + d->B(2) = F(d->B(2), s->B(2));\
  277 + d->B(3) = F(d->B(3), s->B(3));\
  278 + d->B(4) = F(d->B(4), s->B(4));\
  279 + d->B(5) = F(d->B(5), s->B(5));\
  280 + d->B(6) = F(d->B(6), s->B(6));\
  281 + d->B(7) = F(d->B(7), s->B(7));\
  282 + XMM_ONLY(\
  283 + d->B(8) = F(d->B(8), s->B(8));\
  284 + d->B(9) = F(d->B(9), s->B(9));\
  285 + d->B(10) = F(d->B(10), s->B(10));\
  286 + d->B(11) = F(d->B(11), s->B(11));\
  287 + d->B(12) = F(d->B(12), s->B(12));\
  288 + d->B(13) = F(d->B(13), s->B(13));\
  289 + d->B(14) = F(d->B(14), s->B(14));\
  290 + d->B(15) = F(d->B(15), s->B(15));\
  291 + )\
  292 +}
  293 +
  294 +#define SSE_OP_W(name, F)\
  295 +void OPPROTO glue(name, SUFFIX) (void)\
  296 +{\
  297 + Reg *d, *s;\
  298 + d = (Reg *)((char *)env + PARAM1);\
  299 + s = (Reg *)((char *)env + PARAM2);\
  300 + d->W(0) = F(d->W(0), s->W(0));\
  301 + d->W(1) = F(d->W(1), s->W(1));\
  302 + d->W(2) = F(d->W(2), s->W(2));\
  303 + d->W(3) = F(d->W(3), s->W(3));\
  304 + XMM_ONLY(\
  305 + d->W(4) = F(d->W(4), s->W(4));\
  306 + d->W(5) = F(d->W(5), s->W(5));\
  307 + d->W(6) = F(d->W(6), s->W(6));\
  308 + d->W(7) = F(d->W(7), s->W(7));\
  309 + )\
  310 +}
  311 +
  312 +#define SSE_OP_L(name, F)\
  313 +void OPPROTO glue(name, SUFFIX) (void)\
  314 +{\
  315 + Reg *d, *s;\
  316 + d = (Reg *)((char *)env + PARAM1);\
  317 + s = (Reg *)((char *)env + PARAM2);\
  318 + d->L(0) = F(d->L(0), s->L(0));\
  319 + d->L(1) = F(d->L(1), s->L(1));\
  320 + XMM_ONLY(\
  321 + d->L(2) = F(d->L(2), s->L(2));\
  322 + d->L(3) = F(d->L(3), s->L(3));\
  323 + )\
  324 +}
  325 +
  326 +#define SSE_OP_Q(name, F)\
  327 +void OPPROTO glue(name, SUFFIX) (void)\
  328 +{\
  329 + Reg *d, *s;\
  330 + d = (Reg *)((char *)env + PARAM1);\
  331 + s = (Reg *)((char *)env + PARAM2);\
  332 + d->Q(0) = F(d->Q(0), s->Q(0));\
  333 + XMM_ONLY(\
  334 + d->Q(1) = F(d->Q(1), s->Q(1));\
  335 + )\
  336 +}
  337 +
#if SHIFT == 0
/*
 * Saturation helpers and per-lane operator macros shared by the MMX and
 * XMM instantiations of this file.  They are defined only on the first
 * inclusion (SHIFT == 0) so that the second inclusion does not redefine
 * them.
 */

/* Clamp x to the unsigned 8-bit range [0, 255]. */
static inline int satub(int x)
{
    if (x < 0)
        return 0;
    else if (x > 255)
        return 255;
    else
        return x;
}

/* Clamp x to the unsigned 16-bit range [0, 65535]. */
static inline int satuw(int x)
{
    if (x < 0)
        return 0;
    else if (x > 65535)
        return 65535;
    else
        return x;
}

/* Clamp x to the signed 8-bit range [-128, 127]. */
static inline int satsb(int x)
{
    if (x < -128)
        return -128;
    else if (x > 127)
        return 127;
    else
        return x;
}

/* Clamp x to the signed 16-bit range [-32768, 32767]. */
static inline int satsw(int x)
{
    if (x < -32768)
        return -32768;
    else if (x > 32767)
        return 32767;
    else
        return x;
}

/*
 * Lane operators: each takes the destination lane 'a' and the source lane
 * 'b' and yields the new destination lane.  Every expansion is fully
 * parenthesized so it keeps its meaning inside a larger expression.
 */

/* wrapping and saturating additions */
#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

/* wrapping and saturating subtractions */
#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))

/* unsigned byte / signed word minimum and maximum */
#define FMINUB(a, b) (((a) < (b)) ? (a) : (b))
#define FMINSW(a, b) (((int16_t)(a) < (int16_t)(b)) ? (a) : (b))
#define FMAXUB(a, b) (((a) > (b)) ? (a) : (b))
#define FMAXSW(a, b) (((int16_t)(a) > (int16_t)(b)) ? (a) : (b))

/* bitwise operations; FANDN complements the destination operand */
#define FAND(a, b) ((a) & (b))
#define FANDN(a, b) ((~(a)) & (b))
#define FOR(a, b) ((a) | (b))
#define FXOR(a, b) ((a) ^ (b))

/* comparisons: -1 (all bits set once stored in the lane) if true, else 0 */
#define FCMPGTB(a, b) (((int8_t)(a) > (int8_t)(b)) ? -1 : 0)
#define FCMPGTW(a, b) (((int16_t)(a) > (int16_t)(b)) ? -1 : 0)
#define FCMPGTL(a, b) (((int32_t)(a) > (int32_t)(b)) ? -1 : 0)
#define FCMPEQ(a, b) (((a) == (b)) ? -1 : 0)

/*
 * 16-bit multiplies.  The unsigned forms multiply as uint32_t: two
 * uint16_t operands would otherwise be promoted to int, and
 * 0xFFFF * 0xFFFF does not fit in a signed 32-bit int.
 */
#define FMULLW(a, b) ((uint32_t)(a) * (uint32_t)(b))
#define FMULHUW(a, b) (((uint32_t)(a) * (uint32_t)(b)) >> 16)
#define FMULHW(a, b) (((int16_t)(a) * (int16_t)(b)) >> 16)

/* rounded-up average */
#define FAVG(a, b) (((a) + (b) + 1) >> 1)
#endif
  411 +
  412 +SSE_OP_B(op_paddb, FADD)
  413 +SSE_OP_W(op_paddw, FADD)
  414 +SSE_OP_L(op_paddl, FADD)
  415 +SSE_OP_Q(op_paddq, FADD)
  416 +
  417 +SSE_OP_B(op_psubb, FSUB)
  418 +SSE_OP_W(op_psubw, FSUB)
  419 +SSE_OP_L(op_psubl, FSUB)
  420 +SSE_OP_Q(op_psubq, FSUB)
  421 +
  422 +SSE_OP_B(op_paddusb, FADDUB)
  423 +SSE_OP_B(op_paddsb, FADDSB)
  424 +SSE_OP_B(op_psubusb, FSUBUB)
  425 +SSE_OP_B(op_psubsb, FSUBSB)
  426 +
  427 +SSE_OP_W(op_paddusw, FADDUW)
  428 +SSE_OP_W(op_paddsw, FADDSW)
  429 +SSE_OP_W(op_psubusw, FSUBUW)
  430 +SSE_OP_W(op_psubsw, FSUBSW)
  431 +
  432 +SSE_OP_B(op_pminub, FMINUB)
  433 +SSE_OP_B(op_pmaxub, FMAXUB)
  434 +
  435 +SSE_OP_W(op_pminsw, FMINSW)
  436 +SSE_OP_W(op_pmaxsw, FMAXSW)
  437 +
  438 +SSE_OP_Q(op_pand, FAND)
  439 +SSE_OP_Q(op_pandn, FANDN)
  440 +SSE_OP_Q(op_por, FOR)
  441 +SSE_OP_Q(op_pxor, FXOR)
  442 +
  443 +SSE_OP_B(op_pcmpgtb, FCMPGTB)
  444 +SSE_OP_W(op_pcmpgtw, FCMPGTW)
  445 +SSE_OP_L(op_pcmpgtl, FCMPGTL)
  446 +
  447 +SSE_OP_B(op_pcmpeqb, FCMPEQ)
  448 +SSE_OP_W(op_pcmpeqw, FCMPEQ)
  449 +SSE_OP_L(op_pcmpeql, FCMPEQ)
  450 +
  451 +SSE_OP_W(op_pmullw, FMULLW)
  452 +SSE_OP_W(op_pmulhuw, FMULHUW)
  453 +SSE_OP_W(op_pmulhw, FMULHW)
  454 +
  455 +SSE_OP_B(op_pavgb, FAVG)
  456 +SSE_OP_W(op_pavgw, FAVG)
  457 +
  458 +void OPPROTO glue(op_pmuludq, SUFFIX) (void)
  459 +{
  460 + Reg *d, *s;
  461 + d = (Reg *)((char *)env + PARAM1);
  462 + s = (Reg *)((char *)env + PARAM2);
  463 +
  464 + d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0);
  465 +#if SHIFT == 1
  466 + d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2);
  467 +#endif
  468 +}
  469 +
  470 +void OPPROTO glue(op_pmaddwd, SUFFIX) (void)
  471 +{
  472 + int i;
  473 + Reg *d, *s;
  474 + d = (Reg *)((char *)env + PARAM1);
  475 + s = (Reg *)((char *)env + PARAM2);
  476 +
  477 + for(i = 0; i < (2 << SHIFT); i++) {
  478 + d->L(i) = (int16_t)s->W(2*i) * (int16_t)d->W(2*i) +
  479 + (int16_t)s->W(2*i+1) * (int16_t)d->W(2*i+1);
  480 + }
  481 +}
  482 +
  483 +#if SHIFT == 0
  484 +static inline int abs1(int a)
  485 +{
  486 + if (a < 0)
  487 + return -a;
  488 + else
  489 + return a;
  490 +}
  491 +#endif
  492 +void OPPROTO glue(op_psadbw, SUFFIX) (void)
  493 +{
  494 + unsigned int val;
  495 + Reg *d, *s;
  496 + d = (Reg *)((char *)env + PARAM1);
  497 + s = (Reg *)((char *)env + PARAM2);
  498 +
  499 + val = 0;
  500 + val += abs1(d->B(0) - s->B(0));
  501 + val += abs1(d->B(1) - s->B(1));
  502 + val += abs1(d->B(2) - s->B(2));
  503 + val += abs1(d->B(3) - s->B(3));
  504 + val += abs1(d->B(4) - s->B(4));
  505 + val += abs1(d->B(5) - s->B(5));
  506 + val += abs1(d->B(6) - s->B(6));
  507 + val += abs1(d->B(7) - s->B(7));
  508 + d->Q(0) = val;
  509 +#if SHIFT == 1
  510 + val = 0;
  511 + val += abs1(d->B(8) - s->B(8));
  512 + val += abs1(d->B(9) - s->B(9));
  513 + val += abs1(d->B(10) - s->B(10));
  514 + val += abs1(d->B(11) - s->B(11));
  515 + val += abs1(d->B(12) - s->B(12));
  516 + val += abs1(d->B(13) - s->B(13));
  517 + val += abs1(d->B(14) - s->B(14));
  518 + val += abs1(d->B(15) - s->B(15));
  519 + d->Q(1) = val;
  520 +#endif
  521 +}
  522 +
  523 +void OPPROTO glue(op_maskmov, SUFFIX) (void)
  524 +{
  525 + int i;
  526 + Reg *d, *s;
  527 + d = (Reg *)((char *)env + PARAM1);
  528 + s = (Reg *)((char *)env + PARAM2);
  529 + for(i = 0; i < (8 << SHIFT); i++) {
  530 + if (s->B(i) & 0x80)
  531 + stb(A0, d->B(i));
  532 + }
  533 +}
  534 +
  535 +void OPPROTO glue(op_movl_mm_T0, SUFFIX) (void)
  536 +{
  537 + Reg *d;
  538 + d = (Reg *)((char *)env + PARAM1);
  539 + d->L(0) = T0;
  540 + d->L(1) = 0;
  541 +#if SHIFT == 1
  542 + d->Q(1) = 0;
  543 +#endif
  544 +}
  545 +
  546 +void OPPROTO glue(op_movl_T0_mm, SUFFIX) (void)
  547 +{
  548 + Reg *s;
  549 + s = (Reg *)((char *)env + PARAM1);
  550 + T0 = s->L(0);
  551 +}
  552 +
  553 +#if SHIFT == 0
  554 +void OPPROTO glue(op_pshufw, SUFFIX) (void)
  555 +{
  556 + Reg r, *d, *s;
  557 + int order;
  558 + d = (Reg *)((char *)env + PARAM1);
  559 + s = (Reg *)((char *)env + PARAM2);
  560 + order = PARAM3;
  561 + r.W(0) = s->W(order & 3);
  562 + r.W(1) = s->W((order >> 2) & 3);
  563 + r.W(2) = s->W((order >> 4) & 3);
  564 + r.W(3) = s->W((order >> 6) & 3);
  565 + *d = r;
  566 +}
  567 +#else
  568 +void OPPROTO op_shufpd(void)
  569 +{
  570 + Reg r, *d, *s;
  571 + int order;
  572 + d = (Reg *)((char *)env + PARAM1);
  573 + s = (Reg *)((char *)env + PARAM2);
  574 + order = PARAM3;
  575 + r.Q(0) = s->Q(order & 1);
  576 + r.Q(1) = s->Q((order >> 1) & 1);
  577 + *d = r;
  578 +}
  579 +
  580 +void OPPROTO glue(op_pshufd, SUFFIX) (void)
  581 +{
  582 + Reg r, *d, *s;
  583 + int order;
  584 + d = (Reg *)((char *)env + PARAM1);
  585 + s = (Reg *)((char *)env + PARAM2);
  586 + order = PARAM3;
  587 + r.L(0) = s->L(order & 3);
  588 + r.L(1) = s->L((order >> 2) & 3);
  589 + r.L(2) = s->L((order >> 4) & 3);
  590 + r.L(3) = s->L((order >> 6) & 3);
  591 + *d = r;
  592 +}
  593 +
  594 +void OPPROTO glue(op_pshuflw, SUFFIX) (void)
  595 +{
  596 + Reg r, *d, *s;
  597 + int order;
  598 + d = (Reg *)((char *)env + PARAM1);
  599 + s = (Reg *)((char *)env + PARAM2);
  600 + order = PARAM3;
  601 + r.W(0) = s->W(order & 3);
  602 + r.W(1) = s->W((order >> 2) & 3);
  603 + r.W(2) = s->W((order >> 4) & 3);
  604 + r.W(3) = s->W((order >> 6) & 3);
  605 + r.Q(1) = s->Q(1);
  606 + *d = r;
  607 +}
  608 +
  609 +void OPPROTO glue(op_pshufhw, SUFFIX) (void)
  610 +{
  611 + Reg r, *d, *s;
  612 + int order;
  613 + d = (Reg *)((char *)env + PARAM1);
  614 + s = (Reg *)((char *)env + PARAM2);
  615 + order = PARAM3;
  616 + r.Q(0) = s->Q(0);
  617 + r.W(4) = s->W(4 + (order & 3));
  618 + r.W(5) = s->W(4 + ((order >> 2) & 3));
  619 + r.W(6) = s->W(4 + ((order >> 4) & 3));
  620 + r.W(7) = s->W(4 + ((order >> 6) & 3));
  621 + *d = r;
  622 +}
  623 +#endif
  624 +
  625 +#if SHIFT == 1
  626 +/* FPU ops */
  627 +/* XXX: not accurate */
  628 +
/* Define op_<name>ps, op_<name>ss, op_<name>pd and op_<name>sd.
   F(a, b) is evaluated lane by lane with a taken from the destination
   (env + PARAM1) and b from the source (env + PARAM2), and the result is
   stored back into the destination lane. The packed forms process every
   lane (4 singles or 2 doubles), the scalar forms only lane 0. */
#define SSE_OP_S(name, F)\
void OPPROTO op_ ## name ## ps (void)\
{\
    Reg *dst = (Reg *)((char *)env + PARAM1);\
    Reg *src = (Reg *)((char *)env + PARAM2);\
    int i;\
    for (i = 0; i < 4; i++)\
        dst->XMM_S(i) = F(dst->XMM_S(i), src->XMM_S(i));\
}\
\
void OPPROTO op_ ## name ## ss (void)\
{\
    Reg *dst = (Reg *)((char *)env + PARAM1);\
    Reg *src = (Reg *)((char *)env + PARAM2);\
    dst->XMM_S(0) = F(dst->XMM_S(0), src->XMM_S(0));\
}\
void OPPROTO op_ ## name ## pd (void)\
{\
    Reg *dst = (Reg *)((char *)env + PARAM1);\
    Reg *src = (Reg *)((char *)env + PARAM2);\
    int i;\
    for (i = 0; i < 2; i++)\
        dst->XMM_D(i) = F(dst->XMM_D(i), src->XMM_D(i));\
}\
\
void OPPROTO op_ ## name ## sd (void)\
{\
    Reg *dst = (Reg *)((char *)env + PARAM1);\
    Reg *src = (Reg *)((char *)env + PARAM2);\
    dst->XMM_D(0) = F(dst->XMM_D(0), src->XMM_D(0));\
}
  664 +
/* Lane operations plugged into SSE_OP_S. Each expansion is wrapped in
   parentheses so it keeps its meaning when used inside a larger
   expression. FPU_SQRT ignores its first argument: the square root is
   taken of the source lane only. */
#define FPU_ADD(a, b) ((a) + (b))
#define FPU_SUB(a, b) ((a) - (b))
#define FPU_MUL(a, b) ((a) * (b))
#define FPU_DIV(a, b) ((a) / (b))
#define FPU_MIN(a, b) ((a) < (b) ? (a) : (b))
#define FPU_MAX(a, b) ((a) > (b) ? (a) : (b))
#define FPU_SQRT(a, b) (sqrt(b))
  672 +
  673 +SSE_OP_S(add, FPU_ADD)
  674 +SSE_OP_S(sub, FPU_SUB)
  675 +SSE_OP_S(mul, FPU_MUL)
  676 +SSE_OP_S(div, FPU_DIV)
  677 +SSE_OP_S(min, FPU_MIN)
  678 +SSE_OP_S(max, FPU_MAX)
  679 +SSE_OP_S(sqrt, FPU_SQRT)
  680 +
  681 +
  682 +/* float to float conversions */
  683 +void OPPROTO op_cvtps2pd(void)
  684 +{
  685 + float s0, s1;
  686 + Reg *d, *s;
  687 + d = (Reg *)((char *)env + PARAM1);
  688 + s = (Reg *)((char *)env + PARAM2);
  689 + s0 = s->XMM_S(0);
  690 + s1 = s->XMM_S(1);
  691 + d->XMM_D(0) = s0;
  692 + d->XMM_D(1) = s1;
  693 +}
  694 +
  695 +void OPPROTO op_cvtpd2ps(void)
  696 +{
  697 + Reg *d, *s;
  698 + d = (Reg *)((char *)env + PARAM1);
  699 + s = (Reg *)((char *)env + PARAM2);
  700 + d->XMM_S(0) = s->XMM_D(0);
  701 + d->XMM_S(1) = s->XMM_D(1);
  702 + d->Q(1) = 0;
  703 +}
  704 +
  705 +void OPPROTO op_cvtss2sd(void)
  706 +{
  707 + Reg *d, *s;
  708 + d = (Reg *)((char *)env + PARAM1);
  709 + s = (Reg *)((char *)env + PARAM2);
  710 + d->XMM_D(0) = s->XMM_S(0);
  711 +}
  712 +
  713 +void OPPROTO op_cvtsd2ss(void)
  714 +{
  715 + Reg *d, *s;
  716 + d = (Reg *)((char *)env + PARAM1);
  717 + s = (Reg *)((char *)env + PARAM2);
  718 + d->XMM_S(0) = s->XMM_D(0);
  719 +}
  720 +
  721 +/* integer to float */
  722 +void OPPROTO op_cvtdq2ps(void)
  723 +{
  724 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  725 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  726 + d->XMM_S(0) = (int32_t)s->XMM_L(0);
  727 + d->XMM_S(1) = (int32_t)s->XMM_L(1);
  728 + d->XMM_S(2) = (int32_t)s->XMM_L(2);
  729 + d->XMM_S(3) = (int32_t)s->XMM_L(3);
  730 +}
  731 +
  732 +void OPPROTO op_cvtdq2pd(void)
  733 +{
  734 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  735 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  736 + int32_t l0, l1;
  737 + l0 = (int32_t)s->XMM_L(0);
  738 + l1 = (int32_t)s->XMM_L(1);
  739 + d->XMM_D(0) = l0;
  740 + d->XMM_D(1) = l1;
  741 +}
  742 +
  743 +void OPPROTO op_cvtpi2ps(void)
  744 +{
  745 + XMMReg *d = (Reg *)((char *)env + PARAM1);
  746 + MMXReg *s = (MMXReg *)((char *)env + PARAM2);
  747 + d->XMM_S(0) = (int32_t)s->MMX_L(0);
  748 + d->XMM_S(1) = (int32_t)s->MMX_L(1);
  749 +}
  750 +
  751 +void OPPROTO op_cvtpi2pd(void)
  752 +{
  753 + XMMReg *d = (Reg *)((char *)env + PARAM1);
  754 + MMXReg *s = (MMXReg *)((char *)env + PARAM2);
  755 + d->XMM_D(0) = (int32_t)s->MMX_L(0);
  756 + d->XMM_D(1) = (int32_t)s->MMX_L(1);
  757 +}
  758 +
  759 +void OPPROTO op_cvtsi2ss(void)
  760 +{
  761 + XMMReg *d = (Reg *)((char *)env + PARAM1);
  762 + d->XMM_S(0) = (int32_t)T0;
  763 +}
  764 +
  765 +void OPPROTO op_cvtsi2sd(void)
  766 +{
  767 + XMMReg *d = (Reg *)((char *)env + PARAM1);
  768 + d->XMM_D(0) = (int32_t)T0;
  769 +}
  770 +
#ifdef TARGET_X86_64
/* Store T0, taken as a signed 64 bit integer, as a float in single lane 0
   of the register at env + PARAM1. */
void OPPROTO op_cvtsq2ss(void)
{
    XMMReg *dst;

    dst = (Reg *)((char *)env + PARAM1);
    dst->XMM_S(0) = (int64_t)T0;
}

/* Store T0, taken as a signed 64 bit integer, as a double in double lane 0
   of the register at env + PARAM1. */
void OPPROTO op_cvtsq2sd(void)
{
    XMMReg *dst;

    dst = (Reg *)((char *)env + PARAM1);
    dst->XMM_D(0) = (int64_t)T0;
}
#endif
  784 +
  785 +/* float to integer */
  786 +void OPPROTO op_cvtps2dq(void)
  787 +{
  788 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  789 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  790 + d->XMM_L(0) = lrint(s->XMM_S(0));
  791 + d->XMM_L(1) = lrint(s->XMM_S(1));
  792 + d->XMM_L(2) = lrint(s->XMM_S(2));
  793 + d->XMM_L(3) = lrint(s->XMM_S(3));
  794 +}
  795 +
  796 +void OPPROTO op_cvtpd2dq(void)
  797 +{
  798 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  799 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  800 + d->XMM_L(0) = lrint(s->XMM_D(0));
  801 + d->XMM_L(1) = lrint(s->XMM_D(1));
  802 + d->XMM_Q(1) = 0;
  803 +}
  804 +
  805 +void OPPROTO op_cvtps2pi(void)
  806 +{
  807 + MMXReg *d = (MMXReg *)((char *)env + PARAM1);
  808 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  809 + d->MMX_L(0) = lrint(s->XMM_S(0));
  810 + d->MMX_L(1) = lrint(s->XMM_S(1));
  811 +}
  812 +
  813 +void OPPROTO op_cvtpd2pi(void)
  814 +{
  815 + MMXReg *d = (MMXReg *)((char *)env + PARAM1);
  816 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  817 + d->MMX_L(0) = lrint(s->XMM_D(0));
  818 + d->MMX_L(1) = lrint(s->XMM_D(1));
  819 +}
  820 +
  821 +void OPPROTO op_cvtss2si(void)
  822 +{
  823 + XMMReg *s = (XMMReg *)((char *)env + PARAM1);
  824 + T0 = (int32_t)lrint(s->XMM_S(0));
  825 +}
  826 +
  827 +void OPPROTO op_cvtsd2si(void)
  828 +{
  829 + XMMReg *s = (XMMReg *)((char *)env + PARAM1);
  830 + T0 = (int32_t)lrint(s->XMM_D(0));
  831 +}
  832 +
#ifdef TARGET_X86_64
/* Round single lane 0 of the register at env + PARAM1 with llrint() and
   place the result in T0. */
void OPPROTO op_cvtss2sq(void)
{
    XMMReg *src;

    src = (XMMReg *)((char *)env + PARAM1);
    T0 = llrint(src->XMM_S(0));
}

/* Round double lane 0 of the register at env + PARAM1 with llrint() and
   place the result in T0. */
void OPPROTO op_cvtsd2sq(void)
{
    XMMReg *src;

    src = (XMMReg *)((char *)env + PARAM1);
    T0 = llrint(src->XMM_D(0));
}
#endif
  846 +
  847 +/* float to integer truncated */
  848 +void OPPROTO op_cvttps2dq(void)
  849 +{
  850 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  851 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  852 + d->XMM_L(0) = (int32_t)s->XMM_S(0);
  853 + d->XMM_L(1) = (int32_t)s->XMM_S(1);
  854 + d->XMM_L(2) = (int32_t)s->XMM_S(2);
  855 + d->XMM_L(3) = (int32_t)s->XMM_S(3);
  856 +}
  857 +
  858 +void OPPROTO op_cvttpd2dq(void)
  859 +{
  860 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  861 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  862 + d->XMM_L(0) = (int32_t)s->XMM_D(0);
  863 + d->XMM_L(1) = (int32_t)s->XMM_D(1);
  864 + d->XMM_Q(1) = 0;
  865 +}
  866 +
  867 +void OPPROTO op_cvttps2pi(void)
  868 +{
  869 + MMXReg *d = (MMXReg *)((char *)env + PARAM1);
  870 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  871 + d->MMX_L(0) = (int32_t)(s->XMM_S(0));
  872 + d->MMX_L(1) = (int32_t)(s->XMM_S(1));
  873 +}
  874 +
  875 +void OPPROTO op_cvttpd2pi(void)
  876 +{
  877 + MMXReg *d = (MMXReg *)((char *)env + PARAM1);
  878 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  879 + d->MMX_L(0) = (int32_t)(s->XMM_D(0));
  880 + d->MMX_L(1) = (int32_t)(s->XMM_D(1));
  881 +}
  882 +
  883 +void OPPROTO op_cvttss2si(void)
  884 +{
  885 + XMMReg *s = (XMMReg *)((char *)env + PARAM1);
  886 + T0 = (int32_t)(s->XMM_S(0));
  887 +}
  888 +
  889 +void OPPROTO op_cvttsd2si(void)
  890 +{
  891 + XMMReg *s = (XMMReg *)((char *)env + PARAM1);
  892 + T0 = (int32_t)(s->XMM_D(0));
  893 +}
  894 +
#ifdef TARGET_X86_64
/* Convert single lane 0 of the register at env + PARAM1 to int64_t with
   a C cast (truncation) and place the result in T0. */
void OPPROTO op_cvttss2sq(void)
{
    XMMReg *src;

    src = (XMMReg *)((char *)env + PARAM1);
    T0 = (int64_t)(src->XMM_S(0));
}

/* Convert double lane 0 of the register at env + PARAM1 to int64_t with
   a C cast (truncation) and place the result in T0. */
void OPPROTO op_cvttsd2sq(void)
{
    XMMReg *src;

    src = (XMMReg *)((char *)env + PARAM1);
    T0 = (int64_t)(src->XMM_D(0));
}
#endif
  908 +
  909 +void OPPROTO op_rsqrtps(void)
  910 +{
  911 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  912 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  913 + d->XMM_S(0) = approx_rsqrt(s->XMM_S(0));
  914 + d->XMM_S(1) = approx_rsqrt(s->XMM_S(1));
  915 + d->XMM_S(2) = approx_rsqrt(s->XMM_S(2));
  916 + d->XMM_S(3) = approx_rsqrt(s->XMM_S(3));
  917 +}
  918 +
  919 +void OPPROTO op_rsqrtss(void)
  920 +{
  921 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  922 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  923 + d->XMM_S(0) = approx_rsqrt(s->XMM_S(0));
  924 +}
  925 +
  926 +void OPPROTO op_rcpps(void)
  927 +{
  928 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  929 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  930 + d->XMM_S(0) = approx_rcp(s->XMM_S(0));
  931 + d->XMM_S(1) = approx_rcp(s->XMM_S(1));
  932 + d->XMM_S(2) = approx_rcp(s->XMM_S(2));
  933 + d->XMM_S(3) = approx_rcp(s->XMM_S(3));
  934 +}
  935 +
  936 +void OPPROTO op_rcpss(void)
  937 +{
  938 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  939 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  940 + d->XMM_S(0) = approx_rcp(s->XMM_S(0));
  941 +}
  942 +
  943 +void OPPROTO op_haddps(void)
  944 +{
  945 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  946 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  947 + XMMReg r;
  948 + r.XMM_S(0) = d->XMM_S(0) + d->XMM_S(1);
  949 + r.XMM_S(1) = d->XMM_S(2) + d->XMM_S(3);
  950 + r.XMM_S(2) = s->XMM_S(0) + s->XMM_S(1);
  951 + r.XMM_S(3) = s->XMM_S(2) + s->XMM_S(3);
  952 + *d = r;
  953 +}
  954 +
  955 +void OPPROTO op_haddpd(void)
  956 +{
  957 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  958 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  959 + XMMReg r;
  960 + r.XMM_D(0) = d->XMM_D(0) + d->XMM_D(1);
  961 + r.XMM_D(1) = s->XMM_D(0) + s->XMM_D(1);
  962 + *d = r;
  963 +}
  964 +
  965 +void OPPROTO op_hsubps(void)
  966 +{
  967 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  968 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  969 + XMMReg r;
  970 + r.XMM_S(0) = d->XMM_S(0) - d->XMM_S(1);
  971 + r.XMM_S(1) = d->XMM_S(2) - d->XMM_S(3);
  972 + r.XMM_S(2) = s->XMM_S(0) - s->XMM_S(1);
  973 + r.XMM_S(3) = s->XMM_S(2) - s->XMM_S(3);
  974 + *d = r;
  975 +}
  976 +
  977 +void OPPROTO op_hsubpd(void)
  978 +{
  979 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  980 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  981 + XMMReg r;
  982 + r.XMM_D(0) = d->XMM_D(0) - d->XMM_D(1);
  983 + r.XMM_D(1) = s->XMM_D(0) - s->XMM_D(1);
  984 + *d = r;
  985 +}
  986 +
  987 +void OPPROTO op_addsubps(void)
  988 +{
  989 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  990 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  991 + d->XMM_S(0) = d->XMM_S(0) - s->XMM_S(0);
  992 + d->XMM_S(1) = d->XMM_S(1) + s->XMM_S(1);
  993 + d->XMM_S(2) = d->XMM_S(2) - s->XMM_S(2);
  994 + d->XMM_S(3) = d->XMM_S(3) + s->XMM_S(3);
  995 +}
  996 +
  997 +void OPPROTO op_addsubpd(void)
  998 +{
  999 + XMMReg *d = (XMMReg *)((char *)env + PARAM1);
  1000 + XMMReg *s = (XMMReg *)((char *)env + PARAM2);
  1001 + d->XMM_D(0) = d->XMM_D(0) - s->XMM_D(0);
  1002 + d->XMM_D(1) = d->XMM_D(1) + s->XMM_D(1);
  1003 +}
  1004 +
/* XXX: unordered */
/* Define op_<name>ps, op_<name>ss, op_<name>pd and op_<name>sd.
   F(a, b) is evaluated lane by lane with a taken from the destination
   (env + PARAM1) and b from the source (env + PARAM2); its integer result
   is stored into the integer view of the same destination lane (XMM_L for
   singles, XMM_Q for doubles). The packed forms process every lane, the
   scalar forms only lane 0. */
#define SSE_OP_CMP(name, F)\
void OPPROTO op_ ## name ## ps (void)\
{\
    Reg *dst = (Reg *)((char *)env + PARAM1);\
    Reg *src = (Reg *)((char *)env + PARAM2);\
    int i;\
    for (i = 0; i < 4; i++)\
        dst->XMM_L(i) = F(dst->XMM_S(i), src->XMM_S(i));\
}\
\
void OPPROTO op_ ## name ## ss (void)\
{\
    Reg *dst = (Reg *)((char *)env + PARAM1);\
    Reg *src = (Reg *)((char *)env + PARAM2);\
    dst->XMM_L(0) = F(dst->XMM_S(0), src->XMM_S(0));\
}\
void OPPROTO op_ ## name ## pd (void)\
{\
    Reg *dst = (Reg *)((char *)env + PARAM1);\
    Reg *src = (Reg *)((char *)env + PARAM2);\
    int i;\
    for (i = 0; i < 2; i++)\
        dst->XMM_Q(i) = F(dst->XMM_D(i), src->XMM_D(i));\
}\
\
void OPPROTO op_ ## name ## sd (void)\
{\
    Reg *dst = (Reg *)((char *)env + PARAM1);\
    Reg *src = (Reg *)((char *)env + PARAM2);\
    dst->XMM_Q(0) = F(dst->XMM_D(0), src->XMM_D(0));\
}
  1041 +
/* Comparison predicates plugged into SSE_OP_CMP: each yields -1 (all bits
   set once stored in a lane) when the predicate holds and 0 otherwise.
   Every expansion is wrapped in parentheses so it keeps its meaning when
   used inside a larger expression. */
#define FPU_CMPEQ(a, b) ((a) == (b) ? -1 : 0)
#define FPU_CMPLT(a, b) ((a) < (b) ? -1 : 0)
#define FPU_CMPLE(a, b) ((a) <= (b) ? -1 : 0)
#define FPU_CMPUNORD(a, b) ((fpu_isnan(a) || fpu_isnan(b)) ? -1 : 0)
#define FPU_CMPNEQ(a, b) ((a) == (b) ? 0 : -1)
#define FPU_CMPNLT(a, b) ((a) < (b) ? 0 : -1)
#define FPU_CMPNLE(a, b) ((a) <= (b) ? 0 : -1)
#define FPU_CMPORD(a, b) ((!fpu_isnan(a) && !fpu_isnan(b)) ? -1 : 0)
  1050 +
  1051 +SSE_OP_CMP(cmpeq, FPU_CMPEQ)
  1052 +SSE_OP_CMP(cmplt, FPU_CMPLT)
  1053 +SSE_OP_CMP(cmple, FPU_CMPLE)
  1054 +SSE_OP_CMP(cmpunord, FPU_CMPUNORD)
  1055 +SSE_OP_CMP(cmpneq, FPU_CMPNEQ)
  1056 +SSE_OP_CMP(cmpnlt, FPU_CMPNLT)
  1057 +SSE_OP_CMP(cmpnle, FPU_CMPNLE)
  1058 +SSE_OP_CMP(cmpord, FPU_CMPORD)
  1059 +
  1060 +void OPPROTO op_ucomiss(void)
  1061 +{
  1062 + int eflags;
  1063 + float s0, s1;
  1064 + Reg *d, *s;
  1065 + d = (Reg *)((char *)env + PARAM1);
  1066 + s = (Reg *)((char *)env + PARAM2);
  1067 +
  1068 + s0 = d->XMM_S(0);
  1069 + s1 = s->XMM_S(0);
  1070 + if (s0 < s1)
  1071 + eflags = CC_C;
  1072 + else if (s0 == s1)
  1073 + eflags = CC_Z;
  1074 + else
  1075 + eflags = 0;
  1076 + CC_SRC = eflags;
  1077 + FORCE_RET();
  1078 +}
  1079 +
  1080 +void OPPROTO op_comiss(void)
  1081 +{
  1082 + int eflags;
  1083 + float s0, s1;
  1084 + Reg *d, *s;
  1085 + d = (Reg *)((char *)env + PARAM1);
  1086 + s = (Reg *)((char *)env + PARAM2);
  1087 +
  1088 + s0 = d->XMM_S(0);
  1089 + s1 = s->XMM_S(0);
  1090 + if (s0 < s1)
  1091 + eflags = CC_C;
  1092 + else if (s0 == s1)
  1093 + eflags = CC_Z;
  1094 + else
  1095 + eflags = 0;
  1096 + CC_SRC = eflags;
  1097 + FORCE_RET();
  1098 +}
  1099 +
  1100 +void OPPROTO op_ucomisd(void)
  1101 +{
  1102 + int eflags;
  1103 + double d0, d1;
  1104 + Reg *d, *s;
  1105 + d = (Reg *)((char *)env + PARAM1);
  1106 + s = (Reg *)((char *)env + PARAM2);
  1107 +
  1108 + d0 = d->XMM_D(0);
  1109 + d1 = s->XMM_D(0);
  1110 + if (d0 < d1)
  1111 + eflags = CC_C;
  1112 + else if (d0 == d1)
  1113 + eflags = CC_Z;
  1114 + else
  1115 + eflags = 0;
  1116 + CC_SRC = eflags;
  1117 + FORCE_RET();
  1118 +}
  1119 +
  1120 +void OPPROTO op_comisd(void)
  1121 +{
  1122 + int eflags;
  1123 + double d0, d1;
  1124 + Reg *d, *s;
  1125 + d = (Reg *)((char *)env + PARAM1);
  1126 + s = (Reg *)((char *)env + PARAM2);
  1127 +
  1128 + d0 = d->XMM_D(0);
  1129 + d1 = s->XMM_D(0);
  1130 + if (d0 < d1)
  1131 + eflags = CC_C;
  1132 + else if (d0 == d1)
  1133 + eflags = CC_Z;
  1134 + else
  1135 + eflags = 0;
  1136 + CC_SRC = eflags;
  1137 + FORCE_RET();
  1138 +}
  1139 +
  1140 +void OPPROTO op_movmskps(void)
  1141 +{
  1142 + int b0, b1, b2, b3;
  1143 + Reg *s;
  1144 + s = (Reg *)((char *)env + PARAM1);
  1145 + b0 = s->XMM_L(0) >> 31;
  1146 + b1 = s->XMM_L(1) >> 31;
  1147 + b2 = s->XMM_L(2) >> 31;
  1148 + b3 = s->XMM_L(3) >> 31;
  1149 + T0 = b0 | (b1 << 1) | (b2 << 2) | (b3 << 3);
  1150 +}
  1151 +
  1152 +void OPPROTO op_movmskpd(void)
  1153 +{
  1154 + int b0, b1;
  1155 + Reg *s;
  1156 + s = (Reg *)((char *)env + PARAM1);
  1157 + b0 = s->XMM_L(1) >> 31;
  1158 + b1 = s->XMM_L(3) >> 31;
  1159 + T0 = b0 | (b1 << 1);
  1160 +}
  1161 +
  1162 +#endif
  1163 +
  1164 +void OPPROTO glue(op_pmovmskb, SUFFIX)(void)
  1165 +{
  1166 + Reg *s;
  1167 + s = (Reg *)((char *)env + PARAM1);
  1168 + T0 = 0;
  1169 + T0 |= (s->XMM_B(0) >> 7);
  1170 + T0 |= (s->XMM_B(1) >> 6) & 0x02;
  1171 + T0 |= (s->XMM_B(2) >> 5) & 0x04;
  1172 + T0 |= (s->XMM_B(3) >> 4) & 0x08;
  1173 + T0 |= (s->XMM_B(4) >> 3) & 0x10;
  1174 + T0 |= (s->XMM_B(5) >> 2) & 0x20;
  1175 + T0 |= (s->XMM_B(6) >> 1) & 0x40;
  1176 + T0 |= (s->XMM_B(7)) & 0x80;
  1177 +#if SHIFT == 1
  1178 + T0 |= (s->XMM_B(8) << 1) & 0x0100;
  1179 + T0 |= (s->XMM_B(9) << 2) & 0x0200;
  1180 + T0 |= (s->XMM_B(10) << 3) & 0x0400;
  1181 + T0 |= (s->XMM_B(11) << 4) & 0x0800;
  1182 + T0 |= (s->XMM_B(12) << 5) & 0x1000;
  1183 + T0 |= (s->XMM_B(13) << 6) & 0x2000;
  1184 + T0 |= (s->XMM_B(14) << 7) & 0x4000;
  1185 + T0 |= (s->XMM_B(15) << 8) & 0x8000;
  1186 +#endif
  1187 +}
  1188 +
  1189 +void OPPROTO glue(op_pinsrw, SUFFIX) (void)
  1190 +{
  1191 + Reg *d = (Reg *)((char *)env + PARAM1);
  1192 + int pos = PARAM2;
  1193 +
  1194 + d->W(pos) = T0;
  1195 +}
  1196 +
  1197 +void OPPROTO glue(op_pextrw, SUFFIX) (void)
  1198 +{
  1199 + Reg *s = (Reg *)((char *)env + PARAM1);
  1200 + int pos = PARAM2;
  1201 +
  1202 + T0 = s->W(pos);
  1203 +}
  1204 +
  1205 +void OPPROTO glue(op_packsswb, SUFFIX) (void)
  1206 +{
  1207 + Reg r, *d, *s;
  1208 + d = (Reg *)((char *)env + PARAM1);
  1209 + s = (Reg *)((char *)env + PARAM2);
  1210 +
  1211 + r.B(0) = satsb((int16_t)d->W(0));
  1212 + r.B(1) = satsb((int16_t)d->W(1));
  1213 + r.B(2) = satsb((int16_t)d->W(2));
  1214 + r.B(3) = satsb((int16_t)d->W(3));
  1215 +#if SHIFT == 1
  1216 + r.B(4) = satsb((int16_t)d->W(4));
  1217 + r.B(5) = satsb((int16_t)d->W(5));
  1218 + r.B(6) = satsb((int16_t)d->W(6));
  1219 + r.B(7) = satsb((int16_t)d->W(7));
  1220 +#endif
  1221 + r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
  1222 + r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
  1223 + r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
  1224 + r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
  1225 +#if SHIFT == 1
  1226 + r.B(12) = satsb((int16_t)s->W(4));
  1227 + r.B(13) = satsb((int16_t)s->W(5));
  1228 + r.B(14) = satsb((int16_t)s->W(6));
  1229 + r.B(15) = satsb((int16_t)s->W(7));
  1230 +#endif
  1231 + *d = r;
  1232 +}
  1233 +
  1234 +void OPPROTO glue(op_packuswb, SUFFIX) (void)
  1235 +{
  1236 + Reg r, *d, *s;
  1237 + d = (Reg *)((char *)env + PARAM1);
  1238 + s = (Reg *)((char *)env + PARAM2);
  1239 +
  1240 + r.B(0) = satub((int16_t)d->W(0));
  1241 + r.B(1) = satub((int16_t)d->W(1));
  1242 + r.B(2) = satub((int16_t)d->W(2));
  1243 + r.B(3) = satub((int16_t)d->W(3));
  1244 +#if SHIFT == 1
  1245 + r.B(4) = satub((int16_t)d->W(4));
  1246 + r.B(5) = satub((int16_t)d->W(5));
  1247 + r.B(6) = satub((int16_t)d->W(6));
  1248 + r.B(7) = satub((int16_t)d->W(7));
  1249 +#endif
  1250 + r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
  1251 + r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
  1252 + r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
  1253 + r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
  1254 +#if SHIFT == 1
  1255 + r.B(12) = satub((int16_t)s->W(4));
  1256 + r.B(13) = satub((int16_t)s->W(5));
  1257 + r.B(14) = satub((int16_t)s->W(6));
  1258 + r.B(15) = satub((int16_t)s->W(7));
  1259 +#endif
  1260 + *d = r;
  1261 +}
  1262 +
  1263 +void OPPROTO glue(op_packssdw, SUFFIX) (void)
  1264 +{
  1265 + Reg r, *d, *s;
  1266 + d = (Reg *)((char *)env + PARAM1);
  1267 + s = (Reg *)((char *)env + PARAM2);
  1268 +
  1269 + r.W(0) = satsw(d->L(0));
  1270 + r.W(1) = satsw(d->L(1));
  1271 +#if SHIFT == 1
  1272 + r.W(2) = satsw(d->L(2));
  1273 + r.W(3) = satsw(d->L(3));
  1274 +#endif
  1275 + r.W((2 << SHIFT) + 0) = satsw(s->L(0));
  1276 + r.W((2 << SHIFT) + 1) = satsw(s->L(1));
  1277 +#if SHIFT == 1
  1278 + r.W(6) = satsw(s->L(2));
  1279 + r.W(7) = satsw(s->L(3));
  1280 +#endif
  1281 + *d = r;
  1282 +}
  1283 +
/* Define the op_punpck<base_name>{bw,wd,dq} ops, plus the qdq form for
   XMM registers. Each op interleaves elements of the destination
   (env + PARAM1) with elements of the source (env + PARAM2), destination
   element first. base selects the half of the operands that is consumed:
   0 for the low half, 1 for the high half. Results are built in a
   temporary and then copied to the destination. */
#define UNPCK_OP(base_name, base) \
 \
void OPPROTO glue(op_punpck ## base_name ## bw, SUFFIX) (void) \
{ \
    Reg *dst = (Reg *)((char *)env + PARAM1); \
    Reg *src = (Reg *)((char *)env + PARAM2); \
    Reg res; \
    int i; \
 \
    for (i = 0; i < (4 << SHIFT); i++) { \
        res.B(2 * i) = dst->B((base << (SHIFT + 2)) + i); \
        res.B(2 * i + 1) = src->B((base << (SHIFT + 2)) + i); \
    } \
    *dst = res; \
} \
 \
void OPPROTO glue(op_punpck ## base_name ## wd, SUFFIX) (void) \
{ \
    Reg *dst = (Reg *)((char *)env + PARAM1); \
    Reg *src = (Reg *)((char *)env + PARAM2); \
    Reg res; \
    int i; \
 \
    for (i = 0; i < (2 << SHIFT); i++) { \
        res.W(2 * i) = dst->W((base << (SHIFT + 1)) + i); \
        res.W(2 * i + 1) = src->W((base << (SHIFT + 1)) + i); \
    } \
    *dst = res; \
} \
 \
void OPPROTO glue(op_punpck ## base_name ## dq, SUFFIX) (void) \
{ \
    Reg *dst = (Reg *)((char *)env + PARAM1); \
    Reg *src = (Reg *)((char *)env + PARAM2); \
    Reg res; \
    int i; \
 \
    for (i = 0; i < (1 << SHIFT); i++) { \
        res.L(2 * i) = dst->L((base << SHIFT) + i); \
        res.L(2 * i + 1) = src->L((base << SHIFT) + i); \
    } \
    *dst = res; \
} \
 \
XMM_ONLY( \
void OPPROTO glue(op_punpck ## base_name ## qdq, SUFFIX) (void) \
{ \
    Reg *dst = (Reg *)((char *)env + PARAM1); \
    Reg *src = (Reg *)((char *)env + PARAM2); \
    Reg res; \
 \
    res.Q(0) = dst->Q(base); \
    res.Q(1) = src->Q(base); \
    *dst = res; \
} \
)
  1359 +
  1360 +UNPCK_OP(l, 0)
  1361 +UNPCK_OP(h, 1)
  1362 +
  1363 +#undef SHIFT
  1364 +#undef XMM_ONLY
  1365 +#undef Reg
  1366 +#undef B
  1367 +#undef W
  1368 +#undef L
  1369 +#undef Q
  1370 +#undef SUFFIX
... ...
target-i386/translate.c
... ... @@ -1606,6 +1606,23 @@ static void gen_lea_modrm(DisasContext *s, int modrm, int *reg_ptr, int *offset_
1606 1606 *offset_ptr = disp;
1607 1607 }
1608 1608  
  1609 +/* used for LEA and MOV AX, mem */
  1610 +static void gen_add_A0_ds_seg(DisasContext *s)
  1611 +{
  1612 + int override, must_add_seg;
  1613 + must_add_seg = s->addseg;
  1614 + override = R_DS;
  1615 + if (s->override >= 0) {
  1616 + override = s->override;
  1617 + must_add_seg = 1;
  1618 + } else {
  1619 + override = R_DS;
  1620 + }
  1621 + if (must_add_seg) {
  1622 + gen_op_addl_A0_seg(offsetof(CPUX86State,segs[override].base));
  1623 + }
  1624 +}
  1625 +
1609 1626 /* generate modrm memory load or store of 'reg'. TMP0 is used if reg !=
1610 1627 OR_TMP0 */
1611 1628 static void gen_ldst_modrm(DisasContext *s, int modrm, int ot, int reg, int is_store)
... ... @@ -2193,6 +2210,22 @@ static void gen_movtl_T0_im(target_ulong val)
2193 2210 #endif
2194 2211 }
2195 2212  
  2213 +static GenOpFunc1 *gen_ldq_env_A0[3] = {
  2214 + gen_op_ldq_raw_env_A0,
  2215 +#ifndef CONFIG_USER_ONLY
  2216 + gen_op_ldq_kernel_env_A0,
  2217 + gen_op_ldq_user_env_A0,
  2218 +#endif
  2219 +};
  2220 +
  2221 +static GenOpFunc1 *gen_stq_env_A0[3] = {
  2222 + gen_op_stq_raw_env_A0,
  2223 +#ifndef CONFIG_USER_ONLY
  2224 + gen_op_stq_kernel_env_A0,
  2225 + gen_op_stq_user_env_A0,
  2226 +#endif
  2227 +};
  2228 +
2196 2229 static GenOpFunc1 *gen_ldo_env_A0[3] = {
2197 2230 gen_op_ldo_raw_env_A0,
2198 2231 #ifndef CONFIG_USER_ONLY
... ... @@ -2209,6 +2242,693 @@ static GenOpFunc1 *gen_sto_env_A0[3] = {
2209 2242 #endif
2210 2243 };
2211 2244  
/* Non-NULL marker stored in the op tables in place of a real generator */
#define SSE_SPECIAL ((GenOpFunc2 *)1)

/* { MMX variant, XMM variant } of the integer op x */
#define MMX_OP2(x) { gen_op_ ## x ## _mmx, gen_op_ ## x ## _xmm }
/* { ps, pd, ss, sd } variants of the floating point op x */
#define SSE_FOP(x) { gen_op_ ## x ## ps, gen_op_ ## x ## pd, \
                     gen_op_ ## x ## ss, gen_op_ ## x ## sd }
  2250 +
  2251 +static GenOpFunc2 *sse_op_table1[256][4] = {
  2252 + /* pure SSE operations */
  2253 + [0x10] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movups, movupd, movss, movsd */
  2254 + [0x11] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movups, movupd, movss, movsd */
  2255 + [0x12] = { SSE_SPECIAL, SSE_SPECIAL }, /* movlps, movlpd */
  2256 + [0x13] = { SSE_SPECIAL, SSE_SPECIAL }, /* movlps, movlpd */
  2257 + [0x14] = { gen_op_punpckldq_xmm, gen_op_punpcklqdq_xmm },
  2258 + [0x15] = { gen_op_punpckhdq_xmm, gen_op_punpckhqdq_xmm },
  2259 + [0x16] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movhps, movhpd, movshdup */
  2260 + [0x17] = { SSE_SPECIAL, SSE_SPECIAL }, /* movhps, movhpd */
  2261 +
  2262 + [0x28] = { SSE_SPECIAL, SSE_SPECIAL }, /* movaps, movapd */
  2263 + [0x29] = { SSE_SPECIAL, SSE_SPECIAL }, /* movaps, movapd */
  2264 + [0x2a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */
  2265 + [0x2b] = { SSE_SPECIAL, SSE_SPECIAL }, /* movntps, movntpd */
  2266 + [0x2c] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si */
  2267 + [0x2d] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */
  2268 + [0x2e] = { gen_op_ucomiss, gen_op_ucomisd },
  2269 + [0x2f] = { gen_op_comiss, gen_op_comisd },
  2270 + [0x50] = { SSE_SPECIAL, SSE_SPECIAL }, /* movmskps, movmskpd */
  2271 + [0x51] = SSE_FOP(sqrt),
  2272 + [0x52] = { gen_op_rsqrtps, NULL, gen_op_rsqrtss, NULL },
  2273 + [0x53] = { gen_op_rcpps, NULL, gen_op_rcpss, NULL },
  2274 + [0x54] = { gen_op_pand_xmm, gen_op_pand_xmm }, /* andps, andpd */
  2275 + [0x55] = { gen_op_pandn_xmm, gen_op_pandn_xmm }, /* andnps, andnpd */
  2276 + [0x56] = { gen_op_por_xmm, gen_op_por_xmm }, /* orps, orpd */
  2277 + [0x57] = { gen_op_pxor_xmm, gen_op_pxor_xmm }, /* xorps, xorpd */
  2278 + [0x58] = SSE_FOP(add),
  2279 + [0x59] = SSE_FOP(mul),
  2280 + [0x5a] = { gen_op_cvtps2pd, gen_op_cvtpd2ps,
  2281 + gen_op_cvtss2sd, gen_op_cvtsd2ss },
  2282 + [0x5b] = { gen_op_cvtdq2ps, gen_op_cvtps2dq, gen_op_cvttps2dq },
  2283 + [0x5c] = SSE_FOP(sub),
  2284 + [0x5d] = SSE_FOP(min),
  2285 + [0x5e] = SSE_FOP(div),
  2286 + [0x5f] = SSE_FOP(max),
  2287 +
  2288 + [0xc2] = SSE_FOP(cmpeq),
  2289 + [0xc6] = { (GenOpFunc2 *)gen_op_pshufd_xmm, (GenOpFunc2 *)gen_op_shufpd },
  2290 +
  2291 + /* MMX ops and their SSE extensions */
  2292 + [0x60] = MMX_OP2(punpcklbw),
  2293 + [0x61] = MMX_OP2(punpcklwd),
  2294 + [0x62] = MMX_OP2(punpckldq),
  2295 + [0x63] = MMX_OP2(packsswb),
  2296 + [0x64] = MMX_OP2(pcmpgtb),
  2297 + [0x65] = MMX_OP2(pcmpgtw),
  2298 + [0x66] = MMX_OP2(pcmpgtl),
  2299 + [0x67] = MMX_OP2(packuswb),
  2300 + [0x68] = MMX_OP2(punpckhbw),
  2301 + [0x69] = MMX_OP2(punpckhwd),
  2302 + [0x6a] = MMX_OP2(punpckhdq),
  2303 + [0x6b] = MMX_OP2(packssdw),
  2304 + [0x6c] = { NULL, gen_op_punpcklqdq_xmm },
  2305 + [0x6d] = { NULL, gen_op_punpckhqdq_xmm },
  2306 + [0x6e] = { SSE_SPECIAL, SSE_SPECIAL }, /* movd mm, ea */
  2307 + [0x6f] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movq, movdqa, , movqdu */
  2308 + [0x70] = { (GenOpFunc2 *)gen_op_pshufw_mmx,
  2309 + (GenOpFunc2 *)gen_op_pshufd_xmm,
  2310 + (GenOpFunc2 *)gen_op_pshufhw_xmm,
  2311 + (GenOpFunc2 *)gen_op_pshuflw_xmm },
  2312 + [0x71] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftw */
  2313 + [0x72] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftd */
  2314 + [0x73] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftq */
  2315 + [0x74] = MMX_OP2(pcmpeqb),
  2316 + [0x75] = MMX_OP2(pcmpeqw),
  2317 + [0x76] = MMX_OP2(pcmpeql),
  2318 + [0x77] = { SSE_SPECIAL }, /* emms */
  2319 + [0x7c] = { NULL, gen_op_haddpd, NULL, gen_op_haddps },
  2320 + [0x7d] = { NULL, gen_op_hsubpd, NULL, gen_op_hsubps },
  2321 + [0x7e] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movd, movd, , movq */
  2322 + [0x7f] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movq, movdqa, movdqu */
  2323 + [0xc4] = { SSE_SPECIAL, SSE_SPECIAL }, /* pinsrw */
  2324 + [0xc5] = { SSE_SPECIAL, SSE_SPECIAL }, /* pextrw */
  2325 + [0xd0] = { NULL, gen_op_addsubpd, NULL, gen_op_addsubps },
  2326 + [0xd1] = MMX_OP2(psrlw),
  2327 + [0xd2] = MMX_OP2(psrld),
  2328 + [0xd3] = MMX_OP2(psrlq),
  2329 + [0xd4] = MMX_OP2(paddq),
  2330 + [0xd5] = MMX_OP2(pmullw),
  2331 + [0xd6] = { NULL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
  2332 + [0xd7] = { SSE_SPECIAL, SSE_SPECIAL }, /* pmovmskb */
  2333 + [0xd8] = MMX_OP2(psubusb),
  2334 + [0xd9] = MMX_OP2(psubusw),
  2335 + [0xda] = MMX_OP2(pminub),
  2336 + [0xdb] = MMX_OP2(pand),
  2337 + [0xdc] = MMX_OP2(paddusb),
  2338 + [0xdd] = MMX_OP2(paddusw),
  2339 + [0xde] = MMX_OP2(pmaxub),
  2340 + [0xdf] = MMX_OP2(pandn),
  2341 + [0xe0] = MMX_OP2(pavgb),
  2342 + [0xe1] = MMX_OP2(psraw),
  2343 + [0xe2] = MMX_OP2(psrad),
  2344 + [0xe3] = MMX_OP2(pavgw),
  2345 + [0xe4] = MMX_OP2(pmulhuw),
  2346 + [0xe5] = MMX_OP2(pmulhw),
  2347 + [0xe6] = { NULL, gen_op_cvttpd2dq, gen_op_cvtdq2pd, gen_op_cvtpd2dq },
  2348 + [0xe7] = { SSE_SPECIAL , SSE_SPECIAL }, /* movntq, movntq */
  2349 + [0xe8] = MMX_OP2(psubsb),
  2350 + [0xe9] = MMX_OP2(psubsw),
  2351 + [0xea] = MMX_OP2(pminsw),
  2352 + [0xeb] = MMX_OP2(por),
  2353 + [0xec] = MMX_OP2(paddsb),
  2354 + [0xed] = MMX_OP2(paddsw),
  2355 + [0xee] = MMX_OP2(pmaxsw),
  2356 + [0xef] = MMX_OP2(pxor),
  2357 + [0xf0] = { NULL, NULL, NULL, SSE_SPECIAL }, /* lddqu (PNI) */
  2358 + [0xf1] = MMX_OP2(psllw),
  2359 + [0xf2] = MMX_OP2(pslld),
  2360 + [0xf3] = MMX_OP2(psllq),
  2361 + [0xf4] = MMX_OP2(pmuludq),
  2362 + [0xf5] = MMX_OP2(pmaddwd),
  2363 + [0xf6] = MMX_OP2(psadbw),
  2364 + [0xf7] = MMX_OP2(maskmov),
  2365 + [0xf8] = MMX_OP2(psubb),
  2366 + [0xf9] = MMX_OP2(psubw),
  2367 + [0xfa] = MMX_OP2(psubl),
  2368 + [0xfb] = MMX_OP2(psubq),
  2369 + [0xfc] = MMX_OP2(paddb),
  2370 + [0xfd] = MMX_OP2(paddw),
  2371 + [0xfe] = MMX_OP2(paddl),
  2372 +};
  2373 +
  2374 +static GenOpFunc2 *sse_op_table2[3 * 8][2] = {
  2375 + [0 + 2] = MMX_OP2(psrlw),
  2376 + [0 + 4] = MMX_OP2(psraw),
  2377 + [0 + 6] = MMX_OP2(psllw),
  2378 + [8 + 2] = MMX_OP2(psrld),
  2379 + [8 + 4] = MMX_OP2(psrad),
  2380 + [8 + 6] = MMX_OP2(pslld),
  2381 + [16 + 2] = MMX_OP2(psrlq),
  2382 + [16 + 3] = { NULL, gen_op_psrldq_xmm },
  2383 + [16 + 6] = MMX_OP2(psllq),
  2384 + [16 + 7] = { NULL, gen_op_pslldq_xmm },
  2385 +};
  2386 +
  2387 +static GenOpFunc1 *sse_op_table3[4 * 3] = {
  2388 + gen_op_cvtsi2ss,
  2389 + gen_op_cvtsi2sd,
  2390 + X86_64_ONLY(gen_op_cvtsq2ss),
  2391 + X86_64_ONLY(gen_op_cvtsq2sd),
  2392 +
  2393 + gen_op_cvttss2si,
  2394 + gen_op_cvttsd2si,
  2395 + X86_64_ONLY(gen_op_cvttss2sq),
  2396 + X86_64_ONLY(gen_op_cvttsd2sq),
  2397 +
  2398 + gen_op_cvtss2si,
  2399 + gen_op_cvtsd2si,
  2400 + X86_64_ONLY(gen_op_cvtss2sq),
  2401 + X86_64_ONLY(gen_op_cvtsd2sq),
  2402 +};
  2403 +
  2404 +static GenOpFunc2 *sse_op_table4[8][4] = {
  2405 + SSE_FOP(cmpeq),
  2406 + SSE_FOP(cmplt),
  2407 + SSE_FOP(cmple),
  2408 + SSE_FOP(cmpunord),
  2409 + SSE_FOP(cmpneq),
  2410 + SSE_FOP(cmpnlt),
  2411 + SSE_FOP(cmpnle),
  2412 + SSE_FOP(cmpord),
  2413 +};
  2414 +
  2415 +static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
  2416 +{
  2417 + int b1, op1_offset, op2_offset, is_xmm, val, ot;
  2418 + int modrm, mod, rm, reg, reg_addr, offset_addr;
  2419 + GenOpFunc2 *sse_op2;
  2420 + GenOpFunc3 *sse_op3;
  2421 +
  2422 + b &= 0xff;
  2423 + if (s->prefix & PREFIX_DATA)
  2424 + b1 = 1;
  2425 + else if (s->prefix & PREFIX_REPZ)
  2426 + b1 = 2;
  2427 + else if (s->prefix & PREFIX_REPNZ)
  2428 + b1 = 3;
  2429 + else
  2430 + b1 = 0;
  2431 + sse_op2 = sse_op_table1[b][b1];
  2432 + if (!sse_op2)
  2433 + goto illegal_op;
  2434 + if (b <= 0x5f || b == 0xc6 || b == 0xc2) {
  2435 + is_xmm = 1;
  2436 + } else {
  2437 + if (b1 == 0) {
  2438 + /* MMX case */
  2439 + is_xmm = 0;
  2440 + } else {
  2441 + is_xmm = 1;
  2442 + }
  2443 + }
  2444 + /* simple MMX/SSE operation */
  2445 + if (s->flags & HF_TS_MASK) {
  2446 + gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
  2447 + return;
  2448 + }
  2449 + if (s->flags & HF_EM_MASK) {
  2450 + illegal_op:
  2451 + gen_exception(s, EXCP06_ILLOP, pc_start - s->cs_base);
  2452 + return;
  2453 + }
  2454 + if (is_xmm && !(s->flags & HF_OSFXSR_MASK))
  2455 + goto illegal_op;
  2456 + if (b == 0x77) {
  2457 + /* emms */
  2458 + gen_op_emms();
  2459 + return;
  2460 + }
  2461 + /* prepare MMX state (XXX: optimize by storing fptt and fptags in
  2462 + the static cpu state) */
  2463 + if (!is_xmm) {
  2464 + gen_op_enter_mmx();
  2465 + }
  2466 +
  2467 + modrm = ldub_code(s->pc++);
  2468 + reg = ((modrm >> 3) & 7);
  2469 + if (is_xmm)
  2470 + reg |= rex_r;
  2471 + mod = (modrm >> 6) & 3;
  2472 + if (sse_op2 == SSE_SPECIAL) {
  2473 + b |= (b1 << 8);
  2474 + switch(b) {
  2475 + case 0x0e7: /* movntq */
  2476 + if (mod == 3)
  2477 + goto illegal_op;
  2478 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2479 + gen_stq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,fpregs[reg].mmx));
  2480 + break;
  2481 + case 0x1e7: /* movntdq */
  2482 + case 0x02b: /* movntps */
  2483 + case 0x12b: /* movntps */
  2484 + case 0x2f0: /* lddqu */
  2485 + if (mod == 3)
  2486 + goto illegal_op;
  2487 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2488 + gen_sto_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg]));
  2489 + break;
  2490 + case 0x6e: /* movd mm, ea */
  2491 + gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 0);
  2492 + gen_op_movl_mm_T0_mmx(offsetof(CPUX86State,fpregs[reg].mmx));
  2493 + break;
  2494 + case 0x16e: /* movd xmm, ea */
  2495 + gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 0);
  2496 + gen_op_movl_mm_T0_xmm(offsetof(CPUX86State,xmm_regs[reg]));
  2497 + break;
  2498 + case 0x6f: /* movq mm, ea */
  2499 + if (mod != 3) {
  2500 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2501 + gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,fpregs[reg].mmx));
  2502 + } else {
  2503 + rm = (modrm & 7);
  2504 + gen_op_movq(offsetof(CPUX86State,fpregs[reg].mmx),
  2505 + offsetof(CPUX86State,fpregs[rm].mmx));
  2506 + }
  2507 + break;
  2508 + case 0x010: /* movups */
  2509 + case 0x110: /* movupd */
  2510 + case 0x028: /* movaps */
  2511 + case 0x128: /* movapd */
  2512 + case 0x16f: /* movdqa xmm, ea */
  2513 + case 0x26f: /* movdqu xmm, ea */
  2514 + if (mod != 3) {
  2515 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2516 + gen_ldo_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg]));
  2517 + } else {
  2518 + rm = (modrm & 7) | REX_B(s);
  2519 + gen_op_movo(offsetof(CPUX86State,xmm_regs[reg]),
  2520 + offsetof(CPUX86State,xmm_regs[rm]));
  2521 + }
  2522 + break;
  2523 + case 0x210: /* movss xmm, ea */
  2524 + if (mod != 3) {
  2525 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2526 + gen_op_ld_T0_A0[OT_LONG + s->mem_index]();
  2527 + gen_op_movl_env_T0(offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)));
  2528 + gen_op_movl_T0_0();
  2529 + gen_op_movl_env_T0(offsetof(CPUX86State,xmm_regs[reg].XMM_L(1)));
  2530 + gen_op_movl_env_T0(offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)));
  2531 + gen_op_movl_env_T0(offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)));
  2532 + } else {
  2533 + rm = (modrm & 7) | REX_B(s);
  2534 + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)),
  2535 + offsetof(CPUX86State,xmm_regs[rm].XMM_L(0)));
  2536 + }
  2537 + break;
  2538 + case 0x310: /* movsd xmm, ea */
  2539 + if (mod != 3) {
  2540 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2541 + gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
  2542 + gen_op_movl_T0_0();
  2543 + gen_op_movl_env_T0(offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)));
  2544 + gen_op_movl_env_T0(offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)));
  2545 + } else {
  2546 + rm = (modrm & 7) | REX_B(s);
  2547 + gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)),
  2548 + offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)));
  2549 + }
  2550 + break;
  2551 + case 0x012: /* movlps */
  2552 + case 0x112: /* movlpd */
  2553 + if (mod != 3) {
  2554 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2555 + gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
  2556 + } else {
  2557 + /* movhlps */
  2558 + rm = (modrm & 7) | REX_B(s);
  2559 + gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)),
  2560 + offsetof(CPUX86State,xmm_regs[rm].XMM_Q(1)));
  2561 + }
  2562 + break;
  2563 + case 0x016: /* movhps */
  2564 + case 0x116: /* movhpd */
  2565 + if (mod != 3) {
  2566 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2567 + gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)));
  2568 + } else {
  2569 + /* movlhps */
  2570 + rm = (modrm & 7) | REX_B(s);
  2571 + gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)),
  2572 + offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)));
  2573 + }
  2574 + break;
  2575 + case 0x216: /* movshdup */
  2576 + if (mod != 3) {
  2577 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2578 + gen_ldo_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg]));
  2579 + } else {
  2580 + rm = (modrm & 7) | REX_B(s);
  2581 + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(1)),
  2582 + offsetof(CPUX86State,xmm_regs[rm].XMM_L(1)));
  2583 + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)),
  2584 + offsetof(CPUX86State,xmm_regs[rm].XMM_L(3)));
  2585 + }
  2586 + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)),
  2587 + offsetof(CPUX86State,xmm_regs[reg].XMM_L(1)));
  2588 + gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)),
  2589 + offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)));
  2590 + break;
  2591 + case 0x7e: /* movd ea, mm */
  2592 + gen_op_movl_T0_mm_mmx(offsetof(CPUX86State,fpregs[reg].mmx));
  2593 + gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 1);
  2594 + break;
  2595 + case 0x17e: /* movd ea, xmm */
  2596 + gen_op_movl_T0_mm_xmm(offsetof(CPUX86State,xmm_regs[reg]));
  2597 + gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 1);
  2598 + break;
  2599 + case 0x27e: /* movq xmm, ea */
  2600 + if (mod != 3) {
  2601 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2602 + gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
  2603 + } else {
  2604 + rm = (modrm & 7) | REX_B(s);
  2605 + gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)),
  2606 + offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)));
  2607 + }
  2608 + gen_op_movq_env_0(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)));
  2609 + break;
  2610 + case 0x7f: /* movq ea, mm */
  2611 + if (mod != 3) {
  2612 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2613 + gen_stq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,fpregs[reg].mmx));
  2614 + } else {
  2615 + rm = (modrm & 7);
  2616 + gen_op_movq(offsetof(CPUX86State,fpregs[rm].mmx),
  2617 + offsetof(CPUX86State,fpregs[reg].mmx));
  2618 + }
  2619 + break;
  2620 + case 0x011: /* movups */
  2621 + case 0x111: /* movupd */
  2622 + case 0x029: /* movaps */
  2623 + case 0x129: /* movapd */
  2624 + case 0x17f: /* movdqa ea, xmm */
  2625 + case 0x27f: /* movdqu ea, xmm */
  2626 + if (mod != 3) {
  2627 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2628 + gen_sto_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg]));
  2629 + } else {
  2630 + rm = (modrm & 7) | REX_B(s);
  2631 + gen_op_movo(offsetof(CPUX86State,xmm_regs[rm]),
  2632 + offsetof(CPUX86State,xmm_regs[reg]));
  2633 + }
  2634 + break;
  2635 + case 0x211: /* movss ea, xmm */
  2636 + if (mod != 3) {
  2637 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2638 + gen_op_movl_T0_env(offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)));
  2639 + gen_op_st_T0_A0[OT_LONG + s->mem_index]();
  2640 + } else {
  2641 + rm = (modrm & 7) | REX_B(s);
  2642 + gen_op_movl(offsetof(CPUX86State,xmm_regs[rm].XMM_L(0)),
  2643 + offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)));
  2644 + }
  2645 + break;
  2646 + case 0x311: /* movsd ea, xmm */
  2647 + if (mod != 3) {
  2648 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2649 + gen_stq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
  2650 + } else {
  2651 + rm = (modrm & 7) | REX_B(s);
  2652 + gen_op_movq(offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)),
  2653 + offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
  2654 + }
  2655 + break;
  2656 + case 0x013: /* movlps */
  2657 + case 0x113: /* movlpd */
  2658 + if (mod != 3) {
  2659 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2660 + gen_stq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
  2661 + } else {
  2662 + goto illegal_op;
  2663 + }
  2664 + break;
  2665 + case 0x017: /* movhps */
  2666 + case 0x117: /* movhpd */
  2667 + if (mod != 3) {
  2668 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2669 + gen_stq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)));
  2670 + } else {
  2671 + goto illegal_op;
  2672 + }
  2673 + break;
  2674 + case 0x71: /* shift mm, im */
  2675 + case 0x72:
  2676 + case 0x73:
  2677 + case 0x171: /* shift xmm, im */
  2678 + case 0x172:
  2679 + case 0x173:
  2680 + val = ldub_code(s->pc++);
  2681 + if (is_xmm) {
  2682 + gen_op_movl_T0_im(val);
  2683 + gen_op_movl_env_T0(offsetof(CPUX86State,xmm_t0.XMM_L(0)));
  2684 + gen_op_movl_T0_0();
  2685 + gen_op_movl_env_T0(offsetof(CPUX86State,xmm_t0.XMM_L(1)));
  2686 + op1_offset = offsetof(CPUX86State,xmm_t0);
  2687 + } else {
  2688 + gen_op_movl_T0_im(val);
  2689 + gen_op_movl_env_T0(offsetof(CPUX86State,mmx_t0.MMX_L(0)));
  2690 + gen_op_movl_T0_0();
  2691 + gen_op_movl_env_T0(offsetof(CPUX86State,mmx_t0.MMX_L(1)));
  2692 + op1_offset = offsetof(CPUX86State,mmx_t0);
  2693 + }
  2694 + sse_op2 = sse_op_table2[((b - 1) & 3) * 8 + (((modrm >> 3)) & 7)][b1];
  2695 + if (!sse_op2)
  2696 + goto illegal_op;
  2697 + if (is_xmm) {
  2698 + rm = (modrm & 7) | REX_B(s);
  2699 + op2_offset = offsetof(CPUX86State,xmm_regs[rm]);
  2700 + } else {
  2701 + rm = (modrm & 7);
  2702 + op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
  2703 + }
  2704 + sse_op2(op2_offset, op1_offset);
  2705 + break;
  2706 + case 0x050: /* movmskps */
  2707 + gen_op_movmskps(offsetof(CPUX86State,xmm_regs[reg]));
  2708 + rm = (modrm & 7) | REX_B(s);
  2709 + gen_op_mov_reg_T0[OT_LONG][rm]();
  2710 + break;
  2711 + case 0x150: /* movmskpd */
  2712 + gen_op_movmskpd(offsetof(CPUX86State,xmm_regs[reg]));
  2713 + rm = (modrm & 7) | REX_B(s);
  2714 + gen_op_mov_reg_T0[OT_LONG][rm]();
  2715 + break;
  2716 + case 0x02a: /* cvtpi2ps */
  2717 + case 0x12a: /* cvtpi2pd */
  2718 + gen_op_enter_mmx();
  2719 + if (mod != 3) {
  2720 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2721 + op2_offset = offsetof(CPUX86State,mmx_t0);
  2722 + gen_ldq_env_A0[s->mem_index >> 2](op2_offset);
  2723 + } else {
  2724 + rm = (modrm & 7);
  2725 + op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
  2726 + }
  2727 + op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
  2728 + switch(b >> 8) {
  2729 + case 0x0:
  2730 + gen_op_cvtpi2ps(op1_offset, op2_offset);
  2731 + break;
  2732 + default:
  2733 + case 0x1:
  2734 + gen_op_cvtpi2pd(op1_offset, op2_offset);
  2735 + break;
  2736 + }
  2737 + break;
  2738 + case 0x22a: /* cvtsi2ss */
  2739 + case 0x32a: /* cvtsi2sd */
  2740 + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
  2741 + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
  2742 + op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
  2743 + sse_op_table3[(s->dflag == 2) * 2 + ((b >> 8) - 2)](op1_offset);
  2744 + break;
  2745 + case 0x02c: /* cvttps2pi */
  2746 + case 0x12c: /* cvttpd2pi */
  2747 + case 0x02d: /* cvtps2pi */
  2748 + case 0x12d: /* cvtpd2pi */
  2749 + gen_op_enter_mmx();
  2750 + if (mod != 3) {
  2751 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2752 + op2_offset = offsetof(CPUX86State,xmm_t0);
  2753 + gen_ldo_env_A0[s->mem_index >> 2](op2_offset);
  2754 + } else {
  2755 + rm = (modrm & 7) | REX_B(s);
  2756 + op2_offset = offsetof(CPUX86State,xmm_regs[rm]);
  2757 + }
  2758 + op1_offset = offsetof(CPUX86State,fpregs[reg & 7].mmx);
  2759 + switch(b) {
  2760 + case 0x02c:
  2761 + gen_op_cvttps2pi(op1_offset, op2_offset);
  2762 + break;
  2763 + case 0x12c:
  2764 + gen_op_cvttpd2pi(op1_offset, op2_offset);
  2765 + break;
  2766 + case 0x02d:
  2767 + gen_op_cvtps2pi(op1_offset, op2_offset);
  2768 + break;
  2769 + case 0x12d:
  2770 + gen_op_cvtpd2pi(op1_offset, op2_offset);
  2771 + break;
  2772 + }
  2773 + break;
  2774 + case 0x22c: /* cvttss2si */
  2775 + case 0x32c: /* cvttsd2si */
  2776 + case 0x22d: /* cvtss2si */
  2777 + case 0x32d: /* cvtsd2si */
  2778 + ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
  2779 + op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
  2780 + sse_op_table3[(s->dflag == 2) * 2 + ((b >> 8) - 2) + 4 +
  2781 + (b & 1) * 4](op1_offset);
  2782 + gen_ldst_modrm(s, modrm, ot, OR_TMP0, 1);
  2783 + break;
  2784 + case 0xc4: /* pinsrw */
  2785 + case 0x1c4:
  2786 + gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0);
  2787 + val = ldub_code(s->pc++);
  2788 + if (b1) {
  2789 + val &= 7;
  2790 + gen_op_pinsrw_xmm(offsetof(CPUX86State,xmm_regs[reg]), val);
  2791 + } else {
  2792 + val &= 3;
  2793 + gen_op_pinsrw_mmx(offsetof(CPUX86State,fpregs[reg].mmx), val);
  2794 + }
  2795 + break;
  2796 + case 0xc5: /* pextrw */
  2797 + case 0x1c5:
  2798 + if (mod != 3)
  2799 + goto illegal_op;
  2800 + val = ldub_code(s->pc++);
  2801 + if (b1) {
  2802 + val &= 7;
  2803 + rm = (modrm & 7) | REX_B(s);
  2804 + gen_op_pextrw_xmm(offsetof(CPUX86State,xmm_regs[rm]), val);
  2805 + } else {
  2806 + val &= 3;
  2807 + rm = (modrm & 7);
  2808 + gen_op_pextrw_mmx(offsetof(CPUX86State,fpregs[rm].mmx), val);
  2809 + }
  2810 + reg = ((modrm >> 3) & 7) | rex_r;
  2811 + gen_op_mov_reg_T0[OT_LONG][reg]();
  2812 + break;
  2813 + case 0x1d6: /* movq ea, xmm */
  2814 + if (mod != 3) {
  2815 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2816 + gen_stq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
  2817 + } else {
  2818 + rm = (modrm & 7) | REX_B(s);
  2819 + gen_op_movq(offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)),
  2820 + offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
  2821 + gen_op_movq_env_0(offsetof(CPUX86State,xmm_regs[rm].XMM_Q(1)));
  2822 + }
  2823 + break;
  2824 + case 0x2d6: /* movq2dq */
  2825 + gen_op_enter_mmx();
  2826 + rm = (modrm & 7) | REX_B(s);
  2827 + gen_op_movq(offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)),
  2828 + offsetof(CPUX86State,fpregs[reg & 7].mmx));
  2829 + gen_op_movq_env_0(offsetof(CPUX86State,xmm_regs[rm].XMM_Q(1)));
  2830 + break;
  2831 + case 0x3d6: /* movdq2q */
  2832 + gen_op_enter_mmx();
  2833 + rm = (modrm & 7);
  2834 + gen_op_movq(offsetof(CPUX86State,fpregs[rm].mmx),
  2835 + offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
  2836 + break;
  2837 + case 0xd7: /* pmovmskb */
  2838 + case 0x1d7:
  2839 + if (mod != 3)
  2840 + goto illegal_op;
  2841 + if (b1) {
  2842 + rm = (modrm & 7) | REX_B(s);
  2843 + gen_op_pmovmskb_xmm(offsetof(CPUX86State,xmm_regs[rm]));
  2844 + } else {
  2845 + rm = (modrm & 7);
  2846 + gen_op_pmovmskb_mmx(offsetof(CPUX86State,fpregs[rm].mmx));
  2847 + }
  2848 + reg = ((modrm >> 3) & 7) | rex_r;
  2849 + gen_op_mov_reg_T0[OT_LONG][reg]();
  2850 + break;
  2851 + default:
  2852 + goto illegal_op;
  2853 + }
  2854 + } else {
  2855 + /* generic MMX or SSE operation */
  2856 + if (b == 0xf7) {
  2857 + /* maskmov : we must prepare A0 */
  2858 + if (mod != 3)
  2859 + goto illegal_op;
  2860 +#ifdef TARGET_X86_64
  2861 + if (CODE64(s)) {
  2862 + gen_op_movq_A0_reg[R_EDI]();
  2863 + } else
  2864 +#endif
  2865 + {
  2866 + gen_op_movl_A0_reg[R_EDI]();
  2867 + if (s->aflag == 0)
  2868 + gen_op_andl_A0_ffff();
  2869 + }
  2870 + gen_add_A0_ds_seg(s);
  2871 + }
  2872 + if (is_xmm) {
  2873 + op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
  2874 + if (mod != 3) {
  2875 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2876 + op2_offset = offsetof(CPUX86State,xmm_t0);
  2877 + if (b1 >= 2 && ((b >= 0x50 && b <= 0x5f) ||
  2878 + b == 0xc2)) {
  2879 + /* specific case for SSE single instructions */
  2880 + if (b1 == 2) {
  2881 + /* 32 bit access */
  2882 + gen_op_ld_T0_A0[OT_LONG + s->mem_index]();
  2883 + gen_op_movl_env_T0(offsetof(CPUX86State,xmm_t0.XMM_L(0)));
  2884 + } else {
  2885 + /* 64 bit access */
  2886 + gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_t0.XMM_D(0)));
  2887 + }
  2888 + } else {
  2889 + gen_ldo_env_A0[s->mem_index >> 2](op2_offset);
  2890 + }
  2891 + } else {
  2892 + rm = (modrm & 7) | REX_B(s);
  2893 + op2_offset = offsetof(CPUX86State,xmm_regs[rm]);
  2894 + }
  2895 + } else {
  2896 + op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
  2897 + if (mod != 3) {
  2898 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  2899 + op2_offset = offsetof(CPUX86State,mmx_t0);
  2900 + gen_ldq_env_A0[s->mem_index >> 2](op2_offset);
  2901 + } else {
  2902 + rm = (modrm & 7);
  2903 + op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
  2904 + }
  2905 + }
  2906 + switch(b) {
  2907 + case 0x70: /* pshufx insn */
  2908 + case 0xc6: /* pshufx insn */
  2909 + val = ldub_code(s->pc++);
  2910 + sse_op3 = (GenOpFunc3 *)sse_op2;
  2911 + sse_op3(op1_offset, op2_offset, val);
  2912 + break;
  2913 + case 0xc2:
  2914 + /* compare insns */
  2915 + val = ldub_code(s->pc++);
  2916 + if (val >= 8)
  2917 + goto illegal_op;
  2918 + sse_op2 = sse_op_table4[val][b1];
  2919 + sse_op2(op1_offset, op2_offset);
  2920 + break;
  2921 + default:
  2922 + sse_op2(op1_offset, op2_offset);
  2923 + break;
  2924 + }
  2925 + if (b == 0x2e || b == 0x2f) {
  2926 + s->cc_op = CC_OP_EFLAGS;
  2927 + }
  2928 + }
  2929 +}
  2930 +
  2931 +
2212 2932 /* convert one instruction. s->is_jmp is set if the translation must
2213 2933 be stopped. Return the next pc value */
2214 2934 static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
... ... @@ -3176,20 +3896,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
3176 3896 }
3177 3897 gen_op_movl_A0_im(offset_addr);
3178 3898 }
3179   - /* handle override */
3180   - {
3181   - int override, must_add_seg;
3182   - must_add_seg = s->addseg;
3183   - if (s->override >= 0) {
3184   - override = s->override;
3185   - must_add_seg = 1;
3186   - } else {
3187   - override = R_DS;
3188   - }
3189   - if (must_add_seg) {
3190   - gen_op_addl_A0_seg(offsetof(CPUX86State,segs[override].base));
3191   - }
3192   - }
  3899 + gen_add_A0_ds_seg(s);
3193 3900 if ((b & 2) == 0) {
3194 3901 gen_op_ld_T0_A0[ot + s->mem_index]();
3195 3902 gen_op_mov_reg_T0[ot][R_EAX]();
... ... @@ -3212,21 +3919,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
3212 3919 if (s->aflag == 0)
3213 3920 gen_op_andl_A0_ffff();
3214 3921 }
3215   - /* handle override */
3216   - {
3217   - int override, must_add_seg;
3218   - must_add_seg = s->addseg;
3219   - override = R_DS;
3220   - if (s->override >= 0) {
3221   - override = s->override;
3222   - must_add_seg = 1;
3223   - } else {
3224   - override = R_DS;
3225   - }
3226   - if (must_add_seg) {
3227   - gen_op_addl_A0_seg(offsetof(CPUX86State,segs[override].base));
3228   - }
3229   - }
  3922 + gen_add_A0_ds_seg(s);
3230 3923 gen_op_ldu_T0_A0[OT_BYTE + s->mem_index]();
3231 3924 gen_op_mov_reg_T0[OT_BYTE][R_EAX]();
3232 3925 break;
... ... @@ -4827,33 +5520,6 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
4827 5520 /* nothing to do */
4828 5521 }
4829 5522 break;
4830   - case 0x1ae:
4831   - modrm = ldub_code(s->pc++);
4832   - mod = (modrm >> 6) & 3;
4833   - op = (modrm >> 3) & 7;
4834   - switch(op) {
4835   - case 0: /* fxsave */
4836   - if (mod == 3 || !(s->cpuid_features & CPUID_FXSR))
4837   - goto illegal_op;
4838   - gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
4839   - gen_op_fxsave_A0((s->dflag == 2));
4840   - break;
4841   - case 1: /* fxrstor */
4842   - if (mod == 3 || !(s->cpuid_features & CPUID_FXSR))
4843   - goto illegal_op;
4844   - gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
4845   - gen_op_fxrstor_A0((s->dflag == 2));
4846   - break;
4847   - case 5: /* lfence */
4848   - case 6: /* mfence */
4849   - case 7: /* sfence */
4850   - if ((modrm & 0xc7) != 0xc0 || !(s->cpuid_features & CPUID_SSE))
4851   - goto illegal_op;
4852   - break;
4853   - default:
4854   - goto illegal_op;
4855   - }
4856   - break;
4857 5523 case 0x63: /* arpl or movslS (x86_64) */
4858 5524 #ifdef TARGET_X86_64
4859 5525 if (CODE64(s)) {
... ... @@ -5018,65 +5684,73 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
5018 5684 gen_eob(s);
5019 5685 }
5020 5686 break;
5021   - /* SSE support */
5022   - case 0x16f:
5023   - if (prefixes & PREFIX_DATA) {
5024   - /* movdqa xmm1, xmm2/mem128 */
5025   - if (!(s->cpuid_features & CPUID_SSE))
5026   - goto illegal_op;
5027   - modrm = ldub_code(s->pc++);
5028   - reg = ((modrm >> 3) & 7) | rex_r;
5029   - mod = (modrm >> 6) & 3;
5030   - if (mod != 3) {
5031   - gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
5032   - gen_ldo_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg]));
5033   - } else {
5034   - rm = (modrm & 7) | REX_B(s);
5035   - gen_op_movo(offsetof(CPUX86State,xmm_regs[reg]),
5036   - offsetof(CPUX86State,xmm_regs[rm]));
5037   - }
5038   - } else {
  5687 + /* MMX/SSE/SSE2/PNI support */
  5688 + case 0x1c3: /* MOVNTI reg, mem */
  5689 + if (!(s->cpuid_features & CPUID_SSE2))
5039 5690 goto illegal_op;
5040   - }
  5691 + ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
  5692 + modrm = ldub_code(s->pc++);
  5693 + mod = (modrm >> 6) & 3;
  5694 + if (mod == 3)
  5695 + goto illegal_op;
  5696 + reg = ((modrm >> 3) & 7) | rex_r;
  5697 + /* generate a generic store */
  5698 + gen_ldst_modrm(s, modrm, ot, reg, 1);
5041 5699 break;
5042   - case 0x1e7:
5043   - if (prefixes & PREFIX_DATA) {
5044   - /* movntdq mem128, xmm1 */
5045   - if (!(s->cpuid_features & CPUID_SSE))
  5700 + case 0x1ae:
  5701 + modrm = ldub_code(s->pc++);
  5702 + mod = (modrm >> 6) & 3;
  5703 + op = (modrm >> 3) & 7;
  5704 + switch(op) {
  5705 + case 0: /* fxsave */
  5706 + if (mod == 3 || !(s->cpuid_features & CPUID_FXSR))
5046 5707 goto illegal_op;
5047   - modrm = ldub_code(s->pc++);
5048   - reg = ((modrm >> 3) & 7) | rex_r;
5049   - mod = (modrm >> 6) & 3;
5050   - if (mod != 3) {
5051   - gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
5052   - gen_sto_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg]));
5053   - } else {
  5708 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  5709 + gen_op_fxsave_A0((s->dflag == 2));
  5710 + break;
  5711 + case 1: /* fxrstor */
  5712 + if (mod == 3 || !(s->cpuid_features & CPUID_FXSR))
5054 5713 goto illegal_op;
  5714 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  5715 + gen_op_fxrstor_A0((s->dflag == 2));
  5716 + break;
  5717 + case 2: /* ldmxcsr */
  5718 + case 3: /* stmxcsr */
  5719 + if (s->flags & HF_TS_MASK) {
  5720 + gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
  5721 + break;
5055 5722 }
5056   - } else {
5057   - goto illegal_op;
5058   - }
5059   - break;
5060   - case 0x17f:
5061   - if (prefixes & PREFIX_DATA) {
5062   - /* movdqa xmm2/mem128, xmm1 */
5063   - if (!(s->cpuid_features & CPUID_SSE))
  5723 + if ((s->flags & HF_EM_MASK) || !(s->flags & HF_OSFXSR_MASK) ||
  5724 + mod == 3)
5064 5725 goto illegal_op;
5065   - modrm = ldub_code(s->pc++);
5066   - reg = ((modrm >> 3) & 7) | rex_r;
5067   - mod = (modrm >> 6) & 3;
5068   - if (mod != 3) {
5069   - gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
5070   - gen_sto_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg]));
  5726 + gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
  5727 + if (op == 2) {
  5728 + gen_op_ld_T0_A0[OT_LONG + s->mem_index]();
  5729 + gen_op_movl_env_T0(offsetof(CPUX86State, mxcsr));
5071 5730 } else {
5072   - rm = (modrm & 7) | REX_B(s);
5073   - gen_op_movo(offsetof(CPUX86State,xmm_regs[rm]),
5074   - offsetof(CPUX86State,xmm_regs[reg]));
  5731 + gen_op_movl_T0_env(offsetof(CPUX86State, mxcsr));
  5732 + gen_op_st_T0_A0[OT_LONG + s->mem_index]();
5075 5733 }
5076   - } else {
  5734 + break;
  5735 + case 5: /* lfence */
  5736 + case 6: /* mfence */
  5737 + case 7: /* sfence */
  5738 + if ((modrm & 0xc7) != 0xc0 || !(s->cpuid_features & CPUID_SSE))
  5739 + goto illegal_op;
  5740 + break;
  5741 + default:
5077 5742 goto illegal_op;
5078 5743 }
5079 5744 break;
  5745 + case 0x110 ... 0x117:
  5746 + case 0x128 ... 0x12f:
  5747 + case 0x150 ... 0x177:
  5748 + case 0x17c ... 0x17f:
  5749 + case 0x1c2:
  5750 + case 0x1c4 ... 0x1c6:
  5751 + case 0x1d0 ... 0x1fe:
  5752 + gen_sse(s, b, pc_start, rex_r);
  5753 + break;
5080 5754 default:
5081 5755 goto illegal_op;
5082 5756 }
... ... @@ -5250,6 +5924,12 @@ static uint16_t opc_write_flags[NB_OPS] = {
5250 5924 [INDEX_op_imull_T0_T1] = CC_OSZAPC,
5251 5925 X86_64_DEF([INDEX_op_imulq_T0_T1] = CC_OSZAPC,)
5252 5926  
  5927 + /* sse */
  5928 + [INDEX_op_ucomiss] = CC_OSZAPC,
  5929 + [INDEX_op_ucomisd] = CC_OSZAPC,
  5930 + [INDEX_op_comiss] = CC_OSZAPC,
  5931 + [INDEX_op_comisd] = CC_OSZAPC,
  5932 +
5253 5933 /* bcd */
5254 5934 [INDEX_op_aam] = CC_OSZAPC,
5255 5935 [INDEX_op_aad] = CC_OSZAPC,
... ...
... ... @@ -2082,15 +2082,14 @@ static void cpu_get_seg(QEMUFile *f, SegmentCache *dt)
2082 2082 void cpu_save(QEMUFile *f, void *opaque)
2083 2083 {
2084 2084 CPUState *env = opaque;
2085   - uint16_t fptag, fpus, fpuc;
  2085 + uint16_t fptag, fpus, fpuc, fpregs_format;
2086 2086 uint32_t hflags;
2087 2087 int i;
2088   -
  2088 +
2089 2089 for(i = 0; i < CPU_NB_REGS; i++)
2090 2090 qemu_put_betls(f, &env->regs[i]);
2091 2091 qemu_put_betls(f, &env->eip);
2092 2092 qemu_put_betls(f, &env->eflags);
2093   - qemu_put_betl(f, 0); /* XXX: suppress that */
2094 2093 hflags = env->hflags; /* XXX: suppress most of the redundant hflags */
2095 2094 qemu_put_be32s(f, &hflags);
2096 2095  
... ... @@ -2098,23 +2097,37 @@ void cpu_save(QEMUFile *f, void *opaque)
2098 2097 fpuc = env->fpuc;
2099 2098 fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2100 2099 fptag = 0;
2101   - for (i=7; i>=0; i--) {
2102   - fptag <<= 2;
2103   - if (env->fptags[i]) {
2104   - fptag |= 3;
2105   - }
  2100 + for(i = 0; i < 8; i++) {
  2101 + fptag |= ((!env->fptags[i]) << i);
2106 2102 }
2107 2103  
2108 2104 qemu_put_be16s(f, &fpuc);
2109 2105 qemu_put_be16s(f, &fpus);
2110 2106 qemu_put_be16s(f, &fptag);
2111 2107  
  2108 +#ifdef USE_X86LDOUBLE
  2109 + fpregs_format = 0;
  2110 +#else
  2111 + fpregs_format = 1;
  2112 +#endif
  2113 + qemu_put_be16s(f, &fpregs_format);
  2114 +
2112 2115 for(i = 0; i < 8; i++) {
2113 2116 uint64_t mant;
2114 2117 uint16_t exp;
2115   - cpu_get_fp80(&mant, &exp, env->fpregs[i]);
  2118 +#ifdef USE_X86LDOUBLE
  2119 + /* we save the real CPU data (in case of MMX usage only 'mant'
  2120 + contains the MMX register */
  2121 + cpu_get_fp80(&mant, &exp, env->fpregs[i].d);
2116 2122 qemu_put_be64(f, mant);
2117 2123 qemu_put_be16(f, exp);
  2124 +#else
  2125 + /* if we use doubles for float emulation, we save the doubles to
  2126 + avoid losing information in case of MMX usage. It can give
  2127 + problems if the image is restored on a CPU where long
  2128 + doubles are used instead. */
  2129 + qemu_put_be64(f, env->fpregs[i].xmm.MMX_Q(0));
  2130 +#endif
2118 2131 }
2119 2132  
2120 2133 for(i = 0; i < 6; i++)
... ... @@ -2139,12 +2152,14 @@ void cpu_save(QEMUFile *f, void *opaque)
2139 2152 /* MMU */
2140 2153 qemu_put_be32s(f, &env->a20_mask);
2141 2154  
2142   -#ifdef TARGET_X86_64
  2155 + /* XMM */
  2156 + qemu_put_be32s(f, &env->mxcsr);
2143 2157 for(i = 0; i < CPU_NB_REGS; i++) {
2144 2158 qemu_put_be64s(f, &env->xmm_regs[i].XMM_Q(0));
2145 2159 qemu_put_be64s(f, &env->xmm_regs[i].XMM_Q(1));
2146 2160 }
2147 2161  
  2162 +#ifdef TARGET_X86_64
2148 2163 qemu_put_be64s(f, &env->efer);
2149 2164 qemu_put_be64s(f, &env->star);
2150 2165 qemu_put_be64s(f, &env->lstar);
... ... @@ -2154,40 +2169,97 @@ void cpu_save(QEMUFile *f, void *opaque)
2154 2169 #endif
2155 2170 }
2156 2171  
/* XXX: add that in a FPU generic layer */
/*
 * Memory image of an 80 bit extended precision value: a 64 bit mantissa
 * (with explicit integer bit) followed by a 16 bit sign + exponent word.
 * The two fields must not overlap, hence the inner structure; the union
 * tag and the member names are kept so that existing users which declare
 * 'union x86_longdouble' and access p->mant / p->exp are unaffected.
 */
union x86_longdouble {
    struct {
        uint64_t mant;
        uint16_t exp;
    };
};

/* field extraction from the raw bits of an IEEE 754 double */
#define MANTD1(fp) ((fp) & ((1ULL << 52) - 1))      /* 52 bit fraction */
#define EXPBIAS1 1023                               /* exponent bias */
#define EXPD1(fp)  (((fp) >> 52) & 0x7FF)           /* biased exponent */
#define SIGND1(fp) (((fp) >> 32) & 0x80000000)      /* sign, at bit 31 */

/*
 * Convert the raw bits of a double ('temp') to the 80 bit format at 'p'.
 * Zero, denormals, infinities and NaNs are handled as well as normal
 * numbers; the conversion is exact since the target format is wider.
 */
static void fp64_to_fp80(union x86_longdouble *p, uint64_t temp)
{
    uint64_t frac = MANTD1(temp);
    int exp64 = (int)EXPD1(temp);
    int sign = (int)(SIGND1(temp) >> 16);   /* 0x8000 or 0 */
    int e;

    if (exp64 == 0x7FF) {
        /* infinity or NaN: maximum exponent, fraction bits preserved */
        p->mant = (frac << 11) | (1ULL << 63);
        e = 0x7FFF;
    } else if (exp64 != 0) {
        /* normal number: make the implicit integer bit explicit and
           rebias the exponent */
        p->mant = (frac << 11) | (1ULL << 63);
        e = exp64 - EXPBIAS1 + 16383;
    } else if (frac == 0) {
        /* signed zero */
        p->mant = 0;
        e = 0;
    } else {
        /* denormal double: it is a normal number in the wider format, so
           shift the fraction up until the integer bit is set */
        e = 1 - EXPBIAS1 + 16383;
        frac <<= 11;
        while (!(frac & (1ULL << 63))) {
            frac <<= 1;
            e--;
        }
        p->mant = frac;
    }
    p->exp = (uint16_t)(e | sign);
}
  2193 +
2157 2194 int cpu_load(QEMUFile *f, void *opaque, int version_id)
2158 2195 {
2159 2196 CPUState *env = opaque;
2160   - int i;
  2197 + int i, guess_mmx;
2161 2198 uint32_t hflags;
2162   - uint16_t fpus, fpuc, fptag;
  2199 + uint16_t fpus, fpuc, fptag, fpregs_format;
2163 2200  
2164   - if (version_id != 2)
  2201 + if (version_id != 3)
2165 2202 return -EINVAL;
2166 2203 for(i = 0; i < CPU_NB_REGS; i++)
2167 2204 qemu_get_betls(f, &env->regs[i]);
2168 2205 qemu_get_betls(f, &env->eip);
2169 2206 qemu_get_betls(f, &env->eflags);
2170   - qemu_get_betl(f); /* XXX: suppress that */
2171 2207 qemu_get_be32s(f, &hflags);
2172 2208  
2173 2209 qemu_get_be16s(f, &fpuc);
2174 2210 qemu_get_be16s(f, &fpus);
2175 2211 qemu_get_be16s(f, &fptag);
2176   -
  2212 + qemu_get_be16s(f, &fpregs_format);
  2213 +
  2214 + /* NOTE: we cannot always restore the FPU state if the image come
  2215 + from a host with a different 'USE_X86LDOUBLE' define. We guess
  2216 + if we are in an MMX state to restore correctly in that case. */
  2217 + guess_mmx = ((fptag == 0xff) && (fpus & 0x3800) == 0);
2177 2218 for(i = 0; i < 8; i++) {
2178 2219 uint64_t mant;
2179 2220 uint16_t exp;
2180   - mant = qemu_get_be64(f);
2181   - exp = qemu_get_be16(f);
2182   - env->fpregs[i] = cpu_set_fp80(mant, exp);
  2221 + union x86_longdouble *p;
  2222 +
  2223 + switch(fpregs_format) {
  2224 + case 0:
  2225 + mant = qemu_get_be64(f);
  2226 + exp = qemu_get_be16(f);
  2227 +#ifdef USE_X86LDOUBLE
  2228 + env->fpregs[i].d = cpu_set_fp80(mant, exp);
  2229 +#else
  2230 + /* difficult case */
  2231 + if (guess_mmx)
  2232 + env->fpregs[i].xmm.MMX_Q(0) = mant;
  2233 + else
  2234 + env->fpregs[i].d = cpu_set_fp80(mant, exp);
  2235 +#endif
  2236 + break;
  2237 + case 1:
  2238 + mant = qemu_get_be64(f);
  2239 +#ifdef USE_X86LDOUBLE
  2240 + /* difficult case */
  2241 + p = (void *)&env->fpregs[i];
  2242 + if (guess_mmx) {
  2243 + p->mant = mant;
  2244 + p->exp = 0xffff;
  2245 + } else {
  2246 + fp64_to_fp80(p, mant);
  2247 + }
  2248 +#else
  2249 + env->fpregs[i].xmm.MMX_Q(0) = mant;
  2250 +#endif
  2251 + break;
  2252 + default:
  2253 + return -EINVAL;
  2254 + }
2183 2255 }
2184 2256  
2185 2257 env->fpuc = fpuc;
2186 2258 env->fpstt = (fpus >> 11) & 7;
2187 2259 env->fpus = fpus & ~0x3800;
  2260 + fptag ^= 0xff;
2188 2261 for(i = 0; i < 8; i++) {
2189   - env->fptags[i] = ((fptag & 3) == 3);
2190   - fptag >>= 2;
  2262 + env->fptags[i] = (fptag >> i) & 1;
2191 2263 }
2192 2264  
2193 2265 for(i = 0; i < 6; i++)
... ... @@ -2212,12 +2284,13 @@ int cpu_load(QEMUFile *f, void *opaque, int version_id)
2212 2284 /* MMU */
2213 2285 qemu_get_be32s(f, &env->a20_mask);
2214 2286  
2215   -#ifdef TARGET_X86_64
  2287 + qemu_get_be32s(f, &env->mxcsr);
2216 2288 for(i = 0; i < CPU_NB_REGS; i++) {
2217 2289 qemu_get_be64s(f, &env->xmm_regs[i].XMM_Q(0));
2218 2290 qemu_get_be64s(f, &env->xmm_regs[i].XMM_Q(1));
2219 2291 }
2220 2292  
  2293 +#ifdef TARGET_X86_64
2221 2294 qemu_get_be64s(f, &env->efer);
2222 2295 qemu_get_be64s(f, &env->star);
2223 2296 qemu_get_be64s(f, &env->lstar);
... ... @@ -3433,7 +3506,7 @@ int main(int argc, char **argv)
3433 3506 cpu_single_env = env;
3434 3507  
3435 3508 register_savevm("timer", 0, 1, timer_save, timer_load, env);
3436   - register_savevm("cpu", 0, 2, cpu_save, cpu_load, env);
  3509 + register_savevm("cpu", 0, 3, cpu_save, cpu_load, env);
3437 3510 register_savevm("ram", 0, 1, ram_save, ram_load, NULL);
3438 3511 qemu_register_reset(main_cpu_reset, global_env);
3439 3512  
... ...