Commit 3c529d935923a70519557d420db1d5a09a65086a

Authored by aliguori
1 parent 8de24106

Replace posix-aio with custom thread pool

glibc implements posix-aio as a thread pool and imposes a number of limitations.

1) It limits us to one request per file descriptor.  We hack around this by
dup()'ing file descriptors, which is hideously ugly.

2) It's impossible to add new interfaces, and we need a vectored read/write
operation to properly support a zero-copy API.

What the glibc folks have suggested to me is to implement whatever new
interfaces we want, which can then eventually be proposed for standardization.
This requires that we write our own posix-aio implementation, though.

This patch implements posix-aio using pthreads.  It immediately eliminates the
need for fd pooling.

It performs at least as well as the current posix-aio code (in some
circumstances, even better).

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>



git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@5996 c046a42c-6fe2-441c-8c8c-71466251a162
Makefile
@@ -56,6 +56,9 @@ BLOCK_OBJS+=nbd.o block.o aio.o @@ -56,6 +56,9 @@ BLOCK_OBJS+=nbd.o block.o aio.o
56 ifdef CONFIG_WIN32 56 ifdef CONFIG_WIN32
57 BLOCK_OBJS += block-raw-win32.o 57 BLOCK_OBJS += block-raw-win32.o
58 else 58 else
  59 +ifdef CONFIG_AIO
  60 +BLOCK_OBJS += posix-aio-compat.o
  61 +endif
59 BLOCK_OBJS += block-raw-posix.o 62 BLOCK_OBJS += block-raw-posix.o
60 endif 63 endif
61 64
Makefile.target
@@ -564,6 +564,9 @@ endif @@ -564,6 +564,9 @@ endif
564 ifdef CONFIG_WIN32 564 ifdef CONFIG_WIN32
565 OBJS+=block-raw-win32.o 565 OBJS+=block-raw-win32.o
566 else 566 else
  567 +ifdef CONFIG_AIO
  568 +OBJS+=posix-aio-compat.o
  569 +endif
567 OBJS+=block-raw-posix.o 570 OBJS+=block-raw-posix.o
568 endif 571 endif
569 572
block-raw-posix.c
@@ -27,7 +27,7 @@ @@ -27,7 +27,7 @@
27 #include "block_int.h" 27 #include "block_int.h"
28 #include <assert.h> 28 #include <assert.h>
29 #ifdef CONFIG_AIO 29 #ifdef CONFIG_AIO
30 -#include <aio.h> 30 +#include "posix-aio-compat.h"
31 #endif 31 #endif
32 32
33 #ifdef CONFIG_COCOA 33 #ifdef CONFIG_COCOA
@@ -93,16 +93,10 @@ @@ -93,16 +93,10 @@
93 reopen it to see if the disk has been changed */ 93 reopen it to see if the disk has been changed */
94 #define FD_OPEN_TIMEOUT 1000 94 #define FD_OPEN_TIMEOUT 1000
95 95
96 -/* posix-aio doesn't allow multiple outstanding requests to a single file  
97 - * descriptor. we implement a pool of dup()'d file descriptors to work  
98 - * around this */  
99 -#define RAW_FD_POOL_SIZE 64  
100 -  
101 typedef struct BDRVRawState { 96 typedef struct BDRVRawState {
102 int fd; 97 int fd;
103 int type; 98 int type;
104 unsigned int lseek_err_cnt; 99 unsigned int lseek_err_cnt;
105 - int fd_pool[RAW_FD_POOL_SIZE];  
106 #if defined(__linux__) 100 #if defined(__linux__)
107 /* linux floppy specific */ 101 /* linux floppy specific */
108 int fd_open_flags; 102 int fd_open_flags;
@@ -122,7 +116,6 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) @@ -122,7 +116,6 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
122 { 116 {
123 BDRVRawState *s = bs->opaque; 117 BDRVRawState *s = bs->opaque;
124 int fd, open_flags, ret; 118 int fd, open_flags, ret;
125 - int i;  
126 119
127 posix_aio_init(); 120 posix_aio_init();
128 121
@@ -155,8 +148,6 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) @@ -155,8 +148,6 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
155 return ret; 148 return ret;
156 } 149 }
157 s->fd = fd; 150 s->fd = fd;
158 - for (i = 0; i < RAW_FD_POOL_SIZE; i++)  
159 - s->fd_pool[i] = -1;  
160 s->aligned_buf = NULL; 151 s->aligned_buf = NULL;
161 if ((flags & BDRV_O_NOCACHE)) { 152 if ((flags & BDRV_O_NOCACHE)) {
162 s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE); 153 s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE);
@@ -446,8 +437,7 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset, @@ -446,8 +437,7 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset,
446 437
447 typedef struct RawAIOCB { 438 typedef struct RawAIOCB {
448 BlockDriverAIOCB common; 439 BlockDriverAIOCB common;
449 - int fd;  
450 - struct aiocb aiocb; 440 + struct qemu_paiocb aiocb;
451 struct RawAIOCB *next; 441 struct RawAIOCB *next;
452 int ret; 442 int ret;
453 } RawAIOCB; 443 } RawAIOCB;
@@ -458,38 +448,6 @@ typedef struct PosixAioState @@ -458,38 +448,6 @@ typedef struct PosixAioState
458 RawAIOCB *first_aio; 448 RawAIOCB *first_aio;
459 } PosixAioState; 449 } PosixAioState;
460 450
461 -static int raw_fd_pool_get(BDRVRawState *s)  
462 -{  
463 - int i;  
464 -  
465 - for (i = 0; i < RAW_FD_POOL_SIZE; i++) {  
466 - /* already in use */  
467 - if (s->fd_pool[i] != -1)  
468 - continue;  
469 -  
470 - /* try to dup file descriptor */  
471 - s->fd_pool[i] = dup(s->fd);  
472 - if (s->fd_pool[i] != -1)  
473 - return s->fd_pool[i];  
474 - }  
475 -  
476 - /* we couldn't dup the file descriptor so just use the main one */  
477 - return s->fd;  
478 -}  
479 -  
480 -static void raw_fd_pool_put(RawAIOCB *acb)  
481 -{  
482 - BDRVRawState *s = acb->common.bs->opaque;  
483 - int i;  
484 -  
485 - for (i = 0; i < RAW_FD_POOL_SIZE; i++) {  
486 - if (s->fd_pool[i] == acb->fd) {  
487 - close(s->fd_pool[i]);  
488 - s->fd_pool[i] = -1;  
489 - }  
490 - }  
491 -}  
492 -  
493 static void posix_aio_read(void *opaque) 451 static void posix_aio_read(void *opaque)
494 { 452 {
495 PosixAioState *s = opaque; 453 PosixAioState *s = opaque;
@@ -515,16 +473,15 @@ static void posix_aio_read(void *opaque) @@ -515,16 +473,15 @@ static void posix_aio_read(void *opaque)
515 acb = *pacb; 473 acb = *pacb;
516 if (!acb) 474 if (!acb)
517 goto the_end; 475 goto the_end;
518 - ret = aio_error(&acb->aiocb); 476 + ret = qemu_paio_error(&acb->aiocb);
519 if (ret == ECANCELED) { 477 if (ret == ECANCELED) {
520 /* remove the request */ 478 /* remove the request */
521 *pacb = acb->next; 479 *pacb = acb->next;
522 - raw_fd_pool_put(acb);  
523 qemu_aio_release(acb); 480 qemu_aio_release(acb);
524 } else if (ret != EINPROGRESS) { 481 } else if (ret != EINPROGRESS) {
525 /* end of aio */ 482 /* end of aio */
526 if (ret == 0) { 483 if (ret == 0) {
527 - ret = aio_return(&acb->aiocb); 484 + ret = qemu_paio_return(&acb->aiocb);
528 if (ret == acb->aiocb.aio_nbytes) 485 if (ret == acb->aiocb.aio_nbytes)
529 ret = 0; 486 ret = 0;
530 else 487 else
@@ -536,7 +493,6 @@ static void posix_aio_read(void *opaque) @@ -536,7 +493,6 @@ static void posix_aio_read(void *opaque)
536 *pacb = acb->next; 493 *pacb = acb->next;
537 /* call the callback */ 494 /* call the callback */
538 acb->common.cb(acb->common.opaque, ret); 495 acb->common.cb(acb->common.opaque, ret);
539 - raw_fd_pool_put(acb);  
540 qemu_aio_release(acb); 496 qemu_aio_release(acb);
541 break; 497 break;
542 } else { 498 } else {
@@ -571,6 +527,7 @@ static int posix_aio_init(void) @@ -571,6 +527,7 @@ static int posix_aio_init(void)
571 struct sigaction act; 527 struct sigaction act;
572 PosixAioState *s; 528 PosixAioState *s;
573 int fds[2]; 529 int fds[2];
  530 + struct qemu_paioinit ai;
574 531
575 if (posix_aio_state) 532 if (posix_aio_state)
576 return 0; 533 return 0;
@@ -598,24 +555,11 @@ static int posix_aio_init(void) @@ -598,24 +555,11 @@ static int posix_aio_init(void)
598 555
599 qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s); 556 qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s);
600 557
601 -#if defined(__linux__)  
602 - {  
603 - struct aioinit ai; 558 + memset(&ai, 0, sizeof(ai));
  559 + ai.aio_threads = 64;
  560 + ai.aio_num = 64;
  561 + qemu_paio_init(&ai);
604 562
605 - memset(&ai, 0, sizeof(ai));  
606 -#if defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2, 4)  
607 - ai.aio_threads = 64;  
608 - ai.aio_num = 64;  
609 -#else  
610 - /* XXX: aio thread exit seems to hang on RedHat 9 and this init  
611 - seems to fix the problem. */  
612 - ai.aio_threads = 1;  
613 - ai.aio_num = 1;  
614 - ai.aio_idle_time = 365 * 100000;  
615 -#endif  
616 - aio_init(&ai);  
617 - }  
618 -#endif  
619 posix_aio_state = s; 563 posix_aio_state = s;
620 564
621 return 0; 565 return 0;
@@ -634,8 +578,7 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, @@ -634,8 +578,7 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs,
634 acb = qemu_aio_get(bs, cb, opaque); 578 acb = qemu_aio_get(bs, cb, opaque);
635 if (!acb) 579 if (!acb)
636 return NULL; 580 return NULL;
637 - acb->fd = raw_fd_pool_get(s);  
638 - acb->aiocb.aio_fildes = acb->fd; 581 + acb->aiocb.aio_fildes = s->fd;
639 acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2; 582 acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2;
640 acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL; 583 acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
641 acb->aiocb.aio_buf = buf; 584 acb->aiocb.aio_buf = buf;
@@ -680,7 +623,7 @@ static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, @@ -680,7 +623,7 @@ static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs,
680 acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); 623 acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
681 if (!acb) 624 if (!acb)
682 return NULL; 625 return NULL;
683 - if (aio_read(&acb->aiocb) < 0) { 626 + if (qemu_paio_read(&acb->aiocb) < 0) {
684 qemu_aio_release(acb); 627 qemu_aio_release(acb);
685 return NULL; 628 return NULL;
686 } 629 }
@@ -711,7 +654,7 @@ static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs, @@ -711,7 +654,7 @@ static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs,
711 acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque); 654 acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
712 if (!acb) 655 if (!acb)
713 return NULL; 656 return NULL;
714 - if (aio_write(&acb->aiocb) < 0) { 657 + if (qemu_paio_write(&acb->aiocb) < 0) {
715 qemu_aio_release(acb); 658 qemu_aio_release(acb);
716 return NULL; 659 return NULL;
717 } 660 }
@@ -724,11 +667,11 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb) @@ -724,11 +667,11 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
724 RawAIOCB *acb = (RawAIOCB *)blockacb; 667 RawAIOCB *acb = (RawAIOCB *)blockacb;
725 RawAIOCB **pacb; 668 RawAIOCB **pacb;
726 669
727 - ret = aio_cancel(acb->aiocb.aio_fildes, &acb->aiocb);  
728 - if (ret == AIO_NOTCANCELED) { 670 + ret = qemu_paio_cancel(acb->aiocb.aio_fildes, &acb->aiocb);
  671 + if (ret == QEMU_PAIO_NOTCANCELED) {
729 /* fail safe: if the aio could not be canceled, we wait for 672 /* fail safe: if the aio could not be canceled, we wait for
730 it */ 673 it */
731 - while (aio_error(&acb->aiocb) == EINPROGRESS); 674 + while (qemu_paio_error(&acb->aiocb) == EINPROGRESS);
732 } 675 }
733 676
734 /* remove the callback from the queue */ 677 /* remove the callback from the queue */
@@ -738,14 +681,12 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb) @@ -738,14 +681,12 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
738 break; 681 break;
739 } else if (*pacb == acb) { 682 } else if (*pacb == acb) {
740 *pacb = acb->next; 683 *pacb = acb->next;
741 - raw_fd_pool_put(acb);  
742 qemu_aio_release(acb); 684 qemu_aio_release(acb);
743 break; 685 break;
744 } 686 }
745 pacb = &acb->next; 687 pacb = &acb->next;
746 } 688 }
747 } 689 }
748 -  
749 #else /* CONFIG_AIO */ 690 #else /* CONFIG_AIO */
750 static int posix_aio_init(void) 691 static int posix_aio_init(void)
751 { 692 {
@@ -753,17 +694,6 @@ static int posix_aio_init(void) @@ -753,17 +694,6 @@ static int posix_aio_init(void)
753 } 694 }
754 #endif /* CONFIG_AIO */ 695 #endif /* CONFIG_AIO */
755 696
756 -static void raw_close_fd_pool(BDRVRawState *s)  
757 -{  
758 - int i;  
759 -  
760 - for (i = 0; i < RAW_FD_POOL_SIZE; i++) {  
761 - if (s->fd_pool[i] != -1) {  
762 - close(s->fd_pool[i]);  
763 - s->fd_pool[i] = -1;  
764 - }  
765 - }  
766 -}  
767 697
768 static void raw_close(BlockDriverState *bs) 698 static void raw_close(BlockDriverState *bs)
769 { 699 {
@@ -774,7 +704,6 @@ static void raw_close(BlockDriverState *bs) @@ -774,7 +704,6 @@ static void raw_close(BlockDriverState *bs)
774 if (s->aligned_buf != NULL) 704 if (s->aligned_buf != NULL)
775 qemu_free(s->aligned_buf); 705 qemu_free(s->aligned_buf);
776 } 706 }
777 - raw_close_fd_pool(s);  
778 } 707 }
779 708
780 static int raw_truncate(BlockDriverState *bs, int64_t offset) 709 static int raw_truncate(BlockDriverState *bs, int64_t offset)
@@ -895,6 +824,7 @@ BlockDriver bdrv_raw = { @@ -895,6 +824,7 @@ BlockDriver bdrv_raw = {
895 .bdrv_aio_cancel = raw_aio_cancel, 824 .bdrv_aio_cancel = raw_aio_cancel,
896 .aiocb_size = sizeof(RawAIOCB), 825 .aiocb_size = sizeof(RawAIOCB),
897 #endif 826 #endif
  827 +
898 .bdrv_pread = raw_pread, 828 .bdrv_pread = raw_pread,
899 .bdrv_pwrite = raw_pwrite, 829 .bdrv_pwrite = raw_pwrite,
900 .bdrv_truncate = raw_truncate, 830 .bdrv_truncate = raw_truncate,
@@ -965,7 +895,7 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma @@ -965,7 +895,7 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma
965 static int hdev_open(BlockDriverState *bs, const char *filename, int flags) 895 static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
966 { 896 {
967 BDRVRawState *s = bs->opaque; 897 BDRVRawState *s = bs->opaque;
968 - int fd, open_flags, ret, i; 898 + int fd, open_flags, ret;
969 899
970 posix_aio_init(); 900 posix_aio_init();
971 901
@@ -1032,8 +962,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) @@ -1032,8 +962,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
1032 return ret; 962 return ret;
1033 } 963 }
1034 s->fd = fd; 964 s->fd = fd;
1035 - for (i = 0; i < RAW_FD_POOL_SIZE; i++)  
1036 - s->fd_pool[i] = -1;  
1037 #if defined(__linux__) 965 #if defined(__linux__)
1038 /* close fd so that we can reopen it as needed */ 966 /* close fd so that we can reopen it as needed */
1039 if (s->type == FTYPE_FD) { 967 if (s->type == FTYPE_FD) {
@@ -1061,7 +989,6 @@ static int fd_open(BlockDriverState *bs) @@ -1061,7 +989,6 @@ static int fd_open(BlockDriverState *bs)
1061 (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) { 989 (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
1062 close(s->fd); 990 close(s->fd);
1063 s->fd = -1; 991 s->fd = -1;
1064 - raw_close_fd_pool(s);  
1065 #ifdef DEBUG_FLOPPY 992 #ifdef DEBUG_FLOPPY
1066 printf("Floppy closed\n"); 993 printf("Floppy closed\n");
1067 #endif 994 #endif
@@ -1162,7 +1089,6 @@ static int raw_eject(BlockDriverState *bs, int eject_flag) @@ -1162,7 +1089,6 @@ static int raw_eject(BlockDriverState *bs, int eject_flag)
1162 if (s->fd >= 0) { 1089 if (s->fd >= 0) {
1163 close(s->fd); 1090 close(s->fd);
1164 s->fd = -1; 1091 s->fd = -1;
1165 - raw_close_fd_pool(s);  
1166 } 1092 }
1167 fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK); 1093 fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK);
1168 if (fd >= 0) { 1094 if (fd >= 0) {
@@ -1252,6 +1178,7 @@ BlockDriver bdrv_host_device = { @@ -1252,6 +1178,7 @@ BlockDriver bdrv_host_device = {
1252 .bdrv_aio_cancel = raw_aio_cancel, 1178 .bdrv_aio_cancel = raw_aio_cancel,
1253 .aiocb_size = sizeof(RawAIOCB), 1179 .aiocb_size = sizeof(RawAIOCB),
1254 #endif 1180 #endif
  1181 +
1255 .bdrv_pread = raw_pread, 1182 .bdrv_pread = raw_pread,
1256 .bdrv_pwrite = raw_pwrite, 1183 .bdrv_pwrite = raw_pwrite,
1257 .bdrv_getlength = raw_getlength, 1184 .bdrv_getlength = raw_getlength,
configure
@@ -149,7 +149,6 @@ FreeBSD) @@ -149,7 +149,6 @@ FreeBSD)
149 bsd="yes" 149 bsd="yes"
150 audio_drv_list="oss" 150 audio_drv_list="oss"
151 audio_possible_drivers="oss sdl esd pa" 151 audio_possible_drivers="oss sdl esd pa"
152 -aio_lib="-lpthread"  
153 if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then 152 if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
154 kqemu="yes" 153 kqemu="yes"
155 fi 154 fi
@@ -159,7 +158,6 @@ bsd="yes" @@ -159,7 +158,6 @@ bsd="yes"
159 audio_drv_list="oss" 158 audio_drv_list="oss"
160 audio_possible_drivers="oss sdl esd" 159 audio_possible_drivers="oss sdl esd"
161 oss_lib="-lossaudio" 160 oss_lib="-lossaudio"
162 -aio_lib="-lrt -lpthread"  
163 ;; 161 ;;
164 OpenBSD) 162 OpenBSD)
165 bsd="yes" 163 bsd="yes"
@@ -167,7 +165,6 @@ openbsd="yes" @@ -167,7 +165,6 @@ openbsd="yes"
167 audio_drv_list="oss" 165 audio_drv_list="oss"
168 audio_possible_drivers="oss sdl esd" 166 audio_possible_drivers="oss sdl esd"
169 oss_lib="-lossaudio" 167 oss_lib="-lossaudio"
170 -aio_lib="-lpthread"  
171 ;; 168 ;;
172 Darwin) 169 Darwin)
173 bsd="yes" 170 bsd="yes"
@@ -178,7 +175,6 @@ audio_drv_list="coreaudio" @@ -178,7 +175,6 @@ audio_drv_list="coreaudio"
178 audio_possible_drivers="coreaudio sdl fmod" 175 audio_possible_drivers="coreaudio sdl fmod"
179 OS_CFLAGS="-mdynamic-no-pic" 176 OS_CFLAGS="-mdynamic-no-pic"
180 OS_LDFLAGS="-framework CoreFoundation -framework IOKit" 177 OS_LDFLAGS="-framework CoreFoundation -framework IOKit"
181 -aio_lib="-lpthread"  
182 ;; 178 ;;
183 SunOS) 179 SunOS)
184 solaris="yes" 180 solaris="yes"
@@ -527,15 +523,6 @@ if test "$mingw32" = "yes" ; then @@ -527,15 +523,6 @@ if test "$mingw32" = "yes" ; then
527 bsd_user="no" 523 bsd_user="no"
528 fi 524 fi
529 525
530 -if [ "$darwin" = "yes" -o "$mingw32" = "yes" ] ; then  
531 - AIOLIBS=  
532 -elif [ "$bsd" = "yes" ]; then  
533 - AIOLIBS="$aio_lib"  
534 -else  
535 - # Some Linux architectures (e.g. s390) don't imply -lpthread automatically.  
536 - AIOLIBS="-lrt -lpthread"  
537 -fi  
538 -  
539 if test ! -x "$(which cgcc 2>/dev/null)"; then 526 if test ! -x "$(which cgcc 2>/dev/null)"; then
540 sparse="no" 527 sparse="no"
541 fi 528 fi
@@ -954,14 +941,17 @@ fi @@ -954,14 +941,17 @@ fi
954 941
955 ########################################## 942 ##########################################
956 # AIO probe 943 # AIO probe
  944 +AIOLIBS=""
  945 +
957 if test "$aio" = "yes" ; then 946 if test "$aio" = "yes" ; then
958 aio=no 947 aio=no
959 cat > $TMPC << EOF 948 cat > $TMPC << EOF
960 -#include <aio.h>  
961 -int main(void) { return aio_write(NULL); } 949 +#include <pthread.h>
  950 +int main(void) { pthread_mutex_t lock; return 0; }
962 EOF 951 EOF
963 if $cc $ARCH_CFLAGS -o $TMPE $AIOLIBS $TMPC 2> /dev/null ; then 952 if $cc $ARCH_CFLAGS -o $TMPE $AIOLIBS $TMPC 2> /dev/null ; then
964 aio=yes 953 aio=yes
  954 + AIOLIBS="-lpthread"
965 fi 955 fi
966 fi 956 fi
967 957
posix-aio-compat.c 0 → 100644
  1 +/*
  2 + * QEMU posix-aio emulation
  3 + *
  4 + * Copyright IBM, Corp. 2008
  5 + *
  6 + * Authors:
  7 + * Anthony Liguori <aliguori@us.ibm.com>
  8 + *
  9 + * This work is licensed under the terms of the GNU GPL, version 2. See
  10 + * the COPYING file in the top-level directory.
  11 + *
  12 + */
  13 +
  14 +#include <pthread.h>
  15 +#include <unistd.h>
  16 +#include <errno.h>
  17 +#include <sys/time.h>
  18 +#include "osdep.h"
  19 +
  20 +#include "posix-aio-compat.h"
  21 +
  22 +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  23 +static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
  24 +static pthread_t thread_id;
  25 +static int max_threads = 64;
  26 +static int cur_threads = 0;
  27 +static int idle_threads = 0;
  28 +static TAILQ_HEAD(, qemu_paiocb) request_list;
  29 +
  30 +static void *aio_thread(void *unused)
  31 +{
  32 + sigset_t set;
  33 +
  34 + /* block all signals */
  35 + sigfillset(&set);
  36 + sigprocmask(SIG_BLOCK, &set, NULL);
  37 +
  38 + while (1) {
  39 + struct qemu_paiocb *aiocb;
  40 + size_t offset;
  41 + int ret = 0;
  42 +
  43 + pthread_mutex_lock(&lock);
  44 +
  45 + while (TAILQ_EMPTY(&request_list) &&
  46 + !(ret == ETIMEDOUT)) {
  47 + struct timespec ts = { 0 };
  48 + qemu_timeval tv;
  49 +
  50 + qemu_gettimeofday(&tv);
  51 + ts.tv_sec = tv.tv_sec + 10;
  52 + ret = pthread_cond_timedwait(&cond, &lock, &ts);
  53 + }
  54 +
  55 + if (ret == ETIMEDOUT)
  56 + break;
  57 +
  58 + aiocb = TAILQ_FIRST(&request_list);
  59 + TAILQ_REMOVE(&request_list, aiocb, node);
  60 +
  61 + offset = 0;
  62 + aiocb->active = 1;
  63 +
  64 + idle_threads--;
  65 + pthread_mutex_unlock(&lock);
  66 +
  67 + while (offset < aiocb->aio_nbytes) {
  68 + ssize_t len;
  69 +
  70 + if (aiocb->is_write)
  71 + len = pwrite(aiocb->aio_fildes,
  72 + (const char *)aiocb->aio_buf + offset,
  73 + aiocb->aio_nbytes - offset,
  74 + aiocb->aio_offset + offset);
  75 + else
  76 + len = pread(aiocb->aio_fildes,
  77 + (char *)aiocb->aio_buf + offset,
  78 + aiocb->aio_nbytes - offset,
  79 + aiocb->aio_offset + offset);
  80 +
  81 + if (len == -1 && errno == EINTR)
  82 + continue;
  83 + else if (len == -1) {
  84 + pthread_mutex_lock(&lock);
  85 + aiocb->ret = -errno;
  86 + pthread_mutex_unlock(&lock);
  87 + break;
  88 + } else if (len == 0)
  89 + break;
  90 +
  91 + offset += len;
  92 +
  93 + pthread_mutex_lock(&lock);
  94 + aiocb->ret = offset;
  95 + pthread_mutex_unlock(&lock);
  96 + }
  97 +
  98 + pthread_mutex_lock(&lock);
  99 + idle_threads++;
  100 + pthread_mutex_unlock(&lock);
  101 +
  102 + sigqueue(getpid(),
  103 + aiocb->aio_sigevent.sigev_signo,
  104 + aiocb->aio_sigevent.sigev_value);
  105 + }
  106 +
  107 + idle_threads--;
  108 + cur_threads--;
  109 + pthread_mutex_unlock(&lock);
  110 +
  111 + return NULL;
  112 +}
  113 +
  114 +static int spawn_thread(void)
  115 +{
  116 + pthread_attr_t attr;
  117 + int ret;
  118 +
  119 + cur_threads++;
  120 + idle_threads++;
  121 +
  122 + pthread_attr_init(&attr);
  123 + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
  124 + ret = pthread_create(&thread_id, &attr, aio_thread, NULL);
  125 + pthread_attr_destroy(&attr);
  126 +
  127 + return ret;
  128 +}
  129 +
  130 +int qemu_paio_init(struct qemu_paioinit *aioinit)
  131 +{
  132 + TAILQ_INIT(&request_list);
  133 +
  134 + return 0;
  135 +}
  136 +
  137 +static int qemu_paio_submit(struct qemu_paiocb *aiocb, int is_write)
  138 +{
  139 + aiocb->is_write = is_write;
  140 + aiocb->ret = -EINPROGRESS;
  141 + aiocb->active = 0;
  142 + pthread_mutex_lock(&lock);
  143 + if (idle_threads == 0 && cur_threads < max_threads)
  144 + spawn_thread();
  145 + TAILQ_INSERT_TAIL(&request_list, aiocb, node);
  146 + pthread_mutex_unlock(&lock);
  147 + pthread_cond_broadcast(&cond);
  148 +
  149 + return 0;
  150 +}
  151 +
  152 +int qemu_paio_read(struct qemu_paiocb *aiocb)
  153 +{
  154 + return qemu_paio_submit(aiocb, 0);
  155 +}
  156 +
  157 +int qemu_paio_write(struct qemu_paiocb *aiocb)
  158 +{
  159 + return qemu_paio_submit(aiocb, 1);
  160 +}
  161 +
  162 +ssize_t qemu_paio_return(struct qemu_paiocb *aiocb)
  163 +{
  164 + ssize_t ret;
  165 +
  166 + pthread_mutex_lock(&lock);
  167 + ret = aiocb->ret;
  168 + pthread_mutex_unlock(&lock);
  169 +
  170 + return ret;
  171 +}
  172 +
  173 +int qemu_paio_error(struct qemu_paiocb *aiocb)
  174 +{
  175 + ssize_t ret = qemu_paio_return(aiocb);
  176 +
  177 + if (ret < 0)
  178 + ret = -ret;
  179 + else
  180 + ret = 0;
  181 +
  182 + return ret;
  183 +}
  184 +
  185 +int qemu_paio_cancel(int fd, struct qemu_paiocb *aiocb)
  186 +{
  187 + int ret;
  188 +
  189 + pthread_mutex_lock(&lock);
  190 + if (!aiocb->active) {
  191 + TAILQ_REMOVE(&request_list, aiocb, node);
  192 + aiocb->ret = -ECANCELED;
  193 + ret = QEMU_PAIO_CANCELED;
  194 + } else if (aiocb->ret == -EINPROGRESS)
  195 + ret = QEMU_PAIO_NOTCANCELED;
  196 + else
  197 + ret = QEMU_PAIO_ALLDONE;
  198 + pthread_mutex_unlock(&lock);
  199 +
  200 + return ret;
  201 +}
  202 +
posix-aio-compat.h 0 → 100644
  1 +/*
  2 + * QEMU posix-aio emulation
  3 + *
  4 + * Copyright IBM, Corp. 2008
  5 + *
  6 + * Authors:
  7 + * Anthony Liguori <aliguori@us.ibm.com>
  8 + *
  9 + * This work is licensed under the terms of the GNU GPL, version 2. See
  10 + * the COPYING file in the top-level directory.
  11 + *
  12 + */
  13 +
  14 +#ifndef QEMU_POSIX_AIO_COMPAT_H
  15 +#define QEMU_POSIX_AIO_COMPAT_H
  16 +
  17 +#include <sys/types.h>
  18 +#include <unistd.h>
  19 +#include <signal.h>
  20 +
  21 +#include "sys-queue.h"
  22 +
  23 +#define QEMU_PAIO_CANCELED 0x01
  24 +#define QEMU_PAIO_NOTCANCELED 0x02
  25 +#define QEMU_PAIO_ALLDONE 0x03
  26 +
  27 +struct qemu_paiocb
  28 +{
  29 + int aio_fildes;
  30 + void *aio_buf;
  31 + size_t aio_nbytes;
  32 + struct sigevent aio_sigevent;
  33 + off_t aio_offset;
  34 +
  35 + /* private */
  36 + TAILQ_ENTRY(qemu_paiocb) node;
  37 + int is_write;
  38 + ssize_t ret;
  39 + int active;
  40 +};
  41 +
  42 +struct qemu_paioinit
  43 +{
  44 + unsigned int aio_threads;
  45 + unsigned int aio_num;
  46 + unsigned int aio_idle_time;
  47 +};
  48 +
  49 +int qemu_paio_init(struct qemu_paioinit *aioinit);
  50 +int qemu_paio_read(struct qemu_paiocb *aiocb);
  51 +int qemu_paio_write(struct qemu_paiocb *aiocb);
  52 +int qemu_paio_error(struct qemu_paiocb *aiocb);
  53 +ssize_t qemu_paio_return(struct qemu_paiocb *aiocb);
  54 +int qemu_paio_cancel(int fd, struct qemu_paiocb *aiocb);
  55 +
  56 +#endif