Commit 5353872545861d8d21bf9fcc64a25cbfc8cd2eac

Authored by aliguori
1 parent 997306fc

Implement an fd pool to get real AIO with posix-aio

This patch implements a simple fd pool to allow many AIO requests with
posix-aio.  The result is significantly improved performance (identical to that
reported for linux-aio) for both cache=on and cache=off.

The fundamental problem with posix-aio is that it limits itself to one thread
per file descriptor.  I don't know why this is, but this patch provides a simple
mechanism to work around this (duplicating the file descriptor).

This isn't a great solution, but it seems like a reasonable intermediate step
between posix-aio and a custom thread-pool to replace it.

Ryan Harper will be posting some performance analysis he did comparing posix-aio
with fd pooling against linux-aio.  The size of the posix-aio thread pool and
the fd pool were largely determined by him based on this analysis.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>



git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@5323 c046a42c-6fe2-441c-8c8c-71466251a162
Showing 1 changed file with 65 additions and 3 deletions
block-raw-posix.c
@@ -84,10 +84,16 @@ @@ -84,10 +84,16 @@
84 reopen it to see if the disk has been changed */ 84 reopen it to see if the disk has been changed */
85 #define FD_OPEN_TIMEOUT 1000 85 #define FD_OPEN_TIMEOUT 1000
86 86
  87 +/* posix-aio doesn't allow multiple outstanding requests to a single file
  88 + * descriptor. We implement a pool of dup()'d file descriptors to work
  89 + * around this. */
  90 +#define RAW_FD_POOL_SIZE 64
  91 +
87 typedef struct BDRVRawState { 92 typedef struct BDRVRawState {
88 int fd; 93 int fd;
89 int type; 94 int type;
90 unsigned int lseek_err_cnt; 95 unsigned int lseek_err_cnt;
  96 + int fd_pool[RAW_FD_POOL_SIZE];
91 #if defined(__linux__) 97 #if defined(__linux__)
92 /* linux floppy specific */ 98 /* linux floppy specific */
93 int fd_open_flags; 99 int fd_open_flags;
@@ -109,6 +115,7 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) @@ -109,6 +115,7 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
109 { 115 {
110 BDRVRawState *s = bs->opaque; 116 BDRVRawState *s = bs->opaque;
111 int fd, open_flags, ret; 117 int fd, open_flags, ret;
  118 + int i;
112 119
113 posix_aio_init(); 120 posix_aio_init();
114 121
@@ -138,6 +145,8 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) @@ -138,6 +145,8 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
138 return ret; 145 return ret;
139 } 146 }
140 s->fd = fd; 147 s->fd = fd;
  148 + for (i = 0; i < RAW_FD_POOL_SIZE; i++)
  149 + s->fd_pool[i] = -1;
141 #if defined(O_DIRECT) 150 #if defined(O_DIRECT)
142 s->aligned_buf = NULL; 151 s->aligned_buf = NULL;
143 if (flags & BDRV_O_DIRECT) { 152 if (flags & BDRV_O_DIRECT) {
@@ -436,6 +445,7 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset, @@ -436,6 +445,7 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset,
436 445
437 typedef struct RawAIOCB { 446 typedef struct RawAIOCB {
438 BlockDriverAIOCB common; 447 BlockDriverAIOCB common;
  448 + int fd;
439 struct aiocb aiocb; 449 struct aiocb aiocb;
440 struct RawAIOCB *next; 450 struct RawAIOCB *next;
441 int ret; 451 int ret;
@@ -447,6 +457,38 @@ typedef struct PosixAioState @@ -447,6 +457,38 @@ typedef struct PosixAioState
447 RawAIOCB *first_aio; 457 RawAIOCB *first_aio;
448 } PosixAioState; 458 } PosixAioState;
449 459
  460 +static int raw_fd_pool_get(BDRVRawState *s)
  461 +{
  462 + int i;
  463 +
  464 + for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
  465 + /* already in use */
  466 + if (s->fd_pool[i] != -1)
  467 + continue;
  468 +
  469 + /* try to dup file descriptor */
  470 + s->fd_pool[i] = dup(s->fd);
  471 + if (s->fd_pool[i] != -1)
  472 + return s->fd_pool[i];
  473 + }
  474 +
  475 + /* we couldn't dup the file descriptor so just use the main one */
  476 + return s->fd;
  477 +}
  478 +
  479 +static void raw_fd_pool_put(RawAIOCB *acb)
  480 +{
  481 + BDRVRawState *s = acb->common.bs->opaque;
  482 + int i;
  483 +
  484 + for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
  485 + if (s->fd_pool[i] == acb->fd) {
  486 + close(s->fd_pool[i]);
  487 + s->fd_pool[i] = -1;
  488 + }
  489 + }
  490 +}
  491 +
450 static void posix_aio_read(void *opaque) 492 static void posix_aio_read(void *opaque)
451 { 493 {
452 PosixAioState *s = opaque; 494 PosixAioState *s = opaque;
@@ -487,6 +529,7 @@ static void posix_aio_read(void *opaque) @@ -487,6 +529,7 @@ static void posix_aio_read(void *opaque)
487 if (ret == ECANCELED) { 529 if (ret == ECANCELED) {
488 /* remove the request */ 530 /* remove the request */
489 *pacb = acb->next; 531 *pacb = acb->next;
  532 + raw_fd_pool_put(acb);
490 qemu_aio_release(acb); 533 qemu_aio_release(acb);
491 } else if (ret != EINPROGRESS) { 534 } else if (ret != EINPROGRESS) {
492 /* end of aio */ 535 /* end of aio */
@@ -503,6 +546,7 @@ static void posix_aio_read(void *opaque) @@ -503,6 +546,7 @@ static void posix_aio_read(void *opaque)
503 *pacb = acb->next; 546 *pacb = acb->next;
504 /* call the callback */ 547 /* call the callback */
505 acb->common.cb(acb->common.opaque, ret); 548 acb->common.cb(acb->common.opaque, ret);
  549 + raw_fd_pool_put(acb);
506 qemu_aio_release(acb); 550 qemu_aio_release(acb);
507 break; 551 break;
508 } else { 552 } else {
@@ -577,7 +621,8 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, @@ -577,7 +621,8 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs,
577 acb = qemu_aio_get(bs, cb, opaque); 621 acb = qemu_aio_get(bs, cb, opaque);
578 if (!acb) 622 if (!acb)
579 return NULL; 623 return NULL;
580 - acb->aiocb.aio_fildes = s->fd; 624 + acb->fd = raw_fd_pool_get(s);
  625 + acb->aiocb.aio_fildes = acb->fd;
581 acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2; 626 acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2;
582 acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL; 627 acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
583 acb->aiocb.aio_buf = buf; 628 acb->aiocb.aio_buf = buf;
@@ -684,6 +729,7 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb) @@ -684,6 +729,7 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
684 break; 729 break;
685 } else if (*pacb == acb) { 730 } else if (*pacb == acb) {
686 *pacb = acb->next; 731 *pacb = acb->next;
  732 + raw_fd_pool_put(acb);
687 qemu_aio_release(acb); 733 qemu_aio_release(acb);
688 break; 734 break;
689 } 735 }
@@ -697,6 +743,18 @@ static int posix_aio_init(void) @@ -697,6 +743,18 @@ static int posix_aio_init(void)
697 } 743 }
698 #endif /* CONFIG_AIO */ 744 #endif /* CONFIG_AIO */
699 745
  746 +static void raw_close_fd_pool(BDRVRawState *s)
  747 +{
  748 + int i;
  749 +
  750 + for (i = 0; i < RAW_FD_POOL_SIZE; i++) {
  751 + if (s->fd_pool[i] != -1) {
  752 + close(s->fd_pool[i]);
  753 + s->fd_pool[i] = -1;
  754 + }
  755 + }
  756 +}
  757 +
700 static void raw_close(BlockDriverState *bs) 758 static void raw_close(BlockDriverState *bs)
701 { 759 {
702 BDRVRawState *s = bs->opaque; 760 BDRVRawState *s = bs->opaque;
@@ -708,6 +766,7 @@ static void raw_close(BlockDriverState *bs) @@ -708,6 +766,7 @@ static void raw_close(BlockDriverState *bs)
708 qemu_free(s->aligned_buf); 766 qemu_free(s->aligned_buf);
709 #endif 767 #endif
710 } 768 }
  769 + raw_close_fd_pool(s);
711 } 770 }
712 771
713 static int raw_truncate(BlockDriverState *bs, int64_t offset) 772 static int raw_truncate(BlockDriverState *bs, int64_t offset)
@@ -898,7 +957,7 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma @@ -898,7 +957,7 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma
898 static int hdev_open(BlockDriverState *bs, const char *filename, int flags) 957 static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
899 { 958 {
900 BDRVRawState *s = bs->opaque; 959 BDRVRawState *s = bs->opaque;
901 - int fd, open_flags, ret; 960 + int fd, open_flags, ret, i;
902 961
903 posix_aio_init(); 962 posix_aio_init();
904 963
@@ -963,6 +1022,8 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) @@ -963,6 +1022,8 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
963 return ret; 1022 return ret;
964 } 1023 }
965 s->fd = fd; 1024 s->fd = fd;
  1025 + for (i = 0; i < RAW_FD_POOL_SIZE; i++)
  1026 + s->fd_pool[i] = -1;
966 #if defined(__linux__) 1027 #if defined(__linux__)
967 /* close fd so that we can reopen it as needed */ 1028 /* close fd so that we can reopen it as needed */
968 if (s->type == FTYPE_FD) { 1029 if (s->type == FTYPE_FD) {
@@ -975,7 +1036,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) @@ -975,7 +1036,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
975 } 1036 }
976 1037
977 #if defined(__linux__) 1038 #if defined(__linux__)
978 -  
979 /* Note: we do not have a reliable method to detect if the floppy is 1039 /* Note: we do not have a reliable method to detect if the floppy is
980 present. The current method is to try to open the floppy at every 1040 present. The current method is to try to open the floppy at every
981 I/O and to keep it opened during a few hundreds of ms. */ 1041 I/O and to keep it opened during a few hundreds of ms. */
@@ -991,6 +1051,7 @@ static int fd_open(BlockDriverState *bs) @@ -991,6 +1051,7 @@ static int fd_open(BlockDriverState *bs)
991 (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) { 1051 (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
992 close(s->fd); 1052 close(s->fd);
993 s->fd = -1; 1053 s->fd = -1;
  1054 + raw_close_fd_pool(s);
994 #ifdef DEBUG_FLOPPY 1055 #ifdef DEBUG_FLOPPY
995 printf("Floppy closed\n"); 1056 printf("Floppy closed\n");
996 #endif 1057 #endif
@@ -1091,6 +1152,7 @@ static int raw_eject(BlockDriverState *bs, int eject_flag) @@ -1091,6 +1152,7 @@ static int raw_eject(BlockDriverState *bs, int eject_flag)
1091 if (s->fd >= 0) { 1152 if (s->fd >= 0) {
1092 close(s->fd); 1153 close(s->fd);
1093 s->fd = -1; 1154 s->fd = -1;
  1155 + raw_close_fd_pool(s);
1094 } 1156 }
1095 fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK); 1157 fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK);
1096 if (fd >= 0) { 1158 if (fd >= 0) {