Commit 5353872545861d8d21bf9fcc64a25cbfc8cd2eac
1 parent
997306fc
Implement an fd pool to get real AIO with posix-aio
This patch implements a simple fd pool to allow many AIO requests with posix-aio. The result is significantly improved performance (identical to that reported for linux-aio) for both cache=on and cache=off. The fundamental problem with posix-aio is that it limits itself to one thread per file descriptor. I don't know why this is, but this patch provides a simple mechanism to work around this (duplicating the file descriptor). This isn't a great solution, but it seems like a reasonable intermediate step between posix-aio and a custom thread-pool to replace it. Ryan Harper will be posting some performance analysis he did comparing posix-aio with fd pooling against linux-aio. The sizes of the posix-aio thread pool and the fd pool were largely determined by him based on this analysis. Signed-off-by: Anthony Liguori <aliguori@us.ibm.com> git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@5323 c046a42c-6fe2-441c-8c8c-71466251a162
Showing
1 changed file
with
65 additions
and
3 deletions
block-raw-posix.c
| @@ -84,10 +84,16 @@ | @@ -84,10 +84,16 @@ | ||
| 84 | reopen it to see if the disk has been changed */ | 84 | reopen it to see if the disk has been changed */ |
| 85 | #define FD_OPEN_TIMEOUT 1000 | 85 | #define FD_OPEN_TIMEOUT 1000 |
| 86 | 86 | ||
| 87 | +/* posix-aio doesn't allow multiple outstanding requests to a single file | ||
| 88 | + * descriptor. we implement a pool of dup()'d file descriptors to work | ||
| 89 | + * around this */ | ||
| 90 | +#define RAW_FD_POOL_SIZE 64 | ||
| 91 | + | ||
| 87 | typedef struct BDRVRawState { | 92 | typedef struct BDRVRawState { |
| 88 | int fd; | 93 | int fd; |
| 89 | int type; | 94 | int type; |
| 90 | unsigned int lseek_err_cnt; | 95 | unsigned int lseek_err_cnt; |
| 96 | + int fd_pool[RAW_FD_POOL_SIZE]; | ||
| 91 | #if defined(__linux__) | 97 | #if defined(__linux__) |
| 92 | /* linux floppy specific */ | 98 | /* linux floppy specific */ |
| 93 | int fd_open_flags; | 99 | int fd_open_flags; |
| @@ -109,6 +115,7 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) | @@ -109,6 +115,7 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) | ||
| 109 | { | 115 | { |
| 110 | BDRVRawState *s = bs->opaque; | 116 | BDRVRawState *s = bs->opaque; |
| 111 | int fd, open_flags, ret; | 117 | int fd, open_flags, ret; |
| 118 | + int i; | ||
| 112 | 119 | ||
| 113 | posix_aio_init(); | 120 | posix_aio_init(); |
| 114 | 121 | ||
| @@ -138,6 +145,8 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) | @@ -138,6 +145,8 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) | ||
| 138 | return ret; | 145 | return ret; |
| 139 | } | 146 | } |
| 140 | s->fd = fd; | 147 | s->fd = fd; |
| 148 | + for (i = 0; i < RAW_FD_POOL_SIZE; i++) | ||
| 149 | + s->fd_pool[i] = -1; | ||
| 141 | #if defined(O_DIRECT) | 150 | #if defined(O_DIRECT) |
| 142 | s->aligned_buf = NULL; | 151 | s->aligned_buf = NULL; |
| 143 | if (flags & BDRV_O_DIRECT) { | 152 | if (flags & BDRV_O_DIRECT) { |
| @@ -436,6 +445,7 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset, | @@ -436,6 +445,7 @@ static int raw_pwrite(BlockDriverState *bs, int64_t offset, | ||
| 436 | 445 | ||
| 437 | typedef struct RawAIOCB { | 446 | typedef struct RawAIOCB { |
| 438 | BlockDriverAIOCB common; | 447 | BlockDriverAIOCB common; |
| 448 | + int fd; | ||
| 439 | struct aiocb aiocb; | 449 | struct aiocb aiocb; |
| 440 | struct RawAIOCB *next; | 450 | struct RawAIOCB *next; |
| 441 | int ret; | 451 | int ret; |
| @@ -447,6 +457,38 @@ typedef struct PosixAioState | @@ -447,6 +457,38 @@ typedef struct PosixAioState | ||
| 447 | RawAIOCB *first_aio; | 457 | RawAIOCB *first_aio; |
| 448 | } PosixAioState; | 458 | } PosixAioState; |
| 449 | 459 | ||
| 460 | +static int raw_fd_pool_get(BDRVRawState *s) | ||
| 461 | +{ | ||
| 462 | + int i; | ||
| 463 | + | ||
| 464 | + for (i = 0; i < RAW_FD_POOL_SIZE; i++) { | ||
| 465 | + /* already in use */ | ||
| 466 | + if (s->fd_pool[i] != -1) | ||
| 467 | + continue; | ||
| 468 | + | ||
| 469 | + /* try to dup file descriptor */ | ||
| 470 | + s->fd_pool[i] = dup(s->fd); | ||
| 471 | + if (s->fd_pool[i] != -1) | ||
| 472 | + return s->fd_pool[i]; | ||
| 473 | + } | ||
| 474 | + | ||
| 475 | + /* we couldn't dup the file descriptor so just use the main one */ | ||
| 476 | + return s->fd; | ||
| 477 | +} | ||
| 478 | + | ||
| 479 | +static void raw_fd_pool_put(RawAIOCB *acb) | ||
| 480 | +{ | ||
| 481 | + BDRVRawState *s = acb->common.bs->opaque; | ||
| 482 | + int i; | ||
| 483 | + | ||
| 484 | + for (i = 0; i < RAW_FD_POOL_SIZE; i++) { | ||
| 485 | + if (s->fd_pool[i] == acb->fd) { | ||
| 486 | + close(s->fd_pool[i]); | ||
| 487 | + s->fd_pool[i] = -1; | ||
| 488 | + } | ||
| 489 | + } | ||
| 490 | +} | ||
| 491 | + | ||
| 450 | static void posix_aio_read(void *opaque) | 492 | static void posix_aio_read(void *opaque) |
| 451 | { | 493 | { |
| 452 | PosixAioState *s = opaque; | 494 | PosixAioState *s = opaque; |
| @@ -487,6 +529,7 @@ static void posix_aio_read(void *opaque) | @@ -487,6 +529,7 @@ static void posix_aio_read(void *opaque) | ||
| 487 | if (ret == ECANCELED) { | 529 | if (ret == ECANCELED) { |
| 488 | /* remove the request */ | 530 | /* remove the request */ |
| 489 | *pacb = acb->next; | 531 | *pacb = acb->next; |
| 532 | + raw_fd_pool_put(acb); | ||
| 490 | qemu_aio_release(acb); | 533 | qemu_aio_release(acb); |
| 491 | } else if (ret != EINPROGRESS) { | 534 | } else if (ret != EINPROGRESS) { |
| 492 | /* end of aio */ | 535 | /* end of aio */ |
| @@ -503,6 +546,7 @@ static void posix_aio_read(void *opaque) | @@ -503,6 +546,7 @@ static void posix_aio_read(void *opaque) | ||
| 503 | *pacb = acb->next; | 546 | *pacb = acb->next; |
| 504 | /* call the callback */ | 547 | /* call the callback */ |
| 505 | acb->common.cb(acb->common.opaque, ret); | 548 | acb->common.cb(acb->common.opaque, ret); |
| 549 | + raw_fd_pool_put(acb); | ||
| 506 | qemu_aio_release(acb); | 550 | qemu_aio_release(acb); |
| 507 | break; | 551 | break; |
| 508 | } else { | 552 | } else { |
| @@ -577,7 +621,8 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, | @@ -577,7 +621,8 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, | ||
| 577 | acb = qemu_aio_get(bs, cb, opaque); | 621 | acb = qemu_aio_get(bs, cb, opaque); |
| 578 | if (!acb) | 622 | if (!acb) |
| 579 | return NULL; | 623 | return NULL; |
| 580 | - acb->aiocb.aio_fildes = s->fd; | 624 | + acb->fd = raw_fd_pool_get(s); |
| 625 | + acb->aiocb.aio_fildes = acb->fd; | ||
| 581 | acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2; | 626 | acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2; |
| 582 | acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL; | 627 | acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL; |
| 583 | acb->aiocb.aio_buf = buf; | 628 | acb->aiocb.aio_buf = buf; |
| @@ -684,6 +729,7 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb) | @@ -684,6 +729,7 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb) | ||
| 684 | break; | 729 | break; |
| 685 | } else if (*pacb == acb) { | 730 | } else if (*pacb == acb) { |
| 686 | *pacb = acb->next; | 731 | *pacb = acb->next; |
| 732 | + raw_fd_pool_put(acb); | ||
| 687 | qemu_aio_release(acb); | 733 | qemu_aio_release(acb); |
| 688 | break; | 734 | break; |
| 689 | } | 735 | } |
| @@ -697,6 +743,18 @@ static int posix_aio_init(void) | @@ -697,6 +743,18 @@ static int posix_aio_init(void) | ||
| 697 | } | 743 | } |
| 698 | #endif /* CONFIG_AIO */ | 744 | #endif /* CONFIG_AIO */ |
| 699 | 745 | ||
| 746 | +static void raw_close_fd_pool(BDRVRawState *s) | ||
| 747 | +{ | ||
| 748 | + int i; | ||
| 749 | + | ||
| 750 | + for (i = 0; i < RAW_FD_POOL_SIZE; i++) { | ||
| 751 | + if (s->fd_pool[i] != -1) { | ||
| 752 | + close(s->fd_pool[i]); | ||
| 753 | + s->fd_pool[i] = -1; | ||
| 754 | + } | ||
| 755 | + } | ||
| 756 | +} | ||
| 757 | + | ||
| 700 | static void raw_close(BlockDriverState *bs) | 758 | static void raw_close(BlockDriverState *bs) |
| 701 | { | 759 | { |
| 702 | BDRVRawState *s = bs->opaque; | 760 | BDRVRawState *s = bs->opaque; |
| @@ -708,6 +766,7 @@ static void raw_close(BlockDriverState *bs) | @@ -708,6 +766,7 @@ static void raw_close(BlockDriverState *bs) | ||
| 708 | qemu_free(s->aligned_buf); | 766 | qemu_free(s->aligned_buf); |
| 709 | #endif | 767 | #endif |
| 710 | } | 768 | } |
| 769 | + raw_close_fd_pool(s); | ||
| 711 | } | 770 | } |
| 712 | 771 | ||
| 713 | static int raw_truncate(BlockDriverState *bs, int64_t offset) | 772 | static int raw_truncate(BlockDriverState *bs, int64_t offset) |
| @@ -898,7 +957,7 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma | @@ -898,7 +957,7 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma | ||
| 898 | static int hdev_open(BlockDriverState *bs, const char *filename, int flags) | 957 | static int hdev_open(BlockDriverState *bs, const char *filename, int flags) |
| 899 | { | 958 | { |
| 900 | BDRVRawState *s = bs->opaque; | 959 | BDRVRawState *s = bs->opaque; |
| 901 | - int fd, open_flags, ret; | 960 | + int fd, open_flags, ret, i; |
| 902 | 961 | ||
| 903 | posix_aio_init(); | 962 | posix_aio_init(); |
| 904 | 963 | ||
| @@ -963,6 +1022,8 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) | @@ -963,6 +1022,8 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) | ||
| 963 | return ret; | 1022 | return ret; |
| 964 | } | 1023 | } |
| 965 | s->fd = fd; | 1024 | s->fd = fd; |
| 1025 | + for (i = 0; i < RAW_FD_POOL_SIZE; i++) | ||
| 1026 | + s->fd_pool[i] = -1; | ||
| 966 | #if defined(__linux__) | 1027 | #if defined(__linux__) |
| 967 | /* close fd so that we can reopen it as needed */ | 1028 | /* close fd so that we can reopen it as needed */ |
| 968 | if (s->type == FTYPE_FD) { | 1029 | if (s->type == FTYPE_FD) { |
| @@ -975,7 +1036,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) | @@ -975,7 +1036,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) | ||
| 975 | } | 1036 | } |
| 976 | 1037 | ||
| 977 | #if defined(__linux__) | 1038 | #if defined(__linux__) |
| 978 | - | ||
| 979 | /* Note: we do not have a reliable method to detect if the floppy is | 1039 | /* Note: we do not have a reliable method to detect if the floppy is |
| 980 | present. The current method is to try to open the floppy at every | 1040 | present. The current method is to try to open the floppy at every |
| 981 | I/O and to keep it opened during a few hundreds of ms. */ | 1041 | I/O and to keep it opened during a few hundreds of ms. */ |
| @@ -991,6 +1051,7 @@ static int fd_open(BlockDriverState *bs) | @@ -991,6 +1051,7 @@ static int fd_open(BlockDriverState *bs) | ||
| 991 | (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) { | 1051 | (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) { |
| 992 | close(s->fd); | 1052 | close(s->fd); |
| 993 | s->fd = -1; | 1053 | s->fd = -1; |
| 1054 | + raw_close_fd_pool(s); | ||
| 994 | #ifdef DEBUG_FLOPPY | 1055 | #ifdef DEBUG_FLOPPY |
| 995 | printf("Floppy closed\n"); | 1056 | printf("Floppy closed\n"); |
| 996 | #endif | 1057 | #endif |
| @@ -1091,6 +1152,7 @@ static int raw_eject(BlockDriverState *bs, int eject_flag) | @@ -1091,6 +1152,7 @@ static int raw_eject(BlockDriverState *bs, int eject_flag) | ||
| 1091 | if (s->fd >= 0) { | 1152 | if (s->fd >= 0) { |
| 1092 | close(s->fd); | 1153 | close(s->fd); |
| 1093 | s->fd = -1; | 1154 | s->fd = -1; |
| 1155 | + raw_close_fd_pool(s); | ||
| 1094 | } | 1156 | } |
| 1095 | fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK); | 1157 | fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK); |
| 1096 | if (fd >= 0) { | 1158 | if (fd >= 0) { |