Commit bed5cc520707ba4382444c4fb2afd428df080e6c
1 parent
0ac087f1
Align file accesses with cache=off (O_DIRECT) (Kevin Wolf, Laurent Vivier)
git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@4599 c046a42c-6fe2-441c-8c8c-71466251a162
Showing 1 changed file with 238 additions and 2 deletions
block-raw-posix.c
| @@ -70,6 +70,8 @@ | @@ -70,6 +70,8 @@ | ||
| 70 | #define FTYPE_CD 1 | 70 | #define FTYPE_CD 1 |
| 71 | #define FTYPE_FD 2 | 71 | #define FTYPE_FD 2 |
| 72 | 72 | ||
| 73 | +#define ALIGNED_BUFFER_SIZE (32 * 512) | ||
| 74 | + | ||
| 73 | /* if the FD is not accessed during that time (in ms), we try to | 75 | /* if the FD is not accessed during that time (in ms), we try to |
| 74 | reopen it to see if the disk has been changed */ | 76 | reopen it to see if the disk has been changed */ |
| 75 | #define FD_OPEN_TIMEOUT 1000 | 77 | #define FD_OPEN_TIMEOUT 1000 |
| @@ -86,6 +88,9 @@ typedef struct BDRVRawState { | @@ -86,6 +88,9 @@ typedef struct BDRVRawState { | ||
| 86 | int fd_got_error; | 88 | int fd_got_error; |
| 87 | int fd_media_changed; | 89 | int fd_media_changed; |
| 88 | #endif | 90 | #endif |
| 91 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | ||
| 92 | + uint8_t* aligned_buf; | ||
| 93 | +#endif | ||
| 89 | } BDRVRawState; | 94 | } BDRVRawState; |
| 90 | 95 | ||
| 91 | static int fd_open(BlockDriverState *bs); | 96 | static int fd_open(BlockDriverState *bs); |
| @@ -121,6 +126,17 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) | @@ -121,6 +126,17 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) | ||
| 121 | return ret; | 126 | return ret; |
| 122 | } | 127 | } |
| 123 | s->fd = fd; | 128 | s->fd = fd; |
| 129 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | ||
| 130 | + s->aligned_buf = NULL; | ||
| 131 | + if (flags & BDRV_O_DIRECT) { | ||
| 132 | + s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE); | ||
| 133 | + if (s->aligned_buf == NULL) { | ||
| 134 | + ret = -errno; | ||
| 135 | + close(fd); | ||
| 136 | + return ret; | ||
| 137 | + } | ||
| 138 | + } | ||
| 139 | +#endif | ||
| 124 | return 0; | 140 | return 0; |
| 125 | } | 141 | } |
| 126 | 142 | ||
| @@ -141,7 +157,14 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) | @@ -141,7 +157,14 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) | ||
| 141 | #endif | 157 | #endif |
| 142 | */ | 158 | */ |
| 143 | 159 | ||
| 144 | -static int raw_pread(BlockDriverState *bs, int64_t offset, | 160 | +/* |
| 161 | + * offset and count are in bytes, but must be multiples of 512 for files | ||
| 162 | + * opened with O_DIRECT. In that case buf must also be aligned to 512 bytes. | |
| 163 | + * | ||
| 164 | + * This function may be called without alignment if the caller ensures | ||
| 165 | + * that O_DIRECT is not in effect. | ||
| 166 | + */ | ||
| 167 | +static int raw_pread_aligned(BlockDriverState *bs, int64_t offset, | ||
| 145 | uint8_t *buf, int count) | 168 | uint8_t *buf, int count) |
| 146 | { | 169 | { |
| 147 | BDRVRawState *s = bs->opaque; | 170 | BDRVRawState *s = bs->opaque; |
| @@ -194,7 +217,14 @@ label__raw_read__success: | @@ -194,7 +217,14 @@ label__raw_read__success: | ||
| 194 | return ret; | 217 | return ret; |
| 195 | } | 218 | } |
| 196 | 219 | ||
| 197 | -static int raw_pwrite(BlockDriverState *bs, int64_t offset, | 220 | +/* |
| 221 | + * offset and count are in bytes, but must be multiples of 512 for files | ||
| 222 | + * opened with O_DIRECT. In that case buf must also be aligned to 512 bytes. | |
| 223 | + * | ||
| 224 | + * This function may be called without alignment if the caller ensures | ||
| 225 | + * that O_DIRECT is not in effect. | ||
| 226 | + */ | ||
| 227 | +static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset, | ||
| 198 | const uint8_t *buf, int count) | 228 | const uint8_t *buf, int count) |
| 199 | { | 229 | { |
| 200 | BDRVRawState *s = bs->opaque; | 230 | BDRVRawState *s = bs->opaque; |
| @@ -230,6 +260,164 @@ label__raw_write__success: | @@ -230,6 +260,164 @@ label__raw_write__success: | ||
| 230 | return ret; | 260 | return ret; |
| 231 | } | 261 | } |
| 232 | 262 | ||
| 263 | + | ||
| 264 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | ||
| 265 | +/* | ||
| 266 | + * offset and count are in bytes and possibly not aligned. For files opened | ||
| 267 | + * with O_DIRECT, necessary alignments are ensured before calling | ||
| 268 | + * raw_pread_aligned to do the actual read. | ||
| 269 | + */ | ||
| 270 | +static int raw_pread(BlockDriverState *bs, int64_t offset, | ||
| 271 | + uint8_t *buf, int count) | ||
| 272 | +{ | ||
| 273 | + BDRVRawState *s = bs->opaque; | ||
| 274 | + int size, ret, shift, sum; | ||
| 275 | + | ||
| 276 | + sum = 0; | ||
| 277 | + | ||
| 278 | + if (s->aligned_buf != NULL) { | ||
| 279 | + | ||
| 280 | + if (offset & 0x1ff) { | ||
| 281 | + /* align offset on a 512-byte boundary */ | |
| 282 | + | ||
| 283 | + shift = offset & 0x1ff; | ||
| 284 | + size = (shift + count + 0x1ff) & ~0x1ff; | ||
| 285 | + if (size > ALIGNED_BUFFER_SIZE) | ||
| 286 | + size = ALIGNED_BUFFER_SIZE; | ||
| 287 | + ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size); | ||
| 288 | + if (ret < 0) | ||
| 289 | + return ret; | ||
| 290 | + | ||
| 291 | + size = 512 - shift; | ||
| 292 | + if (size > count) | ||
| 293 | + size = count; | ||
| 294 | + memcpy(buf, s->aligned_buf + shift, size); | ||
| 295 | + | ||
| 296 | + buf += size; | ||
| 297 | + offset += size; | ||
| 298 | + count -= size; | ||
| 299 | + sum += size; | ||
| 300 | + | ||
| 301 | + if (count == 0) | ||
| 302 | + return sum; | ||
| 303 | + } | ||
| 304 | + if (count & 0x1ff || (uintptr_t) buf & 0x1ff) { | ||
| 305 | + | ||
| 306 | + /* read on aligned buffer */ | ||
| 307 | + | ||
| 308 | + while (count) { | ||
| 309 | + | ||
| 310 | + size = (count + 0x1ff) & ~0x1ff; | ||
| 311 | + if (size > ALIGNED_BUFFER_SIZE) | ||
| 312 | + size = ALIGNED_BUFFER_SIZE; | ||
| 313 | + | ||
| 314 | + ret = raw_pread_aligned(bs, offset, s->aligned_buf, size); | ||
| 315 | + if (ret < 0) | ||
| 316 | + return ret; | ||
| 317 | + | ||
| 318 | + size = ret; | ||
| 319 | + if (size > count) | ||
| 320 | + size = count; | ||
| 321 | + | ||
| 322 | + memcpy(buf, s->aligned_buf, size); | ||
| 323 | + | ||
| 324 | + buf += size; | ||
| 325 | + offset += size; | ||
| 326 | + count -= size; | ||
| 327 | + sum += size; | ||
| 328 | + } | ||
| 329 | + | ||
| 330 | + return sum; | ||
| 331 | + } | ||
| 332 | + } | ||
| 333 | + | ||
| 334 | + return raw_pread_aligned(bs, offset, buf, count) + sum; | ||
| 335 | +} | ||
| 336 | + | ||
| 337 | +/* | ||
| 338 | + * offset and count are in bytes and possibly not aligned. For files opened | ||
| 339 | + * with O_DIRECT, necessary alignments are ensured before calling | ||
| 340 | + * raw_pwrite_aligned to do the actual write. | ||
| 341 | + */ | ||
| 342 | +static int raw_pwrite(BlockDriverState *bs, int64_t offset, | ||
| 343 | + const uint8_t *buf, int count) | ||
| 344 | +{ | ||
| 345 | + BDRVRawState *s = bs->opaque; | ||
| 346 | + int size, ret, shift, sum; | ||
| 347 | + | ||
| 348 | + sum = 0; | ||
| 349 | + | ||
| 350 | + if (s->aligned_buf != NULL) { | ||
| 351 | + | ||
| 352 | + if (offset & 0x1ff) { | ||
| 353 | + /* align offset on a 512-byte boundary */ | |
| 354 | + shift = offset & 0x1ff; | ||
| 355 | + ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512); | ||
| 356 | + if (ret < 0) | ||
| 357 | + return ret; | ||
| 358 | + | ||
| 359 | + size = 512 - shift; | ||
| 360 | + if (size > count) | ||
| 361 | + size = count; | ||
| 362 | + memcpy(s->aligned_buf + shift, buf, size); | ||
| 363 | + | ||
| 364 | + ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512); | ||
| 365 | + if (ret < 0) | ||
| 366 | + return ret; | ||
| 367 | + | ||
| 368 | + buf += size; | ||
| 369 | + offset += size; | ||
| 370 | + count -= size; | ||
| 371 | + sum += size; | ||
| 372 | + | ||
| 373 | + if (count == 0) | ||
| 374 | + return sum; | ||
| 375 | + } | ||
| 376 | + if (count & 0x1ff || (uintptr_t) buf & 0x1ff) { | ||
| 377 | + | ||
| 378 | + while ((size = (count & ~0x1ff)) != 0) { | ||
| 379 | + | ||
| 380 | + if (size > ALIGNED_BUFFER_SIZE) | ||
| 381 | + size = ALIGNED_BUFFER_SIZE; | ||
| 382 | + | ||
| 383 | + memcpy(s->aligned_buf, buf, size); | ||
| 384 | + | ||
| 385 | + ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, size); | ||
| 386 | + if (ret < 0) | ||
| 387 | + return ret; | ||
| 388 | + | ||
| 389 | + buf += ret; | ||
| 390 | + offset += ret; | ||
| 391 | + count -= ret; | ||
| 392 | + sum += ret; | ||
| 393 | + } | ||
| 394 | + /* here, count < 512 because (count & ~0x1ff) == 0 */ | ||
| 395 | + if (count) { | ||
| 396 | + ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512); | ||
| 397 | + if (ret < 0) | ||
| 398 | + return ret; | ||
| 399 | + memcpy(s->aligned_buf, buf, count); | ||
| 400 | + | ||
| 401 | + ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512); | ||
| 402 | + if (ret < 0) | ||
| 403 | + return ret; | ||
| 404 | + if (count < ret) | ||
| 405 | + ret = count; | ||
| 406 | + | ||
| 407 | + sum += ret; | ||
| 408 | + } | ||
| 409 | + return sum; | ||
| 410 | + } | ||
| 411 | + } | ||
| 412 | + return raw_pwrite_aligned(bs, offset, buf, count) + sum; | ||
| 413 | +} | ||
| 414 | + | ||
| 415 | +#else | ||
| 416 | +#define raw_pread raw_pread_aligned | ||
| 417 | +#define raw_pwrite raw_pwrite_aligned | ||
| 418 | +#endif | ||
| 419 | + | ||
| 420 | + | ||
| 233 | /***********************************************************/ | 421 | /***********************************************************/ |
| 234 | /* Unix AIO using POSIX AIO */ | 422 | /* Unix AIO using POSIX AIO */ |
| 235 | 423 | ||
| @@ -237,6 +425,7 @@ typedef struct RawAIOCB { | @@ -237,6 +425,7 @@ typedef struct RawAIOCB { | ||
| 237 | BlockDriverAIOCB common; | 425 | BlockDriverAIOCB common; |
| 238 | struct aiocb aiocb; | 426 | struct aiocb aiocb; |
| 239 | struct RawAIOCB *next; | 427 | struct RawAIOCB *next; |
| 428 | + int ret; | ||
| 240 | } RawAIOCB; | 429 | } RawAIOCB; |
| 241 | 430 | ||
| 242 | static int aio_sig_num = SIGUSR2; | 431 | static int aio_sig_num = SIGUSR2; |
| @@ -397,12 +586,38 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, | @@ -397,12 +586,38 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, | ||
| 397 | return acb; | 586 | return acb; |
| 398 | } | 587 | } |
| 399 | 588 | ||
| 589 | +#ifndef QEMU_IMG | ||
| 590 | +static void raw_aio_em_cb(void* opaque) | ||
| 591 | +{ | ||
| 592 | + RawAIOCB *acb = opaque; | ||
| 593 | + acb->common.cb(acb->common.opaque, acb->ret); | ||
| 594 | + qemu_aio_release(acb); | ||
| 595 | +} | ||
| 596 | +#endif | ||
| 597 | + | ||
| 400 | static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, | 598 | static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, |
| 401 | int64_t sector_num, uint8_t *buf, int nb_sectors, | 599 | int64_t sector_num, uint8_t *buf, int nb_sectors, |
| 402 | BlockDriverCompletionFunc *cb, void *opaque) | 600 | BlockDriverCompletionFunc *cb, void *opaque) |
| 403 | { | 601 | { |
| 404 | RawAIOCB *acb; | 602 | RawAIOCB *acb; |
| 405 | 603 | ||
| 604 | + /* | ||
| 605 | + * If O_DIRECT is used and the buffer is not aligned, fall back | |
| 606 | + * to synchronous IO. | ||
| 607 | + */ | ||
| 608 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | ||
| 609 | + BDRVRawState *s = bs->opaque; | ||
| 610 | + | ||
| 611 | + if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) { | ||
| 612 | + QEMUBH *bh; | ||
| 613 | + acb = qemu_aio_get(bs, cb, opaque); | ||
| 614 | + acb->ret = raw_pread(bs, 512 * sector_num, buf, 512 * nb_sectors); | ||
| 615 | + bh = qemu_bh_new(raw_aio_em_cb, acb); | ||
| 616 | + qemu_bh_schedule(bh); | ||
| 617 | + return &acb->common; | ||
| 618 | + } | ||
| 619 | +#endif | ||
| 620 | + | ||
| 406 | acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); | 621 | acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); |
| 407 | if (!acb) | 622 | if (!acb) |
| 408 | return NULL; | 623 | return NULL; |
| @@ -419,6 +634,23 @@ static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs, | @@ -419,6 +634,23 @@ static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs, | ||
| 419 | { | 634 | { |
| 420 | RawAIOCB *acb; | 635 | RawAIOCB *acb; |
| 421 | 636 | ||
| 637 | + /* | ||
| 638 | + * If O_DIRECT is used and the buffer is not aligned, fall back | |
| 639 | + * to synchronous IO. | ||
| 640 | + */ | ||
| 641 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | ||
| 642 | + BDRVRawState *s = bs->opaque; | ||
| 643 | + | ||
| 644 | + if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) { | ||
| 645 | + QEMUBH *bh; | ||
| 646 | + acb = qemu_aio_get(bs, cb, opaque); | ||
| 647 | + acb->ret = raw_pwrite(bs, 512 * sector_num, buf, 512 * nb_sectors); | ||
| 648 | + bh = qemu_bh_new(raw_aio_em_cb, acb); | ||
| 649 | + qemu_bh_schedule(bh); | ||
| 650 | + return &acb->common; | ||
| 651 | + } | ||
| 652 | +#endif | ||
| 653 | + | ||
| 422 | acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque); | 654 | acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque); |
| 423 | if (!acb) | 655 | if (!acb) |
| 424 | return NULL; | 656 | return NULL; |
| @@ -462,6 +694,10 @@ static void raw_close(BlockDriverState *bs) | @@ -462,6 +694,10 @@ static void raw_close(BlockDriverState *bs) | ||
| 462 | if (s->fd >= 0) { | 694 | if (s->fd >= 0) { |
| 463 | close(s->fd); | 695 | close(s->fd); |
| 464 | s->fd = -1; | 696 | s->fd = -1; |
| 697 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | ||
| 698 | + if (s->aligned_buf != NULL) | ||
| 699 | + qemu_free(s->aligned_buf); | ||
| 700 | +#endif | ||
| 465 | } | 701 | } |
| 466 | } | 702 | } |
| 467 | 703 |