Commit bed5cc520707ba4382444c4fb2afd428df080e6c
1 parent
0ac087f1
Align file accesses with cache=off (O_DIRECT) (Kevin Wolf, Laurent Vivier)
git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@4599 c046a42c-6fe2-441c-8c8c-71466251a162
Showing
1 changed file
with
238 additions
and
2 deletions
block-raw-posix.c
| ... | ... | @@ -70,6 +70,8 @@ |
| 70 | 70 | #define FTYPE_CD 1 |
| 71 | 71 | #define FTYPE_FD 2 |
| 72 | 72 | |
| 73 | +#define ALIGNED_BUFFER_SIZE (32 * 512) | |
| 74 | + | |
| 73 | 75 | /* if the FD is not accessed during that time (in ms), we try to |
| 74 | 76 | reopen it to see if the disk has been changed */ |
| 75 | 77 | #define FD_OPEN_TIMEOUT 1000 |
| ... | ... | @@ -86,6 +88,9 @@ typedef struct BDRVRawState { |
| 86 | 88 | int fd_got_error; |
| 87 | 89 | int fd_media_changed; |
| 88 | 90 | #endif |
| 91 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | |
| 92 | + uint8_t* aligned_buf; | |
| 93 | +#endif | |
| 89 | 94 | } BDRVRawState; |
| 90 | 95 | |
| 91 | 96 | static int fd_open(BlockDriverState *bs); |
| ... | ... | @@ -121,6 +126,17 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) |
| 121 | 126 | return ret; |
| 122 | 127 | } |
| 123 | 128 | s->fd = fd; |
| 129 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | |
| 130 | + s->aligned_buf = NULL; | |
| 131 | + if (flags & BDRV_O_DIRECT) { | |
| 132 | + s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE); | |
| 133 | + if (s->aligned_buf == NULL) { | |
| 134 | + ret = -errno; | |
| 135 | + close(fd); | |
| 136 | + return ret; | |
| 137 | + } | |
| 138 | + } | |
| 139 | +#endif | |
| 124 | 140 | return 0; |
| 125 | 141 | } |
| 126 | 142 | |
| ... | ... | @@ -141,7 +157,14 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) |
| 141 | 157 | #endif |
| 142 | 158 | */ |
| 143 | 159 | |
| 144 | -static int raw_pread(BlockDriverState *bs, int64_t offset, | |
| 160 | +/* | |
| 161 | + * offset and count are in bytes, but must be multiples of 512 for files | |
| 162 | + * opened with O_DIRECT. buf must be aligned to 512 bytes then. | |
| 163 | + * | |
| 164 | + * This function may be called without alignment if the caller ensures | |
| 165 | + * that O_DIRECT is not in effect. | |
| 166 | + */ | |
| 167 | +static int raw_pread_aligned(BlockDriverState *bs, int64_t offset, | |
| 145 | 168 | uint8_t *buf, int count) |
| 146 | 169 | { |
| 147 | 170 | BDRVRawState *s = bs->opaque; |
| ... | ... | @@ -194,7 +217,14 @@ label__raw_read__success: |
| 194 | 217 | return ret; |
| 195 | 218 | } |
| 196 | 219 | |
| 197 | -static int raw_pwrite(BlockDriverState *bs, int64_t offset, | |
| 220 | +/* | |
| 221 | + * offset and count are in bytes, but must be multiples of 512 for files | |
| 222 | + * opened with O_DIRECT. buf must be aligned to 512 bytes then. | |
| 223 | + * | |
| 224 | + * This function may be called without alignment if the caller ensures | |
| 225 | + * that O_DIRECT is not in effect. | |
| 226 | + */ | |
| 227 | +static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset, | |
| 198 | 228 | const uint8_t *buf, int count) |
| 199 | 229 | { |
| 200 | 230 | BDRVRawState *s = bs->opaque; |
| ... | ... | @@ -230,6 +260,164 @@ label__raw_write__success: |
| 230 | 260 | return ret; |
| 231 | 261 | } |
| 232 | 262 | |
| 263 | + | |
| 264 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | |
| 265 | +/* | |
| 266 | + * offset and count are in bytes and possibly not aligned. For files opened | |
| 267 | + * with O_DIRECT, necessary alignments are ensured before calling | |
| 268 | + * raw_pread_aligned to do the actual read. | |
| 269 | + */ | |
| 270 | +static int raw_pread(BlockDriverState *bs, int64_t offset, | |
| 271 | + uint8_t *buf, int count) | |
| 272 | +{ | |
| 273 | + BDRVRawState *s = bs->opaque; | |
| 274 | + int size, ret, shift, sum; | |
| 275 | + | |
| 276 | + sum = 0; | |
| 277 | + | |
| 278 | + if (s->aligned_buf != NULL) { | |
| 279 | + | |
| 280 | + if (offset & 0x1ff) { | |
| 281 | + /* align offset on a 512 bytes boundary */ | |
| 282 | + | |
| 283 | + shift = offset & 0x1ff; | |
| 284 | + size = (shift + count + 0x1ff) & ~0x1ff; | |
| 285 | + if (size > ALIGNED_BUFFER_SIZE) | |
| 286 | + size = ALIGNED_BUFFER_SIZE; | |
| 287 | + ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size); | |
| 288 | + if (ret < 0) | |
| 289 | + return ret; | |
| 290 | + | |
| 291 | + size = 512 - shift; | |
| 292 | + if (size > count) | |
| 293 | + size = count; | |
| 294 | + memcpy(buf, s->aligned_buf + shift, size); | |
| 295 | + | |
| 296 | + buf += size; | |
| 297 | + offset += size; | |
| 298 | + count -= size; | |
| 299 | + sum += size; | |
| 300 | + | |
| 301 | + if (count == 0) | |
| 302 | + return sum; | |
| 303 | + } | |
| 304 | + if (count & 0x1ff || (uintptr_t) buf & 0x1ff) { | |
| 305 | + | |
| 306 | + /* read on aligned buffer */ | |
| 307 | + | |
| 308 | + while (count) { | |
| 309 | + | |
| 310 | + size = (count + 0x1ff) & ~0x1ff; | |
| 311 | + if (size > ALIGNED_BUFFER_SIZE) | |
| 312 | + size = ALIGNED_BUFFER_SIZE; | |
| 313 | + | |
| 314 | + ret = raw_pread_aligned(bs, offset, s->aligned_buf, size); | |
| 315 | + if (ret < 0) | |
| 316 | + return ret; | |
| 317 | + | |
| 318 | + size = ret; | |
| 319 | + if (size > count) | |
| 320 | + size = count; | |
| 321 | + | |
| 322 | + memcpy(buf, s->aligned_buf, size); | |
| 323 | + | |
| 324 | + buf += size; | |
| 325 | + offset += size; | |
| 326 | + count -= size; | |
| 327 | + sum += size; | |
| 328 | + } | |
| 329 | + | |
| 330 | + return sum; | |
| 331 | + } | |
| 332 | + } | |
| 333 | + | |
| 334 | + return raw_pread_aligned(bs, offset, buf, count) + sum; | |
| 335 | +} | |
| 336 | + | |
| 337 | +/* | |
| 338 | + * offset and count are in bytes and possibly not aligned. For files opened | |
| 339 | + * with O_DIRECT, necessary alignments are ensured before calling | |
| 340 | + * raw_pwrite_aligned to do the actual write. | |
| 341 | + */ | |
| 342 | +static int raw_pwrite(BlockDriverState *bs, int64_t offset, | |
| 343 | + const uint8_t *buf, int count) | |
| 344 | +{ | |
| 345 | + BDRVRawState *s = bs->opaque; | |
| 346 | + int size, ret, shift, sum; | |
| 347 | + | |
| 348 | + sum = 0; | |
| 349 | + | |
| 350 | + if (s->aligned_buf != NULL) { | |
| 351 | + | |
| 352 | + if (offset & 0x1ff) { | |
| 353 | + /* align offset on a 512 bytes boundary */ | |
| 354 | + shift = offset & 0x1ff; | |
| 355 | + ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512); | |
| 356 | + if (ret < 0) | |
| 357 | + return ret; | |
| 358 | + | |
| 359 | + size = 512 - shift; | |
| 360 | + if (size > count) | |
| 361 | + size = count; | |
| 362 | + memcpy(s->aligned_buf + shift, buf, size); | |
| 363 | + | |
| 364 | + ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512); | |
| 365 | + if (ret < 0) | |
| 366 | + return ret; | |
| 367 | + | |
| 368 | + buf += size; | |
| 369 | + offset += size; | |
| 370 | + count -= size; | |
| 371 | + sum += size; | |
| 372 | + | |
| 373 | + if (count == 0) | |
| 374 | + return sum; | |
| 375 | + } | |
| 376 | + if (count & 0x1ff || (uintptr_t) buf & 0x1ff) { | |
| 377 | + | |
| 378 | + while ((size = (count & ~0x1ff)) != 0) { | |
| 379 | + | |
| 380 | + if (size > ALIGNED_BUFFER_SIZE) | |
| 381 | + size = ALIGNED_BUFFER_SIZE; | |
| 382 | + | |
| 383 | + memcpy(s->aligned_buf, buf, size); | |
| 384 | + | |
| 385 | + ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, size); | |
| 386 | + if (ret < 0) | |
| 387 | + return ret; | |
| 388 | + | |
| 389 | + buf += ret; | |
| 390 | + offset += ret; | |
| 391 | + count -= ret; | |
| 392 | + sum += ret; | |
| 393 | + } | |
| 394 | + /* here, count < 512 because (count & ~0x1ff) == 0 */ | |
| 395 | + if (count) { | |
| 396 | + ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512); | |
| 397 | + if (ret < 0) | |
| 398 | + return ret; | |
| 399 | + memcpy(s->aligned_buf, buf, count); | |
| 400 | + | |
| 401 | + ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512); | |
| 402 | + if (ret < 0) | |
| 403 | + return ret; | |
| 404 | + if (count < ret) | |
| 405 | + ret = count; | |
| 406 | + | |
| 407 | + sum += ret; | |
| 408 | + } | |
| 409 | + return sum; | |
| 410 | + } | |
| 411 | + } | |
| 412 | + return raw_pwrite_aligned(bs, offset, buf, count) + sum; | |
| 413 | +} | |
| 414 | + | |
| 415 | +#else | |
| 416 | +#define raw_pread raw_pread_aligned | |
| 417 | +#define raw_pwrite raw_pwrite_aligned | |
| 418 | +#endif | |
| 419 | + | |
| 420 | + | |
| 233 | 421 | /***********************************************************/ |
| 234 | 422 | /* Unix AIO using POSIX AIO */ |
| 235 | 423 | |
| ... | ... | @@ -237,6 +425,7 @@ typedef struct RawAIOCB { |
| 237 | 425 | BlockDriverAIOCB common; |
| 238 | 426 | struct aiocb aiocb; |
| 239 | 427 | struct RawAIOCB *next; |
| 428 | + int ret; | |
| 240 | 429 | } RawAIOCB; |
| 241 | 430 | |
| 242 | 431 | static int aio_sig_num = SIGUSR2; |
| ... | ... | @@ -397,12 +586,38 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, |
| 397 | 586 | return acb; |
| 398 | 587 | } |
| 399 | 588 | |
| 589 | +#ifndef QEMU_IMG | |
| 590 | +static void raw_aio_em_cb(void* opaque) | |
| 591 | +{ | |
| 592 | + RawAIOCB *acb = opaque; | |
| 593 | + acb->common.cb(acb->common.opaque, acb->ret); | |
| 594 | + qemu_aio_release(acb); | |
| 595 | +} | |
| 596 | +#endif | |
| 597 | + | |
| 400 | 598 | static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, |
| 401 | 599 | int64_t sector_num, uint8_t *buf, int nb_sectors, |
| 402 | 600 | BlockDriverCompletionFunc *cb, void *opaque) |
| 403 | 601 | { |
| 404 | 602 | RawAIOCB *acb; |
| 405 | 603 | |
| 604 | + /* | |
| 605 | + * If O_DIRECT is used and the buffer is not aligned fall back | |
| 606 | + * to synchronous IO. | |
| 607 | + */ | |
| 608 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | |
| 609 | + BDRVRawState *s = bs->opaque; | |
| 610 | + | |
| 611 | + if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) { | |
| 612 | + QEMUBH *bh; | |
| 613 | + acb = qemu_aio_get(bs, cb, opaque); | |
| 614 | + acb->ret = raw_pread(bs, 512 * sector_num, buf, 512 * nb_sectors); | |
| 615 | + bh = qemu_bh_new(raw_aio_em_cb, acb); | |
| 616 | + qemu_bh_schedule(bh); | |
| 617 | + return &acb->common; | |
| 618 | + } | |
| 619 | +#endif | |
| 620 | + | |
| 406 | 621 | acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); |
| 407 | 622 | if (!acb) |
| 408 | 623 | return NULL; |
| ... | ... | @@ -419,6 +634,23 @@ static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs, |
| 419 | 634 | { |
| 420 | 635 | RawAIOCB *acb; |
| 421 | 636 | |
| 637 | + /* | |
| 638 | + * If O_DIRECT is used and the buffer is not aligned fall back | |
| 639 | + * to synchronous IO. | |
| 640 | + */ | |
| 641 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | |
| 642 | + BDRVRawState *s = bs->opaque; | |
| 643 | + | |
| 644 | + if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) { | |
| 645 | + QEMUBH *bh; | |
| 646 | + acb = qemu_aio_get(bs, cb, opaque); | |
| 647 | + acb->ret = raw_pwrite(bs, 512 * sector_num, buf, 512 * nb_sectors); | |
| 648 | + bh = qemu_bh_new(raw_aio_em_cb, acb); | |
| 649 | + qemu_bh_schedule(bh); | |
| 650 | + return &acb->common; | |
| 651 | + } | |
| 652 | +#endif | |
| 653 | + | |
| 422 | 654 | acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque); |
| 423 | 655 | if (!acb) |
| 424 | 656 | return NULL; |
| ... | ... | @@ -462,6 +694,10 @@ static void raw_close(BlockDriverState *bs) |
| 462 | 694 | if (s->fd >= 0) { |
| 463 | 695 | close(s->fd); |
| 464 | 696 | s->fd = -1; |
| 697 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | |
| 698 | + if (s->aligned_buf != NULL) | |
| 699 | + qemu_free(s->aligned_buf); | |
| 700 | +#endif | |
| 465 | 701 | } |
| 466 | 702 | } |
| 467 | 703 | ... | ... |