Commit bed5cc520707ba4382444c4fb2afd428df080e6c

Authored by bellard
1 parent 0ac087f1

Align file accesses with cache=off (O_DIRECT) (Kevin Wolf, Laurent Vivier)

git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@4599 c046a42c-6fe2-441c-8c8c-71466251a162
Showing 1 changed file with 238 additions and 2 deletions
block-raw-posix.c
... ... @@ -70,6 +70,8 @@
70 70 #define FTYPE_CD 1
71 71 #define FTYPE_FD 2
72 72  
  73 +#define ALIGNED_BUFFER_SIZE (32 * 512)
  74 +
73 75 /* if the FD is not accessed during that time (in ms), we try to
74 76 reopen it to see if the disk has been changed */
75 77 #define FD_OPEN_TIMEOUT 1000
... ... @@ -86,6 +88,9 @@ typedef struct BDRVRawState {
86 88 int fd_got_error;
87 89 int fd_media_changed;
88 90 #endif
  91 +#if defined(O_DIRECT) && !defined(QEMU_IMG)
  92 + uint8_t* aligned_buf;
  93 +#endif
89 94 } BDRVRawState;
90 95  
91 96 static int fd_open(BlockDriverState *bs);
... ... @@ -121,6 +126,17 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
121 126 return ret;
122 127 }
123 128 s->fd = fd;
  129 +#if defined(O_DIRECT) && !defined(QEMU_IMG)
  130 + s->aligned_buf = NULL;
  131 + if (flags & BDRV_O_DIRECT) {
  132 + s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE);
  133 + if (s->aligned_buf == NULL) {
  134 + ret = -errno;
  135 + close(fd);
  136 + return ret;
  137 + }
  138 + }
  139 +#endif
124 140 return 0;
125 141 }
126 142  
... ... @@ -141,7 +157,14 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
141 157 #endif
142 158 */
143 159  
144   -static int raw_pread(BlockDriverState *bs, int64_t offset,
  160 +/*
  161 + * offset and count are in bytes, but must be multiples of 512 for files
  162 + * opened with O_DIRECT. buf must be aligned to 512 bytes then.
  163 + *
  164 + * This function may be called without alignment if the caller ensures
  165 + * that O_DIRECT is not in effect.
  166 + */
  167 +static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
145 168 uint8_t *buf, int count)
146 169 {
147 170 BDRVRawState *s = bs->opaque;
... ... @@ -194,7 +217,14 @@ label__raw_read__success:
194 217 return ret;
195 218 }
196 219  
197   -static int raw_pwrite(BlockDriverState *bs, int64_t offset,
  220 +/*
  221 + * offset and count are in bytes, but must be multiples of 512 for files
  222 + * opened with O_DIRECT. buf must be aligned to 512 bytes then.
  223 + *
  224 + * This function may be called without alignment if the caller ensures
  225 + * that O_DIRECT is not in effect.
  226 + */
  227 +static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset,
198 228 const uint8_t *buf, int count)
199 229 {
200 230 BDRVRawState *s = bs->opaque;
... ... @@ -230,6 +260,164 @@ label__raw_write__success:
230 260 return ret;
231 261 }
232 262  
  263 +
  264 +#if defined(O_DIRECT) && !defined(QEMU_IMG)
  265 +/*
  266 + * offset and count are in bytes and possibly not aligned. For files opened
  267 + * with O_DIRECT, necessary alignments are ensured before calling
  268 + * raw_pread_aligned to do the actual read.
  269 + */
  270 +static int raw_pread(BlockDriverState *bs, int64_t offset,
  271 + uint8_t *buf, int count)
  272 +{
  273 + BDRVRawState *s = bs->opaque;
  274 + int size, ret, shift, sum;
  275 +
  276 + sum = 0;
  277 +
  278 + if (s->aligned_buf != NULL) {
  279 +
  280 + if (offset & 0x1ff) {
  281 + /* align offset on a 512 bytes boundary */
  282 +
  283 + shift = offset & 0x1ff;
  284 + size = (shift + count + 0x1ff) & ~0x1ff;
  285 + if (size > ALIGNED_BUFFER_SIZE)
  286 + size = ALIGNED_BUFFER_SIZE;
  287 + ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size);
  288 + if (ret < 0)
  289 + return ret;
  290 +
  291 + size = 512 - shift;
  292 + if (size > count)
  293 + size = count;
  294 + memcpy(buf, s->aligned_buf + shift, size);
  295 +
  296 + buf += size;
  297 + offset += size;
  298 + count -= size;
  299 + sum += size;
  300 +
  301 + if (count == 0)
  302 + return sum;
  303 + }
  304 + if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
  305 +
  306 + /* read on aligned buffer */
  307 +
  308 + while (count) {
  309 +
  310 + size = (count + 0x1ff) & ~0x1ff;
  311 + if (size > ALIGNED_BUFFER_SIZE)
  312 + size = ALIGNED_BUFFER_SIZE;
  313 +
  314 + ret = raw_pread_aligned(bs, offset, s->aligned_buf, size);
  315 + if (ret < 0)
  316 + return ret;
  317 +
  318 + size = ret;
  319 + if (size > count)
  320 + size = count;
  321 +
  322 + memcpy(buf, s->aligned_buf, size);
  323 +
  324 + buf += size;
  325 + offset += size;
  326 + count -= size;
  327 + sum += size;
  328 + }
  329 +
  330 + return sum;
  331 + }
  332 + }
  333 +
  334 + return raw_pread_aligned(bs, offset, buf, count) + sum;
  335 +}
  336 +
  337 +/*
  338 + * offset and count are in bytes and possibly not aligned. For files opened
  339 + * with O_DIRECT, necessary alignments are ensured before calling
  340 + * raw_pwrite_aligned to do the actual write.
  341 + */
  342 +static int raw_pwrite(BlockDriverState *bs, int64_t offset,
  343 + const uint8_t *buf, int count)
  344 +{
  345 + BDRVRawState *s = bs->opaque;
  346 + int size, ret, shift, sum;
  347 +
  348 + sum = 0;
  349 +
  350 + if (s->aligned_buf != NULL) {
  351 +
  352 + if (offset & 0x1ff) {
  353 + /* align offset on a 512 bytes boundary */
  354 + shift = offset & 0x1ff;
  355 + ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512);
  356 + if (ret < 0)
  357 + return ret;
  358 +
  359 + size = 512 - shift;
  360 + if (size > count)
  361 + size = count;
  362 + memcpy(s->aligned_buf + shift, buf, size);
  363 +
  364 + ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512);
  365 + if (ret < 0)
  366 + return ret;
  367 +
  368 + buf += size;
  369 + offset += size;
  370 + count -= size;
  371 + sum += size;
  372 +
  373 + if (count == 0)
  374 + return sum;
  375 + }
  376 + if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
  377 +
  378 + while ((size = (count & ~0x1ff)) != 0) {
  379 +
  380 + if (size > ALIGNED_BUFFER_SIZE)
  381 + size = ALIGNED_BUFFER_SIZE;
  382 +
  383 + memcpy(s->aligned_buf, buf, size);
  384 +
  385 + ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, size);
  386 + if (ret < 0)
  387 + return ret;
  388 +
  389 + buf += ret;
  390 + offset += ret;
  391 + count -= ret;
  392 + sum += ret;
  393 + }
  394 + /* here, count < 512 because (count & ~0x1ff) == 0 */
  395 + if (count) {
  396 + ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512);
  397 + if (ret < 0)
  398 + return ret;
  399 + memcpy(s->aligned_buf, buf, count);
  400 +
  401 + ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512);
  402 + if (ret < 0)
  403 + return ret;
  404 + if (count < ret)
  405 + ret = count;
  406 +
  407 + sum += ret;
  408 + }
  409 + return sum;
  410 + }
  411 + }
  412 + return raw_pwrite_aligned(bs, offset, buf, count) + sum;
  413 +}
  414 +
  415 +#else
  416 +#define raw_pread raw_pread_aligned
  417 +#define raw_pwrite raw_pwrite_aligned
  418 +#endif
  419 +
  420 +
233 421 /***********************************************************/
234 422 /* Unix AIO using POSIX AIO */
235 423  
... ... @@ -237,6 +425,7 @@ typedef struct RawAIOCB {
237 425 BlockDriverAIOCB common;
238 426 struct aiocb aiocb;
239 427 struct RawAIOCB *next;
  428 + int ret;
240 429 } RawAIOCB;
241 430  
242 431 static int aio_sig_num = SIGUSR2;
... ... @@ -397,12 +586,38 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs,
397 586 return acb;
398 587 }
399 588  
  589 +#ifndef QEMU_IMG
  590 +static void raw_aio_em_cb(void* opaque)
  591 +{
  592 + RawAIOCB *acb = opaque;
  593 + acb->common.cb(acb->common.opaque, acb->ret);
  594 + qemu_aio_release(acb);
  595 +}
  596 +#endif
  597 +
400 598 static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs,
401 599 int64_t sector_num, uint8_t *buf, int nb_sectors,
402 600 BlockDriverCompletionFunc *cb, void *opaque)
403 601 {
404 602 RawAIOCB *acb;
405 603  
  604 + /*
  605 + * If O_DIRECT is used and the buffer is not aligned fall back
  606 + * to synchronous IO.
  607 + */
  608 +#if defined(O_DIRECT) && !defined(QEMU_IMG)
  609 + BDRVRawState *s = bs->opaque;
  610 +
  611 + if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) {
  612 + QEMUBH *bh;
  613 + acb = qemu_aio_get(bs, cb, opaque);
  614 + acb->ret = raw_pread(bs, 512 * sector_num, buf, 512 * nb_sectors);
  615 + bh = qemu_bh_new(raw_aio_em_cb, acb);
  616 + qemu_bh_schedule(bh);
  617 + return &acb->common;
  618 + }
  619 +#endif
  620 +
406 621 acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
407 622 if (!acb)
408 623 return NULL;
... ... @@ -419,6 +634,23 @@ static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs,
419 634 {
420 635 RawAIOCB *acb;
421 636  
  637 + /*
  638 + * If O_DIRECT is used and the buffer is not aligned fall back
  639 + * to synchronous IO.
  640 + */
  641 +#if defined(O_DIRECT) && !defined(QEMU_IMG)
  642 + BDRVRawState *s = bs->opaque;
  643 +
  644 + if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) {
  645 + QEMUBH *bh;
  646 + acb = qemu_aio_get(bs, cb, opaque);
  647 + acb->ret = raw_pwrite(bs, 512 * sector_num, buf, 512 * nb_sectors);
  648 + bh = qemu_bh_new(raw_aio_em_cb, acb);
  649 + qemu_bh_schedule(bh);
  650 + return &acb->common;
  651 + }
  652 +#endif
  653 +
422 654 acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
423 655 if (!acb)
424 656 return NULL;
... ... @@ -462,6 +694,10 @@ static void raw_close(BlockDriverState *bs)
462 694 if (s->fd >= 0) {
463 695 close(s->fd);
464 696 s->fd = -1;
  697 +#if defined(O_DIRECT) && !defined(QEMU_IMG)
  698 + if (s->aligned_buf != NULL)
  699 + qemu_free(s->aligned_buf);
  700 +#endif
465 701 }
466 702 }
467 703  
... ...