Commit bed5cc520707ba4382444c4fb2afd428df080e6c
1 parent
0ac087f1
Align file accesses with cache=off (O_DIRECT) (Kevin Wolf, Laurent Vivier)
git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@4599 c046a42c-6fe2-441c-8c8c-71466251a162
Showing
1 changed file
with
238 additions
and
2 deletions
block-raw-posix.c
... | ... | @@ -70,6 +70,8 @@ |
70 | 70 | #define FTYPE_CD 1 |
71 | 71 | #define FTYPE_FD 2 |
72 | 72 | |
73 | +#define ALIGNED_BUFFER_SIZE (32 * 512) | |
74 | + | |
73 | 75 | /* if the FD is not accessed during that time (in ms), we try to |
74 | 76 | reopen it to see if the disk has been changed */ |
75 | 77 | #define FD_OPEN_TIMEOUT 1000 |
... | ... | @@ -86,6 +88,9 @@ typedef struct BDRVRawState { |
86 | 88 | int fd_got_error; |
87 | 89 | int fd_media_changed; |
88 | 90 | #endif |
91 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | |
92 | + uint8_t* aligned_buf; | |
93 | +#endif | |
89 | 94 | } BDRVRawState; |
90 | 95 | |
91 | 96 | static int fd_open(BlockDriverState *bs); |
... | ... | @@ -121,6 +126,17 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) |
121 | 126 | return ret; |
122 | 127 | } |
123 | 128 | s->fd = fd; |
129 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | |
130 | + s->aligned_buf = NULL; | |
131 | + if (flags & BDRV_O_DIRECT) { | |
132 | + s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE); | |
133 | + if (s->aligned_buf == NULL) { | |
134 | + ret = -errno; | |
135 | + close(fd); | |
136 | + return ret; | |
137 | + } | |
138 | + } | |
139 | +#endif | |
124 | 140 | return 0; |
125 | 141 | } |
126 | 142 | |
... | ... | @@ -141,7 +157,14 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) |
141 | 157 | #endif |
142 | 158 | */ |
143 | 159 | |
144 | -static int raw_pread(BlockDriverState *bs, int64_t offset, | |
160 | +/* | |
161 | + * offset and count are in bytes, but must be multiples of 512 for files | |
162 | + * opened with O_DIRECT. buf must be aligned to 512 bytes then. | |
163 | + * | |
164 | + * This function may be called without alignment if the caller ensures | |
165 | + * that O_DIRECT is not in effect. | |
166 | + */ | |
167 | +static int raw_pread_aligned(BlockDriverState *bs, int64_t offset, | |
145 | 168 | uint8_t *buf, int count) |
146 | 169 | { |
147 | 170 | BDRVRawState *s = bs->opaque; |
... | ... | @@ -194,7 +217,14 @@ label__raw_read__success: |
194 | 217 | return ret; |
195 | 218 | } |
196 | 219 | |
197 | -static int raw_pwrite(BlockDriverState *bs, int64_t offset, | |
220 | +/* | |
221 | + * offset and count are in bytes, but must be multiples of 512 for files | |
222 | + * opened with O_DIRECT. buf must be aligned to 512 bytes then. | |
223 | + * | |
224 | + * This function may be called without alignment if the caller ensures | |
225 | + * that O_DIRECT is not in effect. | |
226 | + */ | |
227 | +static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset, | |
198 | 228 | const uint8_t *buf, int count) |
199 | 229 | { |
200 | 230 | BDRVRawState *s = bs->opaque; |
... | ... | @@ -230,6 +260,164 @@ label__raw_write__success: |
230 | 260 | return ret; |
231 | 261 | } |
232 | 262 | |
263 | + | |
264 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | |
265 | +/* | |
266 | + * offset and count are in bytes and possibly not aligned. For files opened | |
267 | + * with O_DIRECT, necessary alignments are ensured before calling | |
268 | + * raw_pread_aligned to do the actual read. | |
269 | + */ | |
270 | +static int raw_pread(BlockDriverState *bs, int64_t offset, | |
271 | + uint8_t *buf, int count) | |
272 | +{ | |
273 | + BDRVRawState *s = bs->opaque; | |
274 | + int size, ret, shift, sum; | |
275 | + | |
276 | + sum = 0; | |
277 | + | |
278 | + if (s->aligned_buf != NULL) { | |
279 | + | |
280 | + if (offset & 0x1ff) { | |
281 | + /* align offset on a 512 bytes boundary */ | |
282 | + | |
283 | + shift = offset & 0x1ff; | |
284 | + size = (shift + count + 0x1ff) & ~0x1ff; | |
285 | + if (size > ALIGNED_BUFFER_SIZE) | |
286 | + size = ALIGNED_BUFFER_SIZE; | |
287 | + ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size); | |
288 | + if (ret < 0) | |
289 | + return ret; | |
290 | + | |
291 | + size = 512 - shift; | |
292 | + if (size > count) | |
293 | + size = count; | |
294 | + memcpy(buf, s->aligned_buf + shift, size); | |
295 | + | |
296 | + buf += size; | |
297 | + offset += size; | |
298 | + count -= size; | |
299 | + sum += size; | |
300 | + | |
301 | + if (count == 0) | |
302 | + return sum; | |
303 | + } | |
304 | + if (count & 0x1ff || (uintptr_t) buf & 0x1ff) { | |
305 | + | |
306 | + /* read on aligned buffer */ | |
307 | + | |
308 | + while (count) { | |
309 | + | |
310 | + size = (count + 0x1ff) & ~0x1ff; | |
311 | + if (size > ALIGNED_BUFFER_SIZE) | |
312 | + size = ALIGNED_BUFFER_SIZE; | |
313 | + | |
314 | + ret = raw_pread_aligned(bs, offset, s->aligned_buf, size); | |
315 | + if (ret < 0) | |
316 | + return ret; | |
317 | + | |
318 | + size = ret; | |
319 | + if (size > count) | |
320 | + size = count; | |
321 | + | |
322 | + memcpy(buf, s->aligned_buf, size); | |
323 | + | |
324 | + buf += size; | |
325 | + offset += size; | |
326 | + count -= size; | |
327 | + sum += size; | |
328 | + } | |
329 | + | |
330 | + return sum; | |
331 | + } | |
332 | + } | |
333 | + | |
334 | + return raw_pread_aligned(bs, offset, buf, count) + sum; | |
335 | +} | |
336 | + | |
337 | +/* | |
338 | + * offset and count are in bytes and possibly not aligned. For files opened | |
339 | + * with O_DIRECT, necessary alignments are ensured before calling | |
340 | + * raw_pwrite_aligned to do the actual write. | |
341 | + */ | |
342 | +static int raw_pwrite(BlockDriverState *bs, int64_t offset, | |
343 | + const uint8_t *buf, int count) | |
344 | +{ | |
345 | + BDRVRawState *s = bs->opaque; | |
346 | + int size, ret, shift, sum; | |
347 | + | |
348 | + sum = 0; | |
349 | + | |
350 | + if (s->aligned_buf != NULL) { | |
351 | + | |
352 | + if (offset & 0x1ff) { | |
353 | + /* align offset on a 512 bytes boundary */ | |
354 | + shift = offset & 0x1ff; | |
355 | + ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512); | |
356 | + if (ret < 0) | |
357 | + return ret; | |
358 | + | |
359 | + size = 512 - shift; | |
360 | + if (size > count) | |
361 | + size = count; | |
362 | + memcpy(s->aligned_buf + shift, buf, size); | |
363 | + | |
364 | + ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512); | |
365 | + if (ret < 0) | |
366 | + return ret; | |
367 | + | |
368 | + buf += size; | |
369 | + offset += size; | |
370 | + count -= size; | |
371 | + sum += size; | |
372 | + | |
373 | + if (count == 0) | |
374 | + return sum; | |
375 | + } | |
376 | + if (count & 0x1ff || (uintptr_t) buf & 0x1ff) { | |
377 | + | |
378 | + while ((size = (count & ~0x1ff)) != 0) { | |
379 | + | |
380 | + if (size > ALIGNED_BUFFER_SIZE) | |
381 | + size = ALIGNED_BUFFER_SIZE; | |
382 | + | |
383 | + memcpy(s->aligned_buf, buf, size); | |
384 | + | |
385 | + ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, size); | |
386 | + if (ret < 0) | |
387 | + return ret; | |
388 | + | |
389 | + buf += ret; | |
390 | + offset += ret; | |
391 | + count -= ret; | |
392 | + sum += ret; | |
393 | + } | |
394 | + /* here, count < 512 because (count & ~0x1ff) == 0 */ | |
395 | + if (count) { | |
396 | + ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512); | |
397 | + if (ret < 0) | |
398 | + return ret; | |
399 | + memcpy(s->aligned_buf, buf, count); | |
400 | + | |
401 | + ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512); | |
402 | + if (ret < 0) | |
403 | + return ret; | |
404 | + if (count < ret) | |
405 | + ret = count; | |
406 | + | |
407 | + sum += ret; | |
408 | + } | |
409 | + return sum; | |
410 | + } | |
411 | + } | |
412 | + return raw_pwrite_aligned(bs, offset, buf, count) + sum; | |
413 | +} | |
414 | + | |
415 | +#else | |
416 | +#define raw_pread raw_pread_aligned | |
417 | +#define raw_pwrite raw_pwrite_aligned | |
418 | +#endif | |
419 | + | |
420 | + | |
233 | 421 | /***********************************************************/ |
234 | 422 | /* Unix AIO using POSIX AIO */ |
235 | 423 | |
... | ... | @@ -237,6 +425,7 @@ typedef struct RawAIOCB { |
237 | 425 | BlockDriverAIOCB common; |
238 | 426 | struct aiocb aiocb; |
239 | 427 | struct RawAIOCB *next; |
428 | + int ret; | |
240 | 429 | } RawAIOCB; |
241 | 430 | |
242 | 431 | static int aio_sig_num = SIGUSR2; |
... | ... | @@ -397,12 +586,38 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, |
397 | 586 | return acb; |
398 | 587 | } |
399 | 588 | |
589 | +#ifndef QEMU_IMG | |
590 | +static void raw_aio_em_cb(void* opaque) | |
591 | +{ | |
592 | + RawAIOCB *acb = opaque; | |
593 | + acb->common.cb(acb->common.opaque, acb->ret); | |
594 | + qemu_aio_release(acb); | |
595 | +} | |
596 | +#endif | |
597 | + | |
400 | 598 | static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, |
401 | 599 | int64_t sector_num, uint8_t *buf, int nb_sectors, |
402 | 600 | BlockDriverCompletionFunc *cb, void *opaque) |
403 | 601 | { |
404 | 602 | RawAIOCB *acb; |
405 | 603 | |
604 | + /* | |
605 | + * If O_DIRECT is used and the buffer is not aligned fall back | |
606 | + * to synchronous IO. | |
607 | + */ | |
608 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | |
609 | + BDRVRawState *s = bs->opaque; | |
610 | + | |
611 | + if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) { | |
612 | + QEMUBH *bh; | |
613 | + acb = qemu_aio_get(bs, cb, opaque); | |
614 | + acb->ret = raw_pread(bs, 512 * sector_num, buf, 512 * nb_sectors); | |
615 | + bh = qemu_bh_new(raw_aio_em_cb, acb); | |
616 | + qemu_bh_schedule(bh); | |
617 | + return &acb->common; | |
618 | + } | |
619 | +#endif | |
620 | + | |
406 | 621 | acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); |
407 | 622 | if (!acb) |
408 | 623 | return NULL; |
... | ... | @@ -419,6 +634,23 @@ static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs, |
419 | 634 | { |
420 | 635 | RawAIOCB *acb; |
421 | 636 | |
637 | + /* | |
638 | + * If O_DIRECT is used and the buffer is not aligned fall back | |
639 | + * to synchronous IO. | |
640 | + */ | |
641 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | |
642 | + BDRVRawState *s = bs->opaque; | |
643 | + | |
644 | + if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) { | |
645 | + QEMUBH *bh; | |
646 | + acb = qemu_aio_get(bs, cb, opaque); | |
647 | + acb->ret = raw_pwrite(bs, 512 * sector_num, buf, 512 * nb_sectors); | |
648 | + bh = qemu_bh_new(raw_aio_em_cb, acb); | |
649 | + qemu_bh_schedule(bh); | |
650 | + return &acb->common; | |
651 | + } | |
652 | +#endif | |
653 | + | |
422 | 654 | acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque); |
423 | 655 | if (!acb) |
424 | 656 | return NULL; |
... | ... | @@ -462,6 +694,10 @@ static void raw_close(BlockDriverState *bs) |
462 | 694 | if (s->fd >= 0) { |
463 | 695 | close(s->fd); |
464 | 696 | s->fd = -1; |
697 | +#if defined(O_DIRECT) && !defined(QEMU_IMG) | |
698 | + if (s->aligned_buf != NULL) | |
699 | + qemu_free(s->aligned_buf); | |
700 | +#endif | |
465 | 701 | } |
466 | 702 | } |
467 | 703 | ... | ... |