Commit e976c6a1e40ad74d616a186d3b48b0ad8f5eb970

Authored by aliguori
1 parent 6db6c638

Change order of metadata update to prevent losing guest data because of unexpec…

…ted exit (Gleb Natapov)

Currently the order is this (during cow since it's the interesting case):
1. Decrement refcount of old clusters
2. Increment refcount for newly allocated clusters
3. Copy content of old sectors that will not be rewritten
4. Update L2 table with pointers to new clusters
5. Write guest data into new clusters (asynchronously)

There are several problems with this order. The first one is that if qemu
crashes (or killed or host reboots) after new clusters are linked into L2
table but before user data is written there, then on the next reboot guest
will find neither old data nor new one in those sectors and this is not
what the guest expects even when a journaling file system is in use.  The other
problem is that if qemu is killed between steps 1 and 4 then refcount
of old cluster will be incorrect and may cause snapshot corruption.

The patch changes the order to be like this:
1. Increment refcount for newly allocated clusters
2. Write guest data into new clusters (asynchronously)
3. Copy content of old sectors that were not rewritten
4. Update L2 table with pointers to new clusters
5. Decrement refcount of old clusters

Unexpected crash may cause cluster leakage, but guest data should be safe.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>



git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@5861 c046a42c-6fe2-441c-8c8c-71466251a162
Showing 1 changed file with 92 additions and 63 deletions
block-qcow2.c
@@ -856,6 +856,70 @@ static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs, @@ -856,6 +856,70 @@ static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
856 return cluster_offset; 856 return cluster_offset;
857 } 857 }
858 858
  859 +typedef struct QCowL2Meta
  860 +{
  861 + uint64_t offset;
  862 + int n_start;
  863 + int nb_available;
  864 + int nb_clusters;
  865 +} QCowL2Meta;
  866 +
  867 +static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
  868 + QCowL2Meta *m)
  869 +{
  870 + BDRVQcowState *s = bs->opaque;
  871 + int i, j = 0, l2_index, ret;
  872 + uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
  873 +
  874 + if (m->nb_clusters == 0)
  875 + return 0;
  876 +
  877 + if (!(old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t))))
  878 + return -ENOMEM;
  879 +
  880 + /* copy content of unmodified sectors */
  881 + start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
  882 + if (m->n_start) {
  883 + ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
  884 + if (ret < 0)
  885 + goto err;
  886 + }
  887 +
  888 + if (m->nb_available & (s->cluster_sectors - 1)) {
  889 + uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
  890 + ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
  891 + m->nb_available - end, s->cluster_sectors);
  892 + if (ret < 0)
  893 + goto err;
  894 + }
  895 +
  896 + ret = -EIO;
  897 + /* update L2 table */
  898 + if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index))
  899 + goto err;
  900 +
  901 + for (i = 0; i < m->nb_clusters; i++) {
  902 + if(l2_table[l2_index + i] != 0)
  903 + old_cluster[j++] = l2_table[l2_index + i];
  904 +
  905 + l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
  906 + (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
  907 + }
  908 +
  909 + if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t),
  910 + l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) !=
  911 + m->nb_clusters * sizeof(uint64_t))
  912 + goto err;
  913 +
  914 + for (i = 0; i < j; i++)
  915 + free_any_clusters(bs, old_cluster[i], 1);
  916 +
  917 + ret = 0;
  918 +err:
  919 + qemu_free(old_cluster);
  920 + return ret;
  921 + }
  922 +
859 /* 923 /*
860 * alloc_cluster_offset 924 * alloc_cluster_offset
861 * 925 *
@@ -872,13 +936,12 @@ static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs, @@ -872,13 +936,12 @@ static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
872 static uint64_t alloc_cluster_offset(BlockDriverState *bs, 936 static uint64_t alloc_cluster_offset(BlockDriverState *bs,
873 uint64_t offset, 937 uint64_t offset,
874 int n_start, int n_end, 938 int n_start, int n_end,
875 - int *num) 939 + int *num, QCowL2Meta *m)
876 { 940 {
877 BDRVQcowState *s = bs->opaque; 941 BDRVQcowState *s = bs->opaque;
878 int l2_index, ret; 942 int l2_index, ret;
879 uint64_t l2_offset, *l2_table, cluster_offset; 943 uint64_t l2_offset, *l2_table, cluster_offset;
880 - int nb_available, nb_clusters, i = 0;  
881 - uint64_t start_sect; 944 + int nb_clusters, i = 0;
882 945
883 ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index); 946 ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
884 if (ret == 0) 947 if (ret == 0)
@@ -886,8 +949,7 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs, @@ -886,8 +949,7 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
886 949
887 nb_clusters = size_to_clusters(s, n_end << 9); 950 nb_clusters = size_to_clusters(s, n_end << 9);
888 951
889 - if (nb_clusters > s->l2_size - l2_index)  
890 - nb_clusters = s->l2_size - l2_index; 952 + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
891 953
892 cluster_offset = be64_to_cpu(l2_table[l2_index]); 954 cluster_offset = be64_to_cpu(l2_table[l2_index]);
893 955
@@ -897,11 +959,8 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs, @@ -897,11 +959,8 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
897 nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size, 959 nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
898 &l2_table[l2_index], 0); 960 &l2_table[l2_index], 0);
899 961
900 - nb_available = nb_clusters << (s->cluster_bits - 9);  
901 - if (nb_available > n_end)  
902 - nb_available = n_end;  
903 -  
904 cluster_offset &= ~QCOW_OFLAG_COPIED; 962 cluster_offset &= ~QCOW_OFLAG_COPIED;
  963 + m->nb_clusters = 0;
905 964
906 goto out; 965 goto out;
907 } 966 }
@@ -914,7 +973,6 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs, @@ -914,7 +973,6 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
914 /* how many available clusters ? */ 973 /* how many available clusters ? */
915 974
916 while (i < nb_clusters) { 975 while (i < nb_clusters) {
917 - int j;  
918 i += count_contiguous_free_clusters(nb_clusters - i, 976 i += count_contiguous_free_clusters(nb_clusters - i,
919 &l2_table[l2_index + i]); 977 &l2_table[l2_index + i]);
920 978
@@ -924,14 +982,9 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs, @@ -924,14 +982,9 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
924 (cluster_offset & QCOW_OFLAG_COMPRESSED)) 982 (cluster_offset & QCOW_OFLAG_COMPRESSED))
925 break; 983 break;
926 984
927 - j = count_contiguous_clusters(nb_clusters - i, s->cluster_size, 985 + i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
928 &l2_table[l2_index + i], 0); 986 &l2_table[l2_index + i], 0);
929 987
930 - if (j)  
931 - free_any_clusters(bs, cluster_offset, j);  
932 -  
933 - i += j;  
934 -  
935 if(be64_to_cpu(l2_table[l2_index + i])) 988 if(be64_to_cpu(l2_table[l2_index + i]))
936 break; 989 break;
937 } 990 }
@@ -941,48 +994,15 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs, @@ -941,48 +994,15 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
941 994
942 cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size); 995 cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
943 996
944 - /* we must initialize the cluster content which won't be  
945 - written */  
946 -  
947 - nb_available = nb_clusters << (s->cluster_bits - 9);  
948 - if (nb_available > n_end)  
949 - nb_available = n_end;  
950 -  
951 - /* copy content of unmodified sectors */  
952 -  
953 - start_sect = (offset & ~(s->cluster_size - 1)) >> 9;  
954 - if (n_start) {  
955 - ret = copy_sectors(bs, start_sect, cluster_offset, 0, n_start);  
956 - if (ret < 0)  
957 - return 0;  
958 - }  
959 -  
960 - if (nb_available & (s->cluster_sectors - 1)) {  
961 - uint64_t end = nb_available & ~(uint64_t)(s->cluster_sectors - 1);  
962 - ret = copy_sectors(bs, start_sect + end,  
963 - cluster_offset + (end << 9),  
964 - nb_available - end,  
965 - s->cluster_sectors);  
966 - if (ret < 0)  
967 - return 0;  
968 - }  
969 -  
970 - /* update L2 table */  
971 -  
972 - for (i = 0; i < nb_clusters; i++)  
973 - l2_table[l2_index + i] = cpu_to_be64((cluster_offset +  
974 - (i << s->cluster_bits)) |  
975 - QCOW_OFLAG_COPIED);  
976 -  
977 - if (bdrv_pwrite(s->hd,  
978 - l2_offset + l2_index * sizeof(uint64_t),  
979 - l2_table + l2_index,  
980 - nb_clusters * sizeof(uint64_t)) !=  
981 - nb_clusters * sizeof(uint64_t))  
982 - return 0; 997 + /* save info needed for meta data update */
  998 + m->offset = offset;
  999 + m->n_start = n_start;
  1000 + m->nb_clusters = nb_clusters;
983 1001
984 out: 1002 out:
985 - *num = nb_available - n_start; 1003 + m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
  1004 +
  1005 + *num = m->nb_available - n_start;
986 1006
987 return cluster_offset; 1007 return cluster_offset;
988 } 1008 }
@@ -1113,6 +1133,7 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num, @@ -1113,6 +1133,7 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num,
1113 int ret, index_in_cluster, n; 1133 int ret, index_in_cluster, n;
1114 uint64_t cluster_offset; 1134 uint64_t cluster_offset;
1115 int n_end; 1135 int n_end;
  1136 + QCowL2Meta l2meta;
1116 1137
1117 while (nb_sectors > 0) { 1138 while (nb_sectors > 0) {
1118 index_in_cluster = sector_num & (s->cluster_sectors - 1); 1139 index_in_cluster = sector_num & (s->cluster_sectors - 1);
@@ -1122,7 +1143,7 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num, @@ -1122,7 +1143,7 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num,
1122 n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; 1143 n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1123 cluster_offset = alloc_cluster_offset(bs, sector_num << 9, 1144 cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
1124 index_in_cluster, 1145 index_in_cluster,
1125 - n_end, &n); 1146 + n_end, &n, &l2meta);
1126 if (!cluster_offset) 1147 if (!cluster_offset)
1127 return -1; 1148 return -1;
1128 if (s->crypt_method) { 1149 if (s->crypt_method) {
@@ -1133,8 +1154,10 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num, @@ -1133,8 +1154,10 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num,
1133 } else { 1154 } else {
1134 ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512); 1155 ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
1135 } 1156 }
1136 - if (ret != n * 512) 1157 + if (ret != n * 512 || alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) {
  1158 + free_any_clusters(bs, cluster_offset, l2meta.nb_clusters);
1137 return -1; 1159 return -1;
  1160 + }
1138 nb_sectors -= n; 1161 nb_sectors -= n;
1139 sector_num += n; 1162 sector_num += n;
1140 buf += n * 512; 1163 buf += n * 512;
@@ -1153,6 +1176,7 @@ typedef struct QCowAIOCB { @@ -1153,6 +1176,7 @@ typedef struct QCowAIOCB {
1153 uint8_t *cluster_data; 1176 uint8_t *cluster_data;
1154 BlockDriverAIOCB *hd_aiocb; 1177 BlockDriverAIOCB *hd_aiocb;
1155 QEMUBH *bh; 1178 QEMUBH *bh;
  1179 + QCowL2Meta l2meta;
1156 } QCowAIOCB; 1180 } QCowAIOCB;
1157 1181
1158 static void qcow_aio_read_cb(void *opaque, int ret); 1182 static void qcow_aio_read_cb(void *opaque, int ret);
@@ -1281,6 +1305,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, @@ -1281,6 +1305,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
1281 acb->nb_sectors = nb_sectors; 1305 acb->nb_sectors = nb_sectors;
1282 acb->n = 0; 1306 acb->n = 0;
1283 acb->cluster_offset = 0; 1307 acb->cluster_offset = 0;
  1308 + acb->l2meta.nb_clusters = 0;
1284 return acb; 1309 return acb;
1285 } 1310 }
1286 1311
@@ -1304,7 +1329,6 @@ static void qcow_aio_write_cb(void *opaque, int ret) @@ -1304,7 +1329,6 @@ static void qcow_aio_write_cb(void *opaque, int ret)
1304 BlockDriverState *bs = acb->common.bs; 1329 BlockDriverState *bs = acb->common.bs;
1305 BDRVQcowState *s = bs->opaque; 1330 BDRVQcowState *s = bs->opaque;
1306 int index_in_cluster; 1331 int index_in_cluster;
1307 - uint64_t cluster_offset;  
1308 const uint8_t *src_buf; 1332 const uint8_t *src_buf;
1309 int n_end; 1333 int n_end;
1310 1334
@@ -1317,6 +1341,11 @@ static void qcow_aio_write_cb(void *opaque, int ret) @@ -1317,6 +1341,11 @@ static void qcow_aio_write_cb(void *opaque, int ret)
1317 return; 1341 return;
1318 } 1342 }
1319 1343
  1344 + if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
  1345 + free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
  1346 + goto fail;
  1347 + }
  1348 +
1320 acb->nb_sectors -= acb->n; 1349 acb->nb_sectors -= acb->n;
1321 acb->sector_num += acb->n; 1350 acb->sector_num += acb->n;
1322 acb->buf += acb->n * 512; 1351 acb->buf += acb->n * 512;
@@ -1334,10 +1363,10 @@ static void qcow_aio_write_cb(void *opaque, int ret) @@ -1334,10 +1363,10 @@ static void qcow_aio_write_cb(void *opaque, int ret)
1334 n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) 1363 n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
1335 n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; 1364 n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1336 1365
1337 - cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9, 1366 + acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
1338 index_in_cluster, 1367 index_in_cluster,
1339 - n_end, &acb->n);  
1340 - if (!cluster_offset || (cluster_offset & 511) != 0) { 1368 + n_end, &acb->n, &acb->l2meta);
  1369 + if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
1341 ret = -EIO; 1370 ret = -EIO;
1342 goto fail; 1371 goto fail;
1343 } 1372 }
@@ -1357,7 +1386,7 @@ static void qcow_aio_write_cb(void *opaque, int ret) @@ -1357,7 +1386,7 @@ static void qcow_aio_write_cb(void *opaque, int ret)
1357 src_buf = acb->buf; 1386 src_buf = acb->buf;
1358 } 1387 }
1359 acb->hd_aiocb = bdrv_aio_write(s->hd, 1388 acb->hd_aiocb = bdrv_aio_write(s->hd,
1360 - (cluster_offset >> 9) + index_in_cluster, 1389 + (acb->cluster_offset >> 9) + index_in_cluster,
1361 src_buf, acb->n, 1390 src_buf, acb->n,
1362 qcow_aio_write_cb, acb); 1391 qcow_aio_write_cb, acb);
1363 if (acb->hd_aiocb == NULL) 1392 if (acb->hd_aiocb == NULL)