Commit e976c6a1e40ad74d616a186d3b48b0ad8f5eb970

Authored by aliguori
1 parent 6db6c638

Change order of metadata update to prevent losing guest data because of unexpec…

…ted exit (Gleb Natapov)

Currently the order is this (during cow since it's the interesting case):
1. Decrement refcount of old clusters
2. Increment refcount for newly allocated clusters
3. Copy content of old sectors that will not be rewritten
4. Update L2 table with pointers to new clusters
5. Write guest data into new clusters (asynchronously)

There are several problems with this order. The first one is that if qemu
crashes (or killed or host reboots) after new clusters are linked into L2
table but before user data is written there, then on the next reboot the guest
will find neither the old data nor the new data in those sectors, and this is not
what the guest expects even when a journaling file system is in use.  The other
problem is that if qemu is killed between steps 1 and 4 then refcount
of old cluster will be incorrect and may cause snapshot corruption.

The patch changes the order to be like this:
1. Increment refcount for newly allocated clusters
2. Write guest data into new clusters (asynchronously)
3. Copy content of old sectors that were not rewritten
4. Update L2 table with pointers to new clusters
5. Decrement refcount of old clusters

Unexpected crash may cause cluster leakage, but guest data should be safe.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>



git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@5861 c046a42c-6fe2-441c-8c8c-71466251a162
Showing 1 changed file with 92 additions and 63 deletions
block-qcow2.c
... ... @@ -856,6 +856,70 @@ static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
856 856 return cluster_offset;
857 857 }
858 858  
  859 +typedef struct QCowL2Meta
  860 +{
  861 + uint64_t offset;
  862 + int n_start;
  863 + int nb_available;
  864 + int nb_clusters;
  865 +} QCowL2Meta;
  866 +
  867 +static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
  868 + QCowL2Meta *m)
  869 +{
  870 + BDRVQcowState *s = bs->opaque;
  871 + int i, j = 0, l2_index, ret;
  872 + uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
  873 +
  874 + if (m->nb_clusters == 0)
  875 + return 0;
  876 +
  877 + if (!(old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t))))
  878 + return -ENOMEM;
  879 +
  880 + /* copy content of unmodified sectors */
  881 + start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
  882 + if (m->n_start) {
  883 + ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
  884 + if (ret < 0)
  885 + goto err;
  886 + }
  887 +
  888 + if (m->nb_available & (s->cluster_sectors - 1)) {
  889 + uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
  890 + ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
  891 + m->nb_available - end, s->cluster_sectors);
  892 + if (ret < 0)
  893 + goto err;
  894 + }
  895 +
  896 + ret = -EIO;
  897 + /* update L2 table */
  898 + if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index))
  899 + goto err;
  900 +
  901 + for (i = 0; i < m->nb_clusters; i++) {
  902 + if(l2_table[l2_index + i] != 0)
  903 + old_cluster[j++] = l2_table[l2_index + i];
  904 +
  905 + l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
  906 + (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
  907 + }
  908 +
  909 + if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t),
  910 + l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) !=
  911 + m->nb_clusters * sizeof(uint64_t))
  912 + goto err;
  913 +
  914 + for (i = 0; i < j; i++)
  915 + free_any_clusters(bs, old_cluster[i], 1);
  916 +
  917 + ret = 0;
  918 +err:
  919 + qemu_free(old_cluster);
  920 + return ret;
  921 + }
  922 +
859 923 /*
860 924 * alloc_cluster_offset
861 925 *
... ... @@ -872,13 +936,12 @@ static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
872 936 static uint64_t alloc_cluster_offset(BlockDriverState *bs,
873 937 uint64_t offset,
874 938 int n_start, int n_end,
875   - int *num)
  939 + int *num, QCowL2Meta *m)
876 940 {
877 941 BDRVQcowState *s = bs->opaque;
878 942 int l2_index, ret;
879 943 uint64_t l2_offset, *l2_table, cluster_offset;
880   - int nb_available, nb_clusters, i = 0;
881   - uint64_t start_sect;
  944 + int nb_clusters, i = 0;
882 945  
883 946 ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
884 947 if (ret == 0)
... ... @@ -886,8 +949,7 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
886 949  
887 950 nb_clusters = size_to_clusters(s, n_end << 9);
888 951  
889   - if (nb_clusters > s->l2_size - l2_index)
890   - nb_clusters = s->l2_size - l2_index;
  952 + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
891 953  
892 954 cluster_offset = be64_to_cpu(l2_table[l2_index]);
893 955  
... ... @@ -897,11 +959,8 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
897 959 nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
898 960 &l2_table[l2_index], 0);
899 961  
900   - nb_available = nb_clusters << (s->cluster_bits - 9);
901   - if (nb_available > n_end)
902   - nb_available = n_end;
903   -
904 962 cluster_offset &= ~QCOW_OFLAG_COPIED;
  963 + m->nb_clusters = 0;
905 964  
906 965 goto out;
907 966 }
... ... @@ -914,7 +973,6 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
914 973 /* how many available clusters ? */
915 974  
916 975 while (i < nb_clusters) {
917   - int j;
918 976 i += count_contiguous_free_clusters(nb_clusters - i,
919 977 &l2_table[l2_index + i]);
920 978  
... ... @@ -924,14 +982,9 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
924 982 (cluster_offset & QCOW_OFLAG_COMPRESSED))
925 983 break;
926 984  
927   - j = count_contiguous_clusters(nb_clusters - i, s->cluster_size,
  985 + i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
928 986 &l2_table[l2_index + i], 0);
929 987  
930   - if (j)
931   - free_any_clusters(bs, cluster_offset, j);
932   -
933   - i += j;
934   -
935 988 if(be64_to_cpu(l2_table[l2_index + i]))
936 989 break;
937 990 }
... ... @@ -941,48 +994,15 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
941 994  
942 995 cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
943 996  
944   - /* we must initialize the cluster content which won't be
945   - written */
946   -
947   - nb_available = nb_clusters << (s->cluster_bits - 9);
948   - if (nb_available > n_end)
949   - nb_available = n_end;
950   -
951   - /* copy content of unmodified sectors */
952   -
953   - start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
954   - if (n_start) {
955   - ret = copy_sectors(bs, start_sect, cluster_offset, 0, n_start);
956   - if (ret < 0)
957   - return 0;
958   - }
959   -
960   - if (nb_available & (s->cluster_sectors - 1)) {
961   - uint64_t end = nb_available & ~(uint64_t)(s->cluster_sectors - 1);
962   - ret = copy_sectors(bs, start_sect + end,
963   - cluster_offset + (end << 9),
964   - nb_available - end,
965   - s->cluster_sectors);
966   - if (ret < 0)
967   - return 0;
968   - }
969   -
970   - /* update L2 table */
971   -
972   - for (i = 0; i < nb_clusters; i++)
973   - l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
974   - (i << s->cluster_bits)) |
975   - QCOW_OFLAG_COPIED);
976   -
977   - if (bdrv_pwrite(s->hd,
978   - l2_offset + l2_index * sizeof(uint64_t),
979   - l2_table + l2_index,
980   - nb_clusters * sizeof(uint64_t)) !=
981   - nb_clusters * sizeof(uint64_t))
982   - return 0;
  997 + /* save info needed for meta data update */
  998 + m->offset = offset;
  999 + m->n_start = n_start;
  1000 + m->nb_clusters = nb_clusters;
983 1001  
984 1002 out:
985   - *num = nb_available - n_start;
  1003 + m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
  1004 +
  1005 + *num = m->nb_available - n_start;
986 1006  
987 1007 return cluster_offset;
988 1008 }
... ... @@ -1113,6 +1133,7 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num,
1113 1133 int ret, index_in_cluster, n;
1114 1134 uint64_t cluster_offset;
1115 1135 int n_end;
  1136 + QCowL2Meta l2meta;
1116 1137  
1117 1138 while (nb_sectors > 0) {
1118 1139 index_in_cluster = sector_num & (s->cluster_sectors - 1);
... ... @@ -1122,7 +1143,7 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num,
1122 1143 n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1123 1144 cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
1124 1145 index_in_cluster,
1125   - n_end, &n);
  1146 + n_end, &n, &l2meta);
1126 1147 if (!cluster_offset)
1127 1148 return -1;
1128 1149 if (s->crypt_method) {
... ... @@ -1133,8 +1154,10 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num,
1133 1154 } else {
1134 1155 ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
1135 1156 }
1136   - if (ret != n * 512)
  1157 + if (ret != n * 512 || alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) {
  1158 + free_any_clusters(bs, cluster_offset, l2meta.nb_clusters);
1137 1159 return -1;
  1160 + }
1138 1161 nb_sectors -= n;
1139 1162 sector_num += n;
1140 1163 buf += n * 512;
... ... @@ -1153,6 +1176,7 @@ typedef struct QCowAIOCB {
1153 1176 uint8_t *cluster_data;
1154 1177 BlockDriverAIOCB *hd_aiocb;
1155 1178 QEMUBH *bh;
  1179 + QCowL2Meta l2meta;
1156 1180 } QCowAIOCB;
1157 1181  
1158 1182 static void qcow_aio_read_cb(void *opaque, int ret);
... ... @@ -1281,6 +1305,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
1281 1305 acb->nb_sectors = nb_sectors;
1282 1306 acb->n = 0;
1283 1307 acb->cluster_offset = 0;
  1308 + acb->l2meta.nb_clusters = 0;
1284 1309 return acb;
1285 1310 }
1286 1311  
... ... @@ -1304,7 +1329,6 @@ static void qcow_aio_write_cb(void *opaque, int ret)
1304 1329 BlockDriverState *bs = acb->common.bs;
1305 1330 BDRVQcowState *s = bs->opaque;
1306 1331 int index_in_cluster;
1307   - uint64_t cluster_offset;
1308 1332 const uint8_t *src_buf;
1309 1333 int n_end;
1310 1334  
... ... @@ -1317,6 +1341,11 @@ static void qcow_aio_write_cb(void *opaque, int ret)
1317 1341 return;
1318 1342 }
1319 1343  
  1344 + if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
  1345 + free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
  1346 + goto fail;
  1347 + }
  1348 +
1320 1349 acb->nb_sectors -= acb->n;
1321 1350 acb->sector_num += acb->n;
1322 1351 acb->buf += acb->n * 512;
... ... @@ -1334,10 +1363,10 @@ static void qcow_aio_write_cb(void *opaque, int ret)
1334 1363 n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
1335 1364 n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1336 1365  
1337   - cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
  1366 + acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
1338 1367 index_in_cluster,
1339   - n_end, &acb->n);
1340   - if (!cluster_offset || (cluster_offset & 511) != 0) {
  1368 + n_end, &acb->n, &acb->l2meta);
  1369 + if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
1341 1370 ret = -EIO;
1342 1371 goto fail;
1343 1372 }
... ... @@ -1357,7 +1386,7 @@ static void qcow_aio_write_cb(void *opaque, int ret)
1357 1386 src_buf = acb->buf;
1358 1387 }
1359 1388 acb->hd_aiocb = bdrv_aio_write(s->hd,
1360   - (cluster_offset >> 9) + index_in_cluster,
  1389 + (acb->cluster_offset >> 9) + index_in_cluster,
1361 1390 src_buf, acb->n,
1362 1391 qcow_aio_write_cb, acb);
1363 1392 if (acb->hd_aiocb == NULL)
... ...