diff options
-rw-r--r-- | block.c | 12 | ||||
-rw-r--r-- | block/file-posix.c | 2 | ||||
-rw-r--r-- | block/qcow2-refcount.c | 20 | ||||
-rw-r--r-- | block/qcow2.h | 6 | ||||
-rw-r--r-- | block/vvfat.c | 6 | ||||
-rw-r--r-- | docs/interop/qcow2.txt | 38 | ||||
-rw-r--r-- | hw/block/fdc.c | 2 | ||||
-rw-r--r-- | hw/block/nvme.c | 7 | ||||
-rwxr-xr-x | tests/qemu-iotests/182 | 71 | ||||
-rw-r--r-- | tests/qemu-iotests/182.out | 9 | ||||
-rwxr-xr-x | tests/qemu-iotests/220 | 96 | ||||
-rw-r--r-- | tests/qemu-iotests/220.out | 54 | ||||
-rw-r--r-- | tests/qemu-iotests/group | 1 |
13 files changed, 310 insertions, 14 deletions
@@ -3201,6 +3201,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, QDict *orig_reopen_opts; char *discard = NULL; bool read_only; + bool drv_prepared = false; assert(reopen_state != NULL); assert(reopen_state->bs->drv != NULL); @@ -3285,6 +3286,8 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, goto error; } + drv_prepared = true; + /* Options that are not handled are only okay if they are unchanged * compared to the old state. It is expected that some options are only * used for the initial open, but not reopen (e.g. filename) */ @@ -3350,6 +3353,15 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, reopen_state->options = qobject_ref(orig_reopen_opts); error: + if (ret < 0 && drv_prepared) { + /* drv->bdrv_reopen_prepare() has succeeded, so we need to + * call drv->bdrv_reopen_abort() before signaling an error + * (bdrv_reopen_multiple() will not call bdrv_reopen_abort() + * when the respective bdrv_reopen_prepare() has failed) */ + if (drv->bdrv_reopen_abort) { + drv->bdrv_reopen_abort(reopen_state); + } + } qemu_opts_del(opts); qobject_unref(orig_reopen_opts); g_free(discard); diff --git a/block/file-posix.c b/block/file-posix.c index 58c86a01ea..07bbdab953 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -959,7 +959,7 @@ static void raw_reopen_commit(BDRVReopenState *state) /* Copy locks to the new fd before closing the old one. */ raw_apply_lock_bytes(NULL, rs->fd, s->locked_perm, - ~s->locked_shared_perm, false, &local_err); + s->locked_shared_perm, false, &local_err); if (local_err) { /* shouldn't fail in a sane host, but report it just in case. */ error_report_err(local_err); diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 46082aeac1..1c63ac244a 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -31,7 +31,8 @@ #include "qemu/bswap.h" #include "qemu/cutils.h" -static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size, + uint64_t max); static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int64_t offset, int64_t length, uint64_t addend, bool decrease, enum qcow2_discard_type type); @@ -362,7 +363,7 @@ static int alloc_refcount_block(BlockDriverState *bs, } /* Allocate the refcount block itself and mark it as used */ - int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); + int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX); if (new_block < 0) { return new_block; } @@ -954,7 +955,8 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs, /* return < 0 if error */ -static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size, + uint64_t max) { BDRVQcow2State *s = bs->opaque; uint64_t i, nb_clusters, refcount; @@ -979,9 +981,9 @@ retry: } /* Make sure that all offsets in the "allocated" range are representable - * in an int64_t */ + * in the requested max */ if (s->free_cluster_index > 0 && - s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits)) + s->free_cluster_index - 1 > (max >> s->cluster_bits)) { return -EFBIG; } @@ -1001,7 +1003,7 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); do { - offset = alloc_clusters_noref(bs, size); + offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET); if (offset < 0) { return offset; } @@ -1083,7 +1085,11 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) free_in_cluster = s->cluster_size - offset_into_cluster(s, offset); do { if (!offset || free_in_cluster < size) { - int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size); + int64_t new_cluster; + + new_cluster = alloc_clusters_noref(bs, s->cluster_size, + MIN(s->cluster_offset_mask, + QCOW_MAX_CLUSTER_OFFSET)); if (new_cluster < 0) { return new_cluster; } diff --git a/block/qcow2.h b/block/qcow2.h index 29c98d87a0..8662b68575 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -42,6 +42,12 @@ #define QCOW_MAX_CRYPT_CLUSTERS 32 #define QCOW_MAX_SNAPSHOTS 65536 +/* Field widths in qcow2 mean normal cluster offsets cannot reach + * 64PB; depending on cluster size, compressed clusters can have a + * smaller limit (64PB for up to 16k clusters, then ramps down to + * 512TB for 2M clusters). */ +#define QCOW_MAX_CLUSTER_OFFSET ((1ULL << 56) - 1) + /* 8 MB refcount table is enough for 2 PB images at 64k cluster size * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ #define QCOW_MAX_REFTABLE_SIZE S_8MiB diff --git a/block/vvfat.c b/block/vvfat.c index 1de5de1db4..b7b61ea8b7 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -2510,7 +2510,7 @@ static int commit_one_file(BDRVVVFATState* s, uint32_t first_cluster = c; mapping_t* mapping = find_mapping_for_cluster(s, c); uint32_t size = filesize_of_direntry(direntry); - char* cluster = g_malloc(s->cluster_size); + char *cluster; uint32_t i; int fd = 0; @@ -2528,17 +2528,17 @@ static int commit_one_file(BDRVVVFATState* s, if (fd < 0) { fprintf(stderr, "Could not open %s... (%s, %d)\n", mapping->path, strerror(errno), errno); - g_free(cluster); return fd; } if (offset > 0) { if (lseek(fd, offset, SEEK_SET) != offset) { qemu_close(fd); - g_free(cluster); return -3; } } + cluster = g_malloc(s->cluster_size); + while (offset < size) { uint32_t c1; int rest_size = (size - offset > s->cluster_size ? diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt index 845d40a086..fb5cb47245 100644 --- a/docs/interop/qcow2.txt +++ b/docs/interop/qcow2.txt @@ -40,7 +40,18 @@ The first cluster of a qcow2 image contains the file header: with larger cluster sizes. 24 - 31: size - Virtual disk size in bytes + Virtual disk size in bytes. + + Note: qemu has an implementation limit of 32 MB as + the maximum L1 table size. With a 2 MB cluster + size, it is unable to populate a virtual cluster + beyond 2 EB (61 bits); with a 512 byte cluster + size, it is unable to populate a virtual size + larger than 128 GB (37 bits). Meanwhile, L1/L2 + table layouts limit an image to no more than 64 PB + (56 bits) of populated clusters, and an image may + hit other limits first (such as a file system's + maximum size). 32 - 35: crypt_method 0 for no encryption @@ -326,6 +337,17 @@ in the image file. It contains pointers to the second level structures which are called refcount blocks and are exactly one cluster in size. +Although a large enough refcount table can reserve clusters past 64 PB +(56 bits) (assuming the underlying protocol can even be sized that +large), note that some qcow2 metadata such as L1/L2 tables must point +to clusters prior to that point. + +Note: qemu has an implementation limit of 8 MB as the maximum refcount +table size. With a 2 MB cluster size and a default refcount_order of +4, it is unable to reference host resources beyond 2 EB (61 bits); in +the worst case, with a 512 cluster size and refcount_order of 6, it is +unable to access beyond 32 GB (35 bits). + Given an offset into the image file, the refcount of its cluster can be obtained as follows: @@ -365,6 +387,16 @@ The L1 table has a variable size (stored in the header) and may use multiple clusters, however it must be contiguous in the image file. L2 tables are exactly one cluster in size. +The L1 and L2 tables have implications on the maximum virtual file +size; for a given L1 table size, a larger cluster size is required for +the guest to have access to more space. Furthermore, a virtual +cluster must currently map to a host offset below 64 PB (56 bits) +(although this limit could be relaxed by putting reserved bits into +use). Additionally, as cluster size increases, the maximum host +offset for a compressed cluster is reduced (a 2M cluster size requires +compressed clusters to reside below 512 TB (49 bits), and this limit +cannot be relaxed without an incompatible layout change). + Given an offset into the virtual disk, the offset into the image file can be obtained as follows: @@ -427,7 +459,9 @@ Standard Cluster Descriptor: Compressed Clusters Descriptor (x = 62 - (cluster_bits - 8)): Bit 0 - x-1: Host cluster offset. This is usually _not_ aligned to a - cluster or sector boundary! + cluster or sector boundary! If cluster_bits is + small enough that this field includes bits beyond + 55, those upper bits must be set to 0. x - 61: Number of additional 512-byte sectors used for the compressed data, beyond the sector containing the offset diff --git a/hw/block/fdc.c b/hw/block/fdc.c index 2e9c1e1e2f..6f19f127a5 100644 --- a/hw/block/fdc.c +++ b/hw/block/fdc.c @@ -1617,7 +1617,7 @@ static void fdctrl_stop_transfer(FDCtrl *fdctrl, uint8_t status0, fdctrl->fifo[5] = cur_drv->sect; fdctrl->fifo[6] = FD_SECTOR_SC; fdctrl->data_dir = FD_DIR_READ; - if (!(fdctrl->msr & FD_MSR_NONDMA)) { + if (fdctrl->dma_chann != -1 && !(fdctrl->msr & FD_MSR_NONDMA)) { IsaDmaClass *k = ISADMA_GET_CLASS(fdctrl->dma); k->release_DREQ(fdctrl->dma, fdctrl->dma_chann); } diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 09d7c90259..d0226e7fdc 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1175,6 +1175,10 @@ static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { NvmeCtrl *n = (NvmeCtrl *)opaque; + + if (addr + size > NVME_CMBSZ_GETSIZE(n->bar.cmbsz)) { + return; + } memcpy(&n->cmbuf[addr], &data, size); } @@ -1183,6 +1187,9 @@ static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size) uint64_t val; NvmeCtrl *n = (NvmeCtrl *)opaque; + if (addr + size > NVME_CMBSZ_GETSIZE(n->bar.cmbsz)) { + return 0; + } memcpy(&val, &n->cmbuf[addr], size); return val; } diff --git a/tests/qemu-iotests/182 b/tests/qemu-iotests/182 index 4b31592fb8..3b7689c1d5 100755 --- a/tests/qemu-iotests/182 +++ b/tests/qemu-iotests/182 @@ -31,6 +31,7 @@ status=1 # failure is the default! _cleanup() { _cleanup_test_img + rm -f "$TEST_IMG.overlay" } trap "_cleanup; exit \$status" 0 1 2 3 15 @@ -71,6 +72,76 @@ echo 'quit' | $QEMU -nographic -monitor stdio \ _cleanup_qemu +echo +echo '=== Testing reopen ===' +echo + +# This tests that reopening does not unshare any permissions it should +# not unshare +# (There was a bug where reopening shared exactly the opposite of the +# permissions it was supposed to share) + +_launch_qemu + +_send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'qmp_capabilities'}" \ + 'return' + +# Open the image without any format layer (we are not going to access +# it, so that is fine) +# This should keep all permissions shared. +success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'blockdev-add', + 'arguments': { + 'node-name': 'node0', + 'driver': 'file', + 'filename': '$TEST_IMG', + 'locking': 'on' + } }" \ + 'return' \ + 'error' + +# This snapshot will perform a reopen to drop R/W to RO. +# It should still keep all permissions shared. +success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'blockdev-snapshot-sync', + 'arguments': { + 'node-name': 'node0', + 'snapshot-file': '$TEST_IMG.overlay', + 'snapshot-node-name': 'node1' + } }" \ + 'return' \ + 'error' + +# Now open the same file again +# This does not require any permissions (and does not unshare any), so +# this will not conflict with node0. +success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'blockdev-add', + 'arguments': { + 'node-name': 'node1', + 'driver': 'file', + 'filename': '$TEST_IMG', + 'locking': 'on' + } }" \ + 'return' \ + 'error' + +# Now we attach the image to a virtio-blk device. This device does +# require some permissions (at least WRITE and READ_CONSISTENT), so if +# reopening node0 unshared any (which it should not have), this will +# fail (but it should not). +success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'device_add', + 'arguments': { + 'driver': 'virtio-blk', + 'drive': 'node1' + } }" \ + 'return' \ + 'error' + +_cleanup_qemu + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/182.out b/tests/qemu-iotests/182.out index f1463c8862..af501ca3f3 100644 --- a/tests/qemu-iotests/182.out +++ b/tests/qemu-iotests/182.out @@ -5,4 +5,13 @@ Starting QEMU Starting a second QEMU using the same image should fail QEMU_PROG: -drive file=TEST_DIR/t.qcow2,if=none,id=drive0,file.locking=on: Failed to get "write" lock Is another process using the image [TEST_DIR/t.qcow2]? + +=== Testing reopen === + +{"return": {}} +{"return": {}} +Formatting 'TEST_DIR/t.qcow2.overlay', fmt=qcow2 size=197120 backing_file=TEST_DIR/t.qcow2 backing_fmt=file cluster_size=65536 lazy_refcounts=off refcount_bits=16 +{"return": {}} +{"return": {}} +{"return": {}} *** done diff --git a/tests/qemu-iotests/220 b/tests/qemu-iotests/220 new file mode 100755 index 0000000000..0c5682bda0 --- /dev/null +++ b/tests/qemu-iotests/220 @@ -0,0 +1,96 @@ +#!/bin/bash +# +# max limits on compression in huge qcow2 files +# +# Copyright (C) 2018 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +seq=$(basename $0) +echo "QA output created by $seq" + +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter +. ./common.pattern + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux + +echo "== Creating huge file ==" + +# Sanity check: We require a file system that permits the creation +# of a HUGE (but very sparse) file. tmpfs works, ext4 does not. +if ! truncate --size=513T "$TEST_IMG"; then + _notrun "file system on $TEST_DIR does not support large enough files" +fi +rm "$TEST_IMG" +IMGOPTS='cluster_size=2M,refcount_bits=1' _make_test_img 513T + +echo "== Populating refcounts ==" +# We want an image with 256M refcounts * 2M clusters = 512T referenced. +# Each 2M cluster holds 16M refcounts; the refcount table initially uses +# 1 refblock, so we need to add 15 more. The refcount table lives at 2M, +# first refblock at 4M, L2 at 6M, so our remaining additions start at 8M. +# Then, for each refblock, mark it as fully populated. +to_hex() { + printf %016x\\n $1 | sed 's/\(..\)/\\x\1/g' +} +truncate --size=38m "$TEST_IMG" +entry=$((0x200000)) +$QEMU_IO_PROG -f raw -c "w -P 0xff 4m 2m" "$TEST_IMG" | _filter_qemu_io +for i in {1..15}; do + offs=$((0x600000 + i*0x200000)) + poke_file "$TEST_IMG" $((i*8 + entry)) $(to_hex $offs) + $QEMU_IO_PROG -f raw -c "w -P 0xff $offs 2m" "$TEST_IMG" | _filter_qemu_io +done + +echo "== Checking file before ==" +# FIXME: 'qemu-img check' doesn't diagnose refcounts beyond the end of +# the file as leaked clusters +_check_test_img 2>&1 | sed '/^Leaked cluster/d' +stat -c 'image size %s' "$TEST_IMG" + +echo "== Trying to write compressed cluster ==" +# Given our file size, the next available cluster at 512T lies beyond the +# maximum offset that a compressed 2M cluster can reside in +$QEMU_IO_PROG -c 'w -c 0 2m' "$TEST_IMG" | _filter_qemu_io +# The attempt failed, but ended up allocating a new refblock +stat -c 'image size %s' "$TEST_IMG" + +echo "== Writing normal cluster ==" +# The failed write should not corrupt the image, so a normal write succeeds +$QEMU_IO_PROG -c 'w 0 2m' "$TEST_IMG" | _filter_qemu_io + +echo "== Checking file after ==" +# qemu-img now sees the millions of leaked clusters, thanks to the allocations +# at 512T. Undo many of our faked references to speed up the check. +$QEMU_IO_PROG -f raw -c "w -z 5m 1m" -c "w -z 8m 30m" "$TEST_IMG" | + _filter_qemu_io +_check_test_img 2>&1 | sed '/^Leaked cluster/d' + +# success, all done +echo "*** done" +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/220.out b/tests/qemu-iotests/220.out new file mode 100644 index 0000000000..af3021fd88 --- /dev/null +++ b/tests/qemu-iotests/220.out @@ -0,0 +1,54 @@ +QA output created by 220 +== Creating huge file == +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=564049465049088 +== Populating refcounts == +wrote 2097152/2097152 bytes at offset 4194304 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 8388608 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 10485760 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 12582912 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 14680064 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 16777216 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 18874368 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 20971520 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 23068672 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 25165824 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 27262976 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 29360128 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 31457280 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 33554432 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 35651584 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 37748736 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +== Checking file before == +No errors were found on the image. +image size 39845888 +== Trying to write compressed cluster == +write failed: Input/output error +image size 562949957615616 +== Writing normal cluster == +wrote 2097152/2097152 bytes at offset 0 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +== Checking file after == +wrote 1048576/1048576 bytes at offset 5242880 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 31457280/31457280 bytes at offset 8388608 +30 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +8388589 leaked clusters were found on the image. +This means waste of disk space, but no harm to data. +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index ebe4fe78bc..4d194716f2 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -219,6 +219,7 @@ 217 rw auto quick 218 rw auto quick 219 rw auto +220 rw auto 221 rw auto quick 222 rw auto quick 223 rw auto quick |