13 files changed, 310 insertions, 14 deletions
diff --git a/block.c b/block.c
index fd67e14dfa..3feac08535 100644
--- a/block.c
+++ b/block.c
@@ -3201,6 +3201,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
     QDict *orig_reopen_opts;
     char *discard = NULL;
     bool read_only;
+    bool drv_prepared = false;
 
     assert(reopen_state != NULL);
     assert(reopen_state->bs->drv != NULL);
@@ -3285,6 +3286,8 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
         goto error;
     }
 
+    drv_prepared = true;
+
     /* Options that are not handled are only okay if they are unchanged
      * compared to the old state. It is expected that some options are only
      * used for the initial open, but not reopen (e.g. filename) */
@@ -3350,6 +3353,15 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
     reopen_state->options = qobject_ref(orig_reopen_opts);
 
 error:
+    if (ret < 0 && drv_prepared) {
+        /* drv->bdrv_reopen_prepare() has succeeded, so we need to
+         * call drv->bdrv_reopen_abort() before signaling an error
+         * (bdrv_reopen_multiple() will not call bdrv_reopen_abort()
+         * when the respective bdrv_reopen_prepare() has failed) */
+        if (drv->bdrv_reopen_abort) {
+            drv->bdrv_reopen_abort(reopen_state);
+        }
+    }
     qemu_opts_del(opts);
     qobject_unref(orig_reopen_opts);
     g_free(discard);
diff --git a/block/file-posix.c b/block/file-posix.c
index 58c86a01ea..07bbdab953 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -959,7 +959,7 @@ static void raw_reopen_commit(BDRVReopenState *state)
 
     /* Copy locks to the new fd before closing the old one. */
     raw_apply_lock_bytes(NULL, rs->fd, s->locked_perm,
-                         ~s->locked_shared_perm, false, &local_err);
+                         s->locked_shared_perm, false, &local_err);
     if (local_err) {
         /* shouldn't fail in a sane host, but report it just in case. */
         error_report_err(local_err);
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 46082aeac1..1c63ac244a 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -31,7 +31,8 @@
 #include "qemu/bswap.h"
 #include "qemu/cutils.h"
 
-static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
+static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
+                                    uint64_t max);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                             int64_t offset, int64_t length, uint64_t addend,
                             bool decrease, enum qcow2_discard_type type);
@@ -362,7 +363,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
     }
 
     /* Allocate the refcount block itself and mark it as used */
-    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
+    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
     if (new_block < 0) {
         return new_block;
     }
@@ -954,7 +955,8 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs,
 
 
 /* return < 0 if error */
-static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
+static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
+                                    uint64_t max)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t i, nb_clusters, refcount;
@@ -979,9 +981,9 @@ retry:
     }
 
     /* Make sure that all offsets in the "allocated" range are representable
-     * in an int64_t */
+     * in the requested max */
     if (s->free_cluster_index > 0 &&
-        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
+        s->free_cluster_index - 1 > (max >> s->cluster_bits))
     {
         return -EFBIG;
     }
@@ -1001,7 +1003,7 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
 
     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
     do {
-        offset = alloc_clusters_noref(bs, size);
+        offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
         if (offset < 0) {
             return offset;
         }
@@ -1083,7 +1085,11 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
     free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
     do {
         if (!offset || free_in_cluster < size) {
-            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
+            int64_t new_cluster;
+
+            new_cluster = alloc_clusters_noref(bs, s->cluster_size,
+                                               MIN(s->cluster_offset_mask,
+                                                   QCOW_MAX_CLUSTER_OFFSET));
             if (new_cluster < 0) {
                 return new_cluster;
             }
diff --git a/block/qcow2.h b/block/qcow2.h
index 29c98d87a0..8662b68575 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -42,6 +42,12 @@
 #define QCOW_MAX_CRYPT_CLUSTERS 32
 #define QCOW_MAX_SNAPSHOTS 65536
 
+/* Field widths in qcow2 mean normal cluster offsets cannot reach
+ * 64PB; depending on cluster size, compressed clusters can have a
+ * smaller limit (64PB for up to 16k clusters, then ramps down to
+ * 512TB for 2M clusters).  */
+#define QCOW_MAX_CLUSTER_OFFSET ((1ULL << 56) - 1)
+
 /* 8 MB refcount table is enough for 2 PB images at 64k cluster size
  * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */
 #define QCOW_MAX_REFTABLE_SIZE S_8MiB
diff --git a/block/vvfat.c b/block/vvfat.c
index 1de5de1db4..b7b61ea8b7 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2510,7 +2510,7 @@ static int commit_one_file(BDRVVVFATState* s,
     uint32_t first_cluster = c;
     mapping_t* mapping = find_mapping_for_cluster(s, c);
     uint32_t size = filesize_of_direntry(direntry);
-    char* cluster = g_malloc(s->cluster_size);
+    char *cluster;
     uint32_t i;
     int fd = 0;
 
@@ -2528,17 +2528,17 @@ static int commit_one_file(BDRVVVFATState* s,
     if (fd < 0) {
         fprintf(stderr, "Could not open %s... (%s, %d)\n", mapping->path,
                 strerror(errno), errno);
-        g_free(cluster);
         return fd;
     }
     if (offset > 0) {
         if (lseek(fd, offset, SEEK_SET) != offset) {
             qemu_close(fd);
-            g_free(cluster);
             return -3;
         }
     }
 
+    cluster = g_malloc(s->cluster_size);
+
     while (offset < size) {
         uint32_t c1;
         int rest_size = (size - offset > s->cluster_size ?
diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt
index 845d40a086..fb5cb47245 100644
--- a/docs/interop/qcow2.txt
+++ b/docs/interop/qcow2.txt
@@ -40,7 +40,18 @@ The first cluster of a qcow2 image contains the file header:
                     with larger cluster sizes.
 
          24 - 31:   size
-                    Virtual disk size in bytes
+                    Virtual disk size in bytes.
+
+                    Note: qemu has an implementation limit of 32 MB as
+                    the maximum L1 table size.  With a 2 MB cluster
+                    size, it is unable to populate a virtual cluster
+                    beyond 2 EB (61 bits); with a 512 byte cluster
+                    size, it is unable to populate a virtual size
+                    larger than 128 GB (37 bits).  Meanwhile, L1/L2
+                    table layouts limit an image to no more than 64 PB
+                    (56 bits) of populated clusters, and an image may
+                    hit other limits first (such as a file system's
+                    maximum size).
 
          32 - 35:   crypt_method
                     0 for no encryption
@@ -326,6 +337,17 @@ in the image file.
 It contains pointers to the second level structures which are called refcount
 blocks and are exactly one cluster in size.
 
+Although a large enough refcount table can reserve clusters past 64 PB
+(56 bits) (assuming the underlying protocol can even be sized that
+large), note that some qcow2 metadata such as L1/L2 tables must point
+to clusters prior to that point.
+
+Note: qemu has an implementation limit of 8 MB as the maximum refcount
+table size.  With a 2 MB cluster size and a default refcount_order of
+4, it is unable to reference host resources beyond 2 EB (61 bits); in
+the worst case, with a 512 cluster size and refcount_order of 6, it is
+unable to access beyond 32 GB (35 bits).
+
 Given an offset into the image file, the refcount of its cluster can be
 obtained as follows:
 
@@ -365,6 +387,16 @@ The L1 table has a variable size (stored in the header) and may use multiple
 clusters, however it must be contiguous in the image file. L2 tables are
 exactly one cluster in size.
 
+The L1 and L2 tables have implications on the maximum virtual file
+size; for a given L1 table size, a larger cluster size is required for
+the guest to have access to more space.  Furthermore, a virtual
+cluster must currently map to a host offset below 64 PB (56 bits)
+(although this limit could be relaxed by putting reserved bits into
+use).  Additionally, as cluster size increases, the maximum host
+offset for a compressed cluster is reduced (a 2M cluster size requires
+compressed clusters to reside below 512 TB (49 bits), and this limit
+cannot be relaxed without an incompatible layout change).
+
 Given an offset into the virtual disk, the offset into the image file can be
 obtained as follows:
 
@@ -427,7 +459,9 @@ Standard Cluster Descriptor:
 Compressed Clusters Descriptor (x = 62 - (cluster_bits - 8)):
 
     Bit  0 - x-1:   Host cluster offset. This is usually _not_ aligned to a
-                    cluster or sector boundary!
+                    cluster or sector boundary!  If cluster_bits is
+                    small enough that this field includes bits beyond
+                    55, those upper bits must be set to 0.
 
          x - 61:    Number of additional 512-byte sectors used for the
                     compressed data, beyond the sector containing the offset
diff --git a/hw/block/fdc.c b/hw/block/fdc.c
index 2e9c1e1e2f..6f19f127a5 100644
--- a/hw/block/fdc.c
+++ b/hw/block/fdc.c
@@ -1617,7 +1617,7 @@ static void fdctrl_stop_transfer(FDCtrl *fdctrl, uint8_t status0,
     fdctrl->fifo[5] = cur_drv->sect;
     fdctrl->fifo[6] = FD_SECTOR_SC;
     fdctrl->data_dir = FD_DIR_READ;
-    if (!(fdctrl->msr & FD_MSR_NONDMA)) {
+    if (fdctrl->dma_chann != -1 && !(fdctrl->msr & FD_MSR_NONDMA)) {
         IsaDmaClass *k = ISADMA_GET_CLASS(fdctrl->dma);
         k->release_DREQ(fdctrl->dma, fdctrl->dma_chann);
     }
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 09d7c90259..d0226e7fdc 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1175,6 +1175,10 @@ static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
     unsigned size)
 {
     NvmeCtrl *n = (NvmeCtrl *)opaque;
+
+    if (addr + size > NVME_CMBSZ_GETSIZE(n->bar.cmbsz)) {
+        return;
+    }
     memcpy(&n->cmbuf[addr], &data, size);
 }
 
@@ -1183,6 +1187,9 @@ static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
     uint64_t val;
     NvmeCtrl *n = (NvmeCtrl *)opaque;
 
+    if (addr + size > NVME_CMBSZ_GETSIZE(n->bar.cmbsz)) {
+        return 0;
+    }
     memcpy(&val, &n->cmbuf[addr], size);
     return val;
 }
diff --git a/tests/qemu-iotests/182 b/tests/qemu-iotests/182
index 4b31592fb8..3b7689c1d5 100755
--- a/tests/qemu-iotests/182
+++ b/tests/qemu-iotests/182
@@ -31,6 +31,7 @@ status=1	# failure is the default!
 _cleanup()
 {
     _cleanup_test_img
+    rm -f "$TEST_IMG.overlay"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -71,6 +72,76 @@ echo 'quit' | $QEMU -nographic -monitor stdio \
 
 _cleanup_qemu
 
+echo
+echo '=== Testing reopen ==='
+echo
+
+# This tests that reopening does not unshare any permissions it should
+# not unshare
+# (There was a bug where reopening shared exactly the opposite of the
+# permissions it was supposed to share)
+
+_launch_qemu
+
+_send_qemu_cmd $QEMU_HANDLE \
+    "{'execute': 'qmp_capabilities'}" \
+    'return'
+
+# Open the image without any format layer (we are not going to access
+# it, so that is fine)
+# This should keep all permissions shared.
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
+    "{'execute': 'blockdev-add',
+      'arguments': {
+          'node-name': 'node0',
+          'driver': 'file',
+          'filename': '$TEST_IMG',
+          'locking': 'on'
+          } }" \
+    'return' \
+    'error'
+
+# This snapshot will perform a reopen to drop R/W to RO.
+# It should still keep all permissions shared.
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
+    "{'execute': 'blockdev-snapshot-sync',
+      'arguments': {
+          'node-name': 'node0',
+          'snapshot-file': '$TEST_IMG.overlay',
+          'snapshot-node-name': 'node1'
+      } }" \
+    'return' \
+    'error'
+
+# Now open the same file again
+# This does not require any permissions (and does not unshare any), so
+# this will not conflict with node0.
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
+    "{'execute': 'blockdev-add',
+      'arguments': {
+          'node-name': 'node1',
+          'driver': 'file',
+          'filename': '$TEST_IMG',
+          'locking': 'on'
+          } }" \
+    'return' \
+    'error'
+
+# Now we attach the image to a virtio-blk device.  This device does
+# require some permissions (at least WRITE and READ_CONSISTENT), so if
+# reopening node0 unshared any (which it should not have), this will
+# fail (but it should not).
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
+    "{'execute': 'device_add',
+      'arguments': {
+          'driver': 'virtio-blk',
+          'drive': 'node1'
+      } }" \
+    'return' \
+    'error'
+
+_cleanup_qemu
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/182.out b/tests/qemu-iotests/182.out
index f1463c8862..af501ca3f3 100644
--- a/tests/qemu-iotests/182.out
+++ b/tests/qemu-iotests/182.out
@@ -5,4 +5,13 @@ Starting QEMU
 Starting a second QEMU using the same image should fail
 QEMU_PROG: -drive file=TEST_DIR/t.qcow2,if=none,id=drive0,file.locking=on: Failed to get "write" lock
 Is another process using the image [TEST_DIR/t.qcow2]?
+
+=== Testing reopen ===
+
+{"return": {}}
+{"return": {}}
+Formatting 'TEST_DIR/t.qcow2.overlay', fmt=qcow2 size=197120 backing_file=TEST_DIR/t.qcow2 backing_fmt=file cluster_size=65536 lazy_refcounts=off refcount_bits=16
+{"return": {}}
+{"return": {}}
+{"return": {}}
 *** done
diff --git a/tests/qemu-iotests/220 b/tests/qemu-iotests/220
new file mode 100755
index 0000000000..0c5682bda0
--- /dev/null
+++ b/tests/qemu-iotests/220
@@ -0,0 +1,96 @@
+#!/bin/bash
+#
+# max limits on compression in huge qcow2 files
+#
+# Copyright (C) 2018 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+seq=$(basename $0)
+echo "QA output created by $seq"
+
+status=1    # failure is the default!
+
+_cleanup()
+{
+    _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.pattern
+
+_supported_fmt qcow2
+_supported_proto file
+_supported_os Linux
+
+echo "== Creating huge file =="
+
+# Sanity check: We require a file system that permits the creation
+# of a HUGE (but very sparse) file.  tmpfs works, ext4 does not.
+if ! truncate --size=513T "$TEST_IMG"; then
+    _notrun "file system on $TEST_DIR does not support large enough files"
+fi
+rm "$TEST_IMG"
+IMGOPTS='cluster_size=2M,refcount_bits=1' _make_test_img 513T
+
+echo "== Populating refcounts =="
+# We want an image with 256M refcounts * 2M clusters = 512T referenced.
+# Each 2M cluster holds 16M refcounts; the refcount table initially uses
+# 1 refblock, so we need to add 15 more.  The refcount table lives at 2M,
+# first refblock at 4M, L2 at 6M, so our remaining additions start at 8M.
+# Then, for each refblock, mark it as fully populated.
+to_hex() {
+    printf %016x\\n $1 | sed 's/\(..\)/\\x\1/g'
+}
+truncate --size=38m "$TEST_IMG"
+entry=$((0x200000))
+$QEMU_IO_PROG -f raw -c "w -P 0xff 4m 2m" "$TEST_IMG" | _filter_qemu_io
+for i in {1..15}; do
+    offs=$((0x600000 + i*0x200000))
+    poke_file "$TEST_IMG" $((i*8 + entry)) $(to_hex $offs)
+    $QEMU_IO_PROG -f raw -c "w -P 0xff $offs 2m" "$TEST_IMG" | _filter_qemu_io
+done
+
+echo "== Checking file before =="
+# FIXME: 'qemu-img check' doesn't diagnose refcounts beyond the end of
+# the file as leaked clusters
+_check_test_img 2>&1 | sed '/^Leaked cluster/d'
+stat -c 'image size %s' "$TEST_IMG"
+
+echo "== Trying to write compressed cluster =="
+# Given our file size, the next available cluster at 512T lies beyond the
+# maximum offset that a compressed 2M cluster can reside in
+$QEMU_IO_PROG -c 'w -c 0 2m' "$TEST_IMG" | _filter_qemu_io
+# The attempt failed, but ended up allocating a new refblock
+stat -c 'image size %s' "$TEST_IMG"
+
+echo "== Writing normal cluster =="
+# The failed write should not corrupt the image, so a normal write succeeds
+$QEMU_IO_PROG -c 'w 0 2m' "$TEST_IMG" | _filter_qemu_io
+
+echo "== Checking file after =="
+# qemu-img now sees the millions of leaked clusters, thanks to the allocations
+# at 512T.  Undo many of our faked references to speed up the check.
+$QEMU_IO_PROG -f raw -c "w -z 5m 1m" -c "w -z 8m 30m" "$TEST_IMG" |
+    _filter_qemu_io
+_check_test_img 2>&1 | sed '/^Leaked cluster/d'
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/220.out b/tests/qemu-iotests/220.out
new file mode 100644
index 0000000000..af3021fd88
--- /dev/null
+++ b/tests/qemu-iotests/220.out
@@ -0,0 +1,54 @@
+QA output created by 220
+== Creating huge file ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=564049465049088
+== Populating refcounts ==
+wrote 2097152/2097152 bytes at offset 4194304
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 8388608
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 10485760
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 12582912
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 14680064
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 16777216
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 18874368
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 20971520
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 23068672
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 25165824
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 27262976
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 29360128
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 31457280
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 33554432
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 35651584
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2097152/2097152 bytes at offset 37748736
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+== Checking file before ==
+No errors were found on the image.
+image size 39845888
+== Trying to write compressed cluster ==
+write failed: Input/output error
+image size 562949957615616
+== Writing normal cluster ==
+wrote 2097152/2097152 bytes at offset 0
+2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+== Checking file after ==
+wrote 1048576/1048576 bytes at offset 5242880
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 31457280/31457280 bytes at offset 8388608
+30 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+8388589 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index ebe4fe78bc..4d194716f2 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -219,6 +219,7 @@
 217 rw auto quick
 218 rw auto quick
 219 rw auto
+220 rw auto
 221 rw auto quick
 222 rw auto quick
 223 rw auto quick